scraper.rb

require 'set'

module Docs
  class Scraper < Doc
    class << self
      attr_accessor :base_url, :root_path, :initial_paths, :options, :html_filters, :text_filters
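
      # Copy the scraper's configuration down to subclasses and autoload the
      # subclass's filter definitions from docs/filters/<name>/.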
      def inherited(subclass)
        super
        subclass.class_eval do
          extend AutoloadHelper
          autoload_all "docs/filters/#{to_s.demodulize.underscore}", 'filter'
        end
        subclass.root_path = root_path
        subclass.initial_paths = initial_paths.dup
        subclass.options = options.deep_dup
        subclass.html_filters = html_filters.inheritable_copy
        subclass.text_filters = text_filters.inheritable_copy
      end

      def filters
        html_filters.to_a + text_filters.to_a
      end
    end

    include Instrumentable
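
    # Default configuration; subclasses receive independent copies of these
    # via .inherited above and typically push their own filters on top.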
    self.initial_paths = []
    self.options = {}
    self.html_filters = FilterStack.new
    self.text_filters = FilterStack.new

    html_filters.push 'container', 'clean_html', 'normalize_urls', 'internal_urls', 'normalize_paths'
    text_filters.push 'inner_html', 'clean_text', 'attribution'
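
    # Fetch and process a single page, yielding the result when a block is given.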
    def build_page(path)
      response = request_one url_for(path)
      result = handle_response(response)
      yield result if block_given?
      result
    end
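
    # Crawl from the initial URLs: each processed page is yielded, and any
    # internal URLs it links to are queued, deduplicated case-insensitively
    # via `history`.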
    def build_pages
      history = Set.new initial_urls.map(&:downcase)
      instrument 'running.scraper', urls: initial_urls

      request_all initial_urls do |response|
        next unless data = handle_response(response)
        yield data
        next unless data[:internal_urls].present?
        next_urls = data[:internal_urls].select { |url| history.add?(url.downcase) }
        instrument 'queued.scraper', urls: next_urls
        next_urls
      end
    end
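
    # URL helpers derived from the class-level base_url / root_path settings.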
    def base_url
      @base_url ||= URL.parse self.class.base_url
    end

    def root_url
      @root_url ||= root_path? ? URL.parse(File.join(base_url.to_s, root_path)) : base_url.normalize
    end

    def root_path
      self.class.root_path
    end

    def root_path?
      root_path.present? && root_path != '/'
    end

    def initial_paths
      self.class.initial_paths
    end

    def initial_urls
      @initial_urls ||= [root_url.to_s].concat(initial_paths.map(&method(:url_for))).freeze
    end
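
    # The HTML::Pipeline built from the class-level filter stacks.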
    def pipeline
      @pipeline ||= ::HTML::Pipeline.new(self.class.filters).tap do |pipeline|
        pipeline.instrumentation_service = Docs
      end
    end
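
    # Per-instance options: a deep copy of the class-level options merged with
    # URL/path context, then frozen. `additional_options` is an optional hook
    # a scraper may define (private methods count, hence the `true` argument).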
    def options
      @options ||= self.class.options.deep_dup.tap do |options|
        options.merge! base_url: base_url, root_url: root_url,
                       root_path: root_path, initial_paths: initial_paths

        if root_path?
          (options[:skip] ||= []).concat ['', '/']
        end

        if options[:only] || options[:only_patterns]
          (options[:only] ||= []).concat initial_paths + (root_path? ? [root_path] : ['', '/'])
        end

        options.merge!(additional_options) if respond_to?(:additional_options, true)
        options.freeze
      end
    end
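
    # Transport and response filtering are left to concrete subclasses.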
    private

    def request_one(url)
      raise NotImplementedError
    end

    def request_all(urls, &block)
      raise NotImplementedError
    end

    def process_response?(response)
      raise NotImplementedError
    end

    def url_for(path)
      if path.empty? || path == '/'
        root_url.to_s
      else
        File.join(base_url.to_s, path)
      end
    end
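
    # Process or ignore a response; on failure, print the offending URL before
    # re-raising so crawl errors are easy to locate.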
    def handle_response(response)
      if process_response?(response)
        instrument 'process_response.scraper', response: response do
          process_response(response)
        end
      else
        instrument 'ignore_response.scraper', response: response
      end
    rescue => e
      puts "URL: #{response.url}"
      raise e
    end

    def process_response(response)
      data = {}
      pipeline.call(parse(response.body), pipeline_context(response), data)
      data
    end

    def pipeline_context(response)
      options.merge url: response.url
    end

    def parse(string)
      Parser.new(string).html
    end
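
    # Mixin for scrapers whose root page isn't fetched over HTTP: requests for
    # the root URL are answered with a stubbed Typhoeus response whose body
    # comes from `root_page_body` (expected from the including class).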
    module StubRootPage
      private

      def request_one(url)
        stub_root_page if url == root_url.to_s
        super
      end

      def request_all(urls, &block)
        stub_root_page
        super
      end

      def stub_root_page
        response = Typhoeus::Response.new(
          effective_url: root_url.to_s,
          code: 200,
          headers: { 'Content-Type' => 'text/html' },
          body: root_page_body)

        Typhoeus.stub(root_url.to_s).and_return(response)
      end
    end
  end
end
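
# A minimal usage sketch, not part of this file. It assumes a concrete
# subclass (such as Docs::UrlScraper, not shown here) implements
# request_one/request_all; the name MyDoc, the URLs, and the filter names
# are all illustrative.
#
#   module Docs
#     class MyDoc < UrlScraper
#       self.base_url = 'https://example.com/docs/'
#       self.root_path = 'index.html'
#       self.initial_paths = %w(guide.html)
#
#       html_filters.push 'my_doc/entries', 'title'
#       options[:skip] = %w(changelog.html)
#     end
#   end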