# scraper.rb
  1. require 'set'
  2. module Docs
  3. class Scraper < Doc
    # Class-level configuration DSL. Each scraper subclass carries its own
    # copy of these settings (see .inherited below).
    class << self
      # base_url       -- root of the site being scraped (String on the class).
      # root_path      -- optional path of the start page relative to base_url.
      # initial_paths  -- extra entry-point paths queued alongside the root.
      # options        -- Hash of options passed through to the filter pipeline.
      # html_filters / text_filters -- FilterStack instances (project type).
      attr_accessor :base_url, :root_path, :initial_paths, :options, :html_filters, :text_filters

      # Ruby inheritance hook: copies this scraper's configuration onto the
      # subclass so the subclass can mutate its settings without affecting the
      # parent, and autoloads the subclass's filter files.
      def inherited(subclass)
        super
        subclass.class_eval do
          extend AutoloadHelper
          # e.g. Docs::Foo autoloads everything under "docs/filters/foo".
          autoload_all "docs/filters/#{to_s.demodulize.underscore}", 'filter'
        end
        subclass.base_url = base_url
        subclass.root_path = root_path
        # dup / deep_dup / inheritable_copy ensure the subclass does not share
        # mutable state with this class (ActiveSupport semantics).
        subclass.initial_paths = initial_paths.dup
        subclass.options = options.deep_dup
        subclass.html_filters = html_filters.inheritable_copy
        subclass.text_filters = text_filters.inheritable_copy
      end

      # Combined filter list: HTML filters run first, then text filters.
      def filters
        html_filters.to_a + text_filters.to_a
      end
    end
  23. include Instrumentable
  24. self.initial_paths = []
  25. self.options = {}
  26. self.html_filters = FilterStack.new
  27. self.text_filters = FilterStack.new
  28. html_filters.push 'container', 'clean_html', 'normalize_urls', 'internal_urls', 'normalize_paths'
  29. text_filters.push 'inner_html', 'clean_text', 'attribution'
  30. def build_page(path)
  31. response = request_one url_for(path)
  32. result = handle_response(response)
  33. yield result if block_given?
  34. result
  35. end
  36. def build_pages
  37. history = Set.new initial_urls.map(&:downcase)
  38. instrument 'running.scraper', urls: initial_urls
  39. request_all initial_urls do |response|
  40. next unless data = handle_response(response)
  41. yield data
  42. next unless data[:internal_urls].present?
  43. next_urls = data[:internal_urls].select { |url| history.add?(url.downcase) }
  44. instrument 'queued.scraper', urls: next_urls
  45. next_urls
  46. end
  47. end
  48. def base_url
  49. @base_url ||= URL.parse self.class.base_url
  50. end
  51. def root_url
  52. @root_url ||= root_path? ? URL.parse(File.join(base_url.to_s, root_path)) : base_url.normalize
  53. end
  54. def root_path
  55. self.class.root_path
  56. end
  57. def root_path?
  58. root_path.present? && root_path != '/'
  59. end
  60. def initial_paths
  61. self.class.initial_paths
  62. end
  63. def initial_urls
  64. @initial_urls ||= [root_url.to_s].concat(initial_paths.map(&method(:url_for))).freeze
  65. end
  66. def pipeline
  67. @pipeline ||= ::HTML::Pipeline.new(self.class.filters).tap do |pipeline|
  68. pipeline.instrumentation_service = Docs
  69. end
  70. end
    # Instance-level options: a frozen, memoized deep copy of the class
    # options, augmented with URL/path context for the filter pipeline.
    def options
      @options ||= self.class.options.deep_dup.tap do |options|
        options.merge! base_url: base_url, root_url: root_url,
          root_path: root_path, initial_paths: initial_paths
        # With a dedicated root path, '' and '/' are not real pages -- skip them.
        if root_path?
          (options[:skip] ||= []).concat ['', '/']
        end
        # If a whitelist is in effect, make sure the entry-point paths
        # themselves remain allowed.
        if options[:only] || options[:only_patterns]
          (options[:only] ||= []).concat initial_paths + (root_path? ? [root_path] : ['', '/'])
        end
        # Private hook: mixins such as FixInternalUrlsBehavior contribute
        # extra options here.
        options.merge!(additional_options) if respond_to?(:additional_options, true)
        options.freeze
      end
    end
  85. private
  86. def request_one(url)
  87. raise NotImplementedError
  88. end
  89. def request_all(url, &block)
  90. raise NotImplementedError
  91. end
  92. def process_response?(response)
  93. raise NotImplementedError
  94. end
  95. def url_for(path)
  96. if path.empty? || path == '/'
  97. root_url.to_s
  98. else
  99. File.join(base_url.to_s, path)
  100. end
  101. end
  102. def handle_response(response)
  103. if process_response?(response)
  104. instrument 'process_response.scraper', response: response do
  105. process_response(response)
  106. end
  107. else
  108. instrument 'ignore_response.scraper', response: response
  109. end
  110. rescue => e
  111. puts "URL: #{response.url}"
  112. raise e
  113. end
  114. def process_response(response)
  115. data = {}
  116. pipeline.call(parse(response.body), pipeline_context(response), data)
  117. data
  118. end
  119. def pipeline_context(response)
  120. options.merge url: response.url
  121. end
  122. def parse(string)
  123. Parser.new(string).html
  124. end
    # Temporarily replaces the pipeline's filter list with +filters+ for the
    # duration of the block. Works by poking the memoized pipeline's @filters
    # ivar directly, then discarding the memoized pipeline in ensure so the
    # next access rebuilds it with the default filter chain.
    def with_filters(*filters)
      stack = FilterStack.new
      stack.push(*filters)
      pipeline.instance_variable_set :@filters, stack.to_a.freeze
      yield
    ensure
      @pipeline = nil
    end
    # Mixin for scrapers whose root page cannot be fetched directly: it
    # intercepts requests for the root URL and serves a locally generated
    # body via Typhoeus response stubbing. Includers must implement
    # #root_page_body.
    module StubRootPage
      private

      def request_one(url)
        # Install the stub just-in-time when the root page is requested.
        stub_root_page if url == root_url.to_s
        super
      end

      def request_all(urls, &block)
        stub_root_page
        super
      end

      # Registers a canned 200 text/html response for the root URL.
      def stub_root_page
        response = Typhoeus::Response.new(
          effective_url: root_url.to_s,
          code: 200,
          headers: { 'Content-Type' => 'text/html' },
          body: root_page_body)
        Typhoeus.stub(root_url.to_s).and_return(response)
      end
    end
    # Mixin for docs whose pages link inconsistently: it performs a first
    # crawl to collect the set of valid internal URLs, then re-crawls
    # restricted to exactly that set (see #additional_options below).
    module FixInternalUrlsBehavior
      def self.included(base)
        base.extend ClassMethods
      end

      module ClassMethods
        # Populated only for the duration of store_pages; nil otherwise.
        attr_reader :internal_urls

        def store_pages(store)
          instrument 'info.doc', msg: 'Building internal urls...'
          with_internal_urls do
            instrument 'info.doc', msg: 'Building pages...'
            super
          end
        end

        private

        # Runs the block with @internal_urls seeded by a preliminary crawl of
        # a fresh instance, always clearing it afterwards.
        def with_internal_urls
          @internal_urls = new.fetch_internal_urls
          yield
        ensure
          @internal_urls = nil
        end
      end

      # First pass: crawl normally and record the base-relative path of every
      # page that produced entries.
      def fetch_internal_urls
        result = []
        build_pages do |page|
          result << base_url.subpath_to(page[:response_url]) if page[:entries].present?
        end
        result
      end

      def initial_urls
        # During the second pass, seed the crawl with the collected URLs.
        return super unless self.class.internal_urls
        @initial_urls ||= self.class.internal_urls.map(&method(:url_for)).freeze
      end

      private

      # Hooked into #options: during the second pass, restrict the crawl to
      # exactly the collected URL set and disable all other URL filtering.
      def additional_options
        if self.class.internal_urls
          {
            only: self.class.internal_urls.to_set,
            only_patterns: nil,
            skip: nil,
            skip_patterns: nil,
            skip_links: nil,
            fixed_internal_urls: true
          }
        else
          {}
        end
      end

      # Tags each page with the URL it was actually served from so that
      # fetch_internal_urls can compute its base-relative path.
      def process_response(response)
        super.merge! response_url: response.url
      end
    end
  203. end
  204. end