# scraper.rb

require 'set'
  2. module Docs
  3. class Scraper < Doc
  4. class << self
  5. attr_accessor :base_url, :root_path, :initial_paths, :initial_urls, :options, :html_filters, :text_filters, :stubs
  6. def inherited(subclass)
  7. super
  8. subclass.class_eval do
  9. extend AutoloadHelper
  10. autoload_all "docs/filters/#{to_s.demodulize.underscore}", 'filter'
  11. end
  12. subclass.base_url = base_url
  13. subclass.root_path = root_path
  14. subclass.initial_paths = initial_paths.dup
  15. subclass.initial_urls = initial_urls.dup
  16. subclass.options = options.deep_dup
  17. subclass.html_filters = html_filters.inheritable_copy
  18. subclass.text_filters = text_filters.inheritable_copy
  19. subclass.stubs = stubs.dup
  20. end
  21. def filters
  22. html_filters.to_a + text_filters.to_a
  23. end
  24. def stub(path, &block)
  25. @stubs[path] = block
  26. @stubs
  27. end
  28. end
  29. include Instrumentable
  30. self.initial_paths = []
  31. self.initial_urls = []
  32. self.options = {}
  33. self.stubs = {}
  34. self.html_filters = FilterStack.new
  35. self.text_filters = FilterStack.new
  36. html_filters.push 'container', 'clean_html', 'normalize_urls', 'internal_urls', 'normalize_paths'
  37. text_filters.push 'inner_html', 'clean_text', 'attribution'
  38. def initialize
  39. super
  40. initialize_stubs
  41. end
  42. def initialize_stubs
  43. self.class.stubs.each do |path, block|
  44. Typhoeus.stub(url_for(path)).and_return do
  45. Typhoeus::Response.new \
  46. effective_url: url_for(path),
  47. code: 200,
  48. headers: { 'Content-Type' => 'text/html' },
  49. body: self.instance_exec(&block)
  50. end
  51. end
  52. end
  53. def build_page(path)
  54. response = request_one url_for(path)
  55. result = handle_response(response)
  56. yield result if block_given?
  57. result
  58. end
  59. def build_pages
  60. history = Set.new initial_urls.map(&:downcase)
  61. instrument 'running.scraper', urls: initial_urls
  62. request_all initial_urls do |response|
  63. next unless data = handle_response(response)
  64. yield data
  65. next unless data[:internal_urls].present?
  66. next_urls = data[:internal_urls].select { |url| history.add?(url.downcase) }
  67. instrument 'queued.scraper', urls: next_urls
  68. next_urls
  69. end
  70. end
  71. def base_url
  72. @base_url ||= URL.parse self.class.base_url
  73. end
  74. def root_url
  75. @root_url ||= root_path? ? URL.parse(File.join(base_url.to_s, root_path)) : base_url.normalize
  76. end
  77. def root_path
  78. self.class.root_path
  79. end
  80. def root_path?
  81. root_path.present? && root_path != '/'
  82. end
  83. def initial_paths
  84. self.class.initial_paths
  85. end
  86. def initial_urls
  87. @initial_urls ||= [root_url.to_s].concat(self.class.initial_urls).concat(initial_paths.map(&method(:url_for))).freeze
  88. end
  89. def pipeline
  90. @pipeline ||= ::HTML::Pipeline.new(self.class.filters).tap do |pipeline|
  91. pipeline.instrumentation_service = Docs
  92. end
  93. end
  94. def options
  95. @options ||= self.class.options.deep_dup.tap do |options|
  96. options.merge! base_url: base_url, root_url: root_url,
  97. root_path: root_path, initial_paths: initial_paths
  98. if root_path?
  99. (options[:skip] ||= []).concat ['', '/']
  100. end
  101. if options[:only] || options[:only_patterns]
  102. (options[:only] ||= []).concat initial_paths + (root_path? ? [root_path] : ['', '/'])
  103. end
  104. options.merge!(additional_options) if respond_to?(:additional_options, true)
  105. options.freeze
  106. end
  107. end
  108. private
  109. def request_one(url)
  110. raise NotImplementedError
  111. end
  112. def request_all(url, &block)
  113. raise NotImplementedError
  114. end
  115. def process_response?(response)
  116. raise NotImplementedError
  117. end
  118. def url_for(path)
  119. if path.empty? || path == '/'
  120. root_url.to_s
  121. else
  122. File.join(base_url.to_s, path)
  123. end
  124. end
  125. def handle_response(response)
  126. if process_response?(response)
  127. instrument 'process_response.scraper', response: response do
  128. process_response(response)
  129. end
  130. else
  131. instrument 'ignore_response.scraper', response: response
  132. end
  133. rescue => e
  134. puts "URL: #{response.url}"
  135. raise e
  136. end
  137. def process_response(response)
  138. data = {}
  139. pipeline.call(parse(response.body), pipeline_context(response), data)
  140. data
  141. end
  142. def pipeline_context(response)
  143. options.merge url: response.url
  144. end
  145. def parse(string)
  146. Parser.new(string).html
  147. end
  148. def with_filters(*filters)
  149. stack = FilterStack.new
  150. stack.push(*filters)
  151. pipeline.instance_variable_set :@filters, stack.to_a.freeze
  152. yield
  153. ensure
  154. @pipeline = nil
  155. end
  156. module FixInternalUrlsBehavior
  157. def self.included(base)
  158. base.extend ClassMethods
  159. end
  160. module ClassMethods
  161. attr_reader :internal_urls
  162. def store_pages(store)
  163. instrument 'info.doc', msg: 'Building internal urls...'
  164. with_internal_urls do
  165. instrument 'info.doc', msg: 'Building pages...'
  166. super
  167. end
  168. end
  169. private
  170. def with_internal_urls
  171. @internal_urls = new.fetch_internal_urls
  172. yield
  173. ensure
  174. @internal_urls = nil
  175. end
  176. end
  177. def fetch_internal_urls
  178. result = []
  179. build_pages do |page|
  180. result << base_url.subpath_to(page[:response_url]) if page[:entries].present?
  181. end
  182. result
  183. end
  184. def initial_urls
  185. return super unless self.class.internal_urls
  186. @initial_urls ||= self.class.internal_urls.map(&method(:url_for)).freeze
  187. end
  188. private
  189. def additional_options
  190. if self.class.internal_urls
  191. {
  192. only: self.class.internal_urls.to_set,
  193. only_patterns: nil,
  194. skip: nil,
  195. skip_patterns: nil,
  196. skip_links: nil,
  197. fixed_internal_urls: true
  198. }
  199. else
  200. {}
  201. end
  202. end
  203. def process_response(response)
  204. super.merge! response_url: response.url
  205. end
  206. end
  207. end
  208. end