scraper.rb

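# Base class for documentation scrapers. Subclasses configure base_url,
# root_path, initial_paths, filters, and stubbed responses; #build_pages then
# crawls from the initial URLs, feeding each response through an
# HTML::Pipeline of the configured filters. The transport layer
# (request_one, request_all, process_response?) is left to subclasses.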
require 'set'

module Docs
  class Scraper < Doc
    class << self
      attr_accessor :base_url, :root_path, :initial_paths, :options, :html_filters, :text_filters, :stubs

      def inherited(subclass)
        super

        subclass.class_eval do
          extend AutoloadHelper
          autoload_all "docs/filters/#{to_s.demodulize.underscore}", 'filter'
        end

        subclass.base_url = base_url
        subclass.root_path = root_path
        subclass.initial_paths = initial_paths.dup
        subclass.options = options.deep_dup
        subclass.html_filters = html_filters.inheritable_copy
        subclass.text_filters = text_filters.inheritable_copy
        subclass.stubs = stubs.dup
      end

      def filters
        html_filters.to_a + text_filters.to_a
      end

      def stub(path, &block)
        @stubs[path] = block
        @stubs
      end
    end

    include Instrumentable

    self.initial_paths = []
    self.options = {}
    self.stubs = {}
    self.html_filters = FilterStack.new
    self.text_filters = FilterStack.new

    html_filters.push 'apply_base_url', 'container', 'clean_html', 'normalize_urls', 'internal_urls', 'normalize_paths'
    text_filters.push 'inner_html', 'clean_text', 'attribution'

    def initialize
      super
      initialize_stubs
    end
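
    # Register Typhoeus stubs for paths declared with the class-level `stub`
    # DSL, so requests for those paths return the block's HTML instead of
    # hitting the network.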
    def initialize_stubs
      self.class.stubs.each do |path, block|
        Typhoeus.stub(url_for(path)).and_return do
          Typhoeus::Response.new \
            effective_url: url_for(path),
            code: 200,
            headers: { 'Content-Type' => 'text/html' },
            body: self.instance_exec(&block)
        end
      end
    end

    def build_page(path)
      response = request_one url_for(path)
      result = handle_response(response)
      yield result if block_given?
      result
    end
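
    # Crawl starting from the initial URLs: each processed page may report
    # internal URLs, which are queued once each (case-insensitively) while
    # the built pages are yielded to the caller.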
    def build_pages
      history = Set.new initial_urls.map(&:downcase)
      instrument 'running.scraper', urls: initial_urls

      request_all initial_urls do |response|
        next unless data = handle_response(response)
        yield data
        next unless data[:internal_urls].present?
        next_urls = data[:internal_urls].select { |url| history.add?(url.downcase) }
        instrument 'queued.scraper', urls: next_urls
        next_urls
      end
    end

    def base_url
      @base_url ||= URL.parse self.class.base_url
    end

    def root_url
      @root_url ||= root_path? ? URL.parse(File.join(base_url.to_s, root_path)) : base_url.normalize
    end

    def root_path
      self.class.root_path
    end

    def root_path?
      root_path.present? && root_path != '/'
    end

    def initial_paths
      self.class.initial_paths
    end

    def initial_urls
      @initial_urls ||= [root_url.to_s].concat(initial_paths.map(&method(:url_for))).freeze
    end

    def pipeline
      @pipeline ||= ::HTML::Pipeline.new(self.class.filters).tap do |pipeline|
        pipeline.instrumentation_service = Docs
      end
    end
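
    # Options passed to the filter pipeline: the class-level options merged
    # with the resolved URLs and paths, then frozen. Mixins and subclasses
    # can contribute extra values through a private #additional_options.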
    def options
      @options ||= self.class.options.deep_dup.tap do |options|
        options.merge! base_url: base_url, root_url: root_url,
                       root_path: root_path, initial_paths: initial_paths,
                       version: self.class.version

        if root_path?
          (options[:skip] ||= []).concat ['', '/']
        end

        if options[:only] || options[:only_patterns]
          (options[:only] ||= []).concat initial_paths + (root_path? ? [root_path] : ['', '/'])
        end

        options.merge!(additional_options) if respond_to?(:additional_options, true)

        options.freeze
      end
    end

    private

    def request_one(url)
      raise NotImplementedError
    end

    def request_all(url, &block)
      raise NotImplementedError
    end

    def process_response?(response)
      raise NotImplementedError
    end

    def url_for(path)
      if path.empty? || path == '/'
        root_url.to_s
      else
        File.join(base_url.to_s, path)
      end
    end
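
    # Run a response through the pipeline when process_response? accepts it;
    # otherwise instrument it as ignored. Errors are swallowed (and
    # instrumented) when Docs.rescue_errors is set.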
    def handle_response(response)
      if process_response?(response)
        instrument 'process_response.scraper', response: response do
          process_response(response)
        end
      else
        instrument 'ignore_response.scraper', response: response
      end
    rescue => e
      if Docs.rescue_errors
        instrument 'error.doc', exception: e, url: response.url
        nil
      else
        raise e
      end
    end

    def process_response(response)
      data = {}
      html, title = parse(response.body)
      context = pipeline_context(response)
      context[:html_title] = title
      pipeline.call(html, context, data)
      data
    end

    def pipeline_context(response)
      options.merge url: response.url
    end

    def parse(string)
      parser = Parser.new(string)
      [parser.html, parser.title]
    end
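
    # Temporarily swap the pipeline's filters for the duration of the block;
    # the memoized pipeline is discarded afterwards so the next call rebuilds
    # it with the class-level filters.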
    def with_filters(*filters)
      stack = FilterStack.new
      stack.push(*filters)
      pipeline.instance_variable_set :@filters, stack.to_a.freeze
      yield
    ensure
      @pipeline = nil
    end
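
    # Two-pass behavior: a first crawl collects the internal URLs of every
    # page that produced entries, then the real build is restricted to
    # exactly that set of URLs.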
    module FixInternalUrlsBehavior
      def self.included(base)
        base.extend ClassMethods
      end

      module ClassMethods
        attr_reader :internal_urls

        def store_pages(store)
          instrument 'info.doc', msg: 'Building internal urls...'
          with_internal_urls do
            instrument 'info.doc', msg: 'Building pages...'
            super
          end
        end

        private

        def with_internal_urls
          @internal_urls = new.fetch_internal_urls
          yield
        ensure
          @internal_urls = nil
        end
      end

      def fetch_internal_urls
        result = []
        build_pages do |page|
          result << base_url.subpath_to(page[:response_url]) if page[:entries].present?
        end
        result
      end

      def initial_urls
        return super unless self.class.internal_urls
        @initial_urls ||= self.class.internal_urls.map(&method(:url_for)).freeze
      end

      private

      def additional_options
        if self.class.internal_urls
          {
            only: self.class.internal_urls.to_set,
            only_patterns: nil,
            skip: nil,
            skip_patterns: nil,
            skip_links: nil,
            fixed_internal_urls: true
          }
        else
          {}
        end
      end

      def process_response(response)
        super.merge! response_url: response.url
      end
    end
  end
end
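
# A minimal sketch of how a scraper is configured (hypothetical subclass,
# URLs, and filter names; concrete scrapers also need request_one,
# request_all, and process_response?, which this base class leaves abstract):
#
#   module Docs
#     class Example < Scraper
#       self.base_url = 'https://example.com/docs/'
#       self.root_path = 'index.html'
#       self.initial_paths = %w(guide.html)
#
#       options[:skip] = %w(changelog.html)
#
#       # Loaded from docs/filters/example/ by the inherited hook above.
#       html_filters.push 'example/entries', 'example/clean_html'
#
#       # Serve a canned body for the root path instead of requesting it.
#       stub '' do
#         '<html><body><h1>Example</h1></body></html>'
#       end
#     end
#   end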