| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273 |
- require 'set'
module Docs
  # Abstract base class for documentation scrapers.
  #
  # Subclasses declare class-level configuration (+base_url+, +root_path+,
  # +initial_paths+, +options+) and filter stacks; concrete transport
  # strategies are provided by overriding the private +request_one+ /
  # +request_all+ / +process_response?+ hooks. Configuration is copied down
  # the inheritance chain in .inherited so each subclass mutates its own copy.
  class Scraper < Doc
    class << self
      attr_accessor :base_url, :root_path, :initial_paths, :options, :html_filters, :text_filters, :stubs

      # Copies class-level configuration to the subclass and autoloads the
      # subclass's filters from docs/filters/<underscored_name>/.
      def inherited(subclass)
        super
        subclass.class_eval do
          extend AutoloadHelper
          autoload_all "docs/filters/#{to_s.demodulize.underscore}", 'filter'
        end
        subclass.base_url = base_url
        subclass.root_path = root_path
        subclass.initial_paths = initial_paths.dup
        subclass.options = options.deep_dup
        subclass.html_filters = html_filters.inheritable_copy
        subclass.text_filters = text_filters.inheritable_copy
        subclass.stubs = stubs.dup
      end

      # All pipeline filters: HTML filters first, then text filters.
      def filters
        html_filters.to_a + text_filters.to_a
      end

      # Registers a stubbed response body for +path+. The block is stored
      # as a Proc and evaluated lazily, per instance, in #initialize_stubs.
      def stub(path, &block)
        @stubs[path] = block
        @stubs
      end
    end

    include Instrumentable

    self.initial_paths = []
    self.options = {}
    self.stubs = {}

    self.html_filters = FilterStack.new
    self.text_filters = FilterStack.new

    html_filters.push 'apply_base_url', 'container', 'clean_html', 'normalize_urls', 'internal_urls', 'normalize_paths', 'parse_cf_email'
    text_filters.push 'images' # ensure the images filter runs after all html filters
    text_filters.push 'inner_html', 'clean_text', 'attribution'

    def initialize
      super
      initialize_stubs
    end

    # Installs a Typhoeus stub for every path registered via .stub, returning
    # a fake 200 text/html response whose body is the stored block evaluated
    # in this instance's context.
    def initialize_stubs
      # Hash#each yields [path, block] pairs: the stored Proc is the hash
      # VALUE, not a block argument — `|path, &block|` would leave `block`
      # nil and bind the whole pair to `path`.
      self.class.stubs.each do |path, block|
        Typhoeus.stub(url_for(path)).and_return do
          Typhoeus::Response.new \
            effective_url: url_for(path),
            code: 200,
            headers: { 'Content-Type' => 'text/html' },
            body: self.instance_exec(&block)
        end
      end
    end

    # Fetches and processes a single page. Yields the processed data when a
    # block is given; returns it either way (nil when the response is
    # ignored or errors are rescued).
    def build_page(path)
      response = request_one url_for(path)
      result = handle_response(response)
      yield result if block_given?
      result
    end

    # Crawls from the initial URLs, yielding processed page data and feeding
    # newly discovered internal URLs back into the request queue. URLs are
    # deduplicated case-insensitively via the +history+ set.
    def build_pages
      history = Set.new initial_urls.map(&:downcase)
      instrument 'running.scraper', urls: initial_urls

      request_all initial_urls do |response|
        next unless data = handle_response(response)
        yield data
        next unless data[:internal_urls].present?
        # Set#add? returns nil for already-seen URLs, filtering them out.
        next_urls = data[:internal_urls].select { |url| history.add?(url.downcase) }
        instrument 'queued.scraper', urls: next_urls
        next_urls
      end
    end

    def base_url
      @base_url ||= URL.parse self.class.base_url
    end

    def root_url
      @root_url ||= root_path? ? URL.parse(File.join(base_url.to_s, root_path)) : base_url.normalize
    end

    def root_path
      self.class.root_path
    end

    # True when a meaningful root path is configured ('/' counts as none).
    def root_path?
      root_path.present? && root_path != '/'
    end

    def initial_paths
      self.class.initial_paths
    end

    # The root URL followed by all configured initial paths, as strings.
    def initial_urls
      @initial_urls ||= [root_url.to_s].concat(initial_paths.map(&method(:url_for))).freeze
    end

    # Memoized HTML::Pipeline over the class's filters, instrumented
    # through the Docs service.
    def pipeline
      @pipeline ||= ::HTML::Pipeline.new(self.class.filters).tap do |pipeline|
        pipeline.instrumentation_service = Docs
      end
    end

    # Frozen, memoized option hash passed to the filter pipeline: the class
    # options plus computed URL/path/version data and skip/only adjustments.
    def options
      @options ||= self.class.options.deep_dup.tap do |options|
        options.merge! base_url: base_url, root_url: root_url,
                       root_path: root_path, initial_paths: initial_paths,
                       version: self.class.version, release: self.class.release

        if root_path?
          # The bare base URL ('' and '/') is outside the docs when a root
          # path is set — skip it.
          (options[:skip] ||= []).concat ['', '/']
        end

        if options[:only] || options[:only_patterns]
          # Whitelisting is in effect: make sure the entry points themselves
          # remain allowed.
          (options[:only] ||= []).concat initial_paths + (root_path? ? [root_path] : ['', '/'])
        end

        options.merge!(additional_options)
        options.freeze
      end
    end

    private

    # Transport hooks — implemented by concrete subclasses (e.g. file- or
    # HTTP-based scrapers).

    # Fetches a single URL and returns a response object.
    def request_one(url)
      raise NotImplementedError
    end

    # Fetches +urls+, yielding each response; the block's return value is a
    # list of further URLs to enqueue.
    def request_all(urls, &block)
      raise NotImplementedError
    end

    # Whether +response+ should be run through the pipeline.
    def process_response?(response)
      raise NotImplementedError
    end

    # Absolute URL for +path+; '' and '/' map to the root URL.
    def url_for(path)
      if path.empty? || path == '/'
        root_url.to_s
      else
        File.join(base_url.to_s, path.to_s)
      end
    end

    # Processes or ignores +response+ under instrumentation. When
    # Docs.rescue_errors is set, exceptions are reported and swallowed
    # (returning nil) so a single bad page doesn't abort the crawl.
    def handle_response(response)
      if process_response?(response)
        instrument 'process_response.scraper', response: response do
          process_response(response)
        end
      else
        instrument 'ignore_response.scraper', response: response
      end
    rescue => e
      if Docs.rescue_errors
        instrument 'error.doc', exception: e, url: response.url
        nil
      else
        raise e
      end
    end

    # Parses the response and runs it through the pipeline, returning the
    # result hash populated by the filters.
    def process_response(response)
      data = {}
      html, title = parse(response)
      context = pipeline_context(response)
      context[:html_title] = title
      pipeline.call(html, context, data)
      data
    end

    def pipeline_context(response)
      options.merge url: response.url
    end

    def parse(response)
      parser = Parser.new(response.body)
      [parser.html, parser.title]
    end

    # Temporarily replaces the pipeline's filters for the duration of the
    # block; the memoized pipeline is discarded afterwards so the regular
    # filters are rebuilt on next use.
    def with_filters(*filters)
      stack = FilterStack.new
      stack.push(*filters)
      pipeline.instance_variable_set :@filters, stack.to_a.freeze
      yield
    ensure
      @pipeline = nil
    end

    # Hook for subclasses/modules to inject extra pipeline options.
    def additional_options
      {}
    end

    # Two-pass crawling: a first pass collects the set of internal URLs,
    # then the real build runs restricted to exactly that set.
    module FixInternalUrlsBehavior
      def self.included(base)
        base.extend ClassMethods
      end

      def self.prepended(base)
        class << base
          prepend ClassMethods
        end
      end

      module ClassMethods
        attr_reader :internal_urls

        def store_pages(store)
          instrument 'info.doc', msg: 'Building internal urls...'
          with_internal_urls do
            instrument 'info.doc', msg: 'Continuing...'
            super
          end
        end

        private

        # Populates @internal_urls for the duration of the block only.
        def with_internal_urls
          @internal_urls = new.fetch_internal_urls
          yield
        ensure
          @internal_urls = nil
        end
      end

      # First pass: crawl and collect the subpaths of pages with entries.
      def fetch_internal_urls
        result = []
        build_pages do |page|
          result << page[:subpath] if page[:entries].present?
        end
        result
      end

      def initial_urls
        return super unless self.class.internal_urls
        @initial_urls ||= self.class.internal_urls.map(&method(:url_for)).freeze
      end

      private

      # Second pass: restrict the crawl to exactly the collected URLs,
      # disabling all other skip/only filtering.
      def additional_options
        if self.class.internal_urls
          super.merge! \
            only: self.class.internal_urls.to_set,
            only_patterns: nil,
            skip: nil,
            skip_patterns: nil,
            skip_links: nil,
            fixed_internal_urls: true
        else
          super
        end
      end

      def process_response(response)
        super.merge! response_url: response.url
      end
    end
  end
end
|