scraper.rb

require 'set'
require 'html/pipeline'

module Docs
  class Scraper < Doc
    class << self
      attr_accessor :base_url, :root_path, :html_filters, :text_filters, :options

      # Copies the parent's configuration into each subclass and autoloads the
      # subclass's own filter classes from docs/filters/<doc_name>/.
      def inherited(subclass)
        super
        subclass.class_eval do
          extend AutoloadHelper
          autoload_all "docs/filters/#{to_s.demodulize.underscore}", 'filter'
        end
        subclass.root_path = root_path
        subclass.options = options.deep_dup
        subclass.html_filters = html_filters.inheritable_copy
        subclass.text_filters = text_filters.inheritable_copy
      end

      def filters
        html_filters.to_a + text_filters.to_a
      end
    end

    include Instrumentable

    # Default filter stacks shared by all scrapers; subclasses receive
    # inheritable copies via .inherited and can push their own filters on top.
    self.html_filters = FilterStack.new
    self.text_filters = FilterStack.new
    self.options = {}

    html_filters.push 'container', 'clean_html', 'normalize_urls', 'internal_urls', 'normalize_paths'
    text_filters.push 'inner_html', 'clean_text', 'attribution'

    def base_url
      @base_url ||= URL.parse self.class.base_url
    end

    def root_url
      @root_url ||= root_path? ? URL.parse(File.join(base_url.to_s, root_path)) : base_url.normalize
    end

    def root_path
      self.class.root_path
    end

    def root_path?
      root_path.present? && root_path != '/'
    end

    # Scrapes a single page and returns the data hash produced by the pipeline.
    def build_page(path)
      response = request_one url_for(path)
      result = handle_response(response)
      yield result if block_given?
      result
    end

    # Crawls the documentation starting from the root URL. Each processed
    # response yields its data hash to the caller; internal URLs that haven't
    # been requested yet (tracked case-insensitively via Set#add?) are queued
    # for the next round of requests.
    def build_pages
      requested_urls = Set.new [root_url.to_s.downcase]
      instrument 'running.scraper', urls: requested_urls.to_a

      request_all root_url.to_s do |response|
        next unless data = handle_response(response)
        yield data
        next unless data[:internal_urls].present?
        next_urls = data[:internal_urls].select { |url| requested_urls.add?(url.downcase) }
        instrument 'queued.scraper', urls: next_urls
        next_urls
      end
    end

    # Instance-level options: a copy of the class-level options plus the URL
    # context, with root/skip/only adjustments applied, frozen after building.
    def options
      @options ||= self.class.options.deep_dup.tap do |options|
        options.merge! base_url: base_url, root_path: root_path, root_url: root_url

        (options[:skip] ||= []).concat ['', '/'] if root_path?

        if options[:only] || options[:only_patterns]
          (options[:only] ||= []).concat root_path? ? [root_path] : ['', '/']
        end

        options.freeze
      end
    end

    # HTML::Pipeline assembled from the class's HTML and text filters,
    # instrumented through the Docs module.
    def pipeline
      @pipeline ||= ::HTML::Pipeline.new(self.class.filters).tap do |pipeline|
        pipeline.instrumentation_service = Docs
      end
    end

    private

    # Transport-specific hooks; concrete scrapers must implement these.
    def request_one(url)
      raise NotImplementedError
    end

    def request_all(url, &block)
      raise NotImplementedError
    end

    def process_response?(response)
      raise NotImplementedError
    end

    def url_for(path)
      if path.empty? || path == '/'
        root_url.to_s
      else
        File.join(base_url.to_s, path)
      end
    end

    # Runs the pipeline on responses accepted by process_response?;
    # everything else is merely instrumented and skipped.
    def handle_response(response)
      if process_response?(response)
        instrument 'process_response.scraper', response: response do
          process_response(response)
        end
      else
        instrument 'ignore_response.scraper', response: response
      end
    end

    # The pipeline filters write their output into the data hash passed as
    # HTML::Pipeline's result argument, which is then returned to the caller.
    def process_response(response)
      pipeline.call parse(response.body), pipeline_context(response), data = {}
      data
    end

    def pipeline_context(response)
      options.merge url: response.url
    end

    def parse(string)
      Parser.new(string).html
    end
  end
end
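
# ----------------------------------------------------------------------------
# Usage sketch (not part of scraper.rb): a minimal, hypothetical subclass
# showing how the class-level configuration above is typically filled in.
# "ExampleDoc", the base URL, the filter names and the skipped path are
# illustrative only, and the transport hooks are left as stubs; a real
# scraper would implement them on top of an HTTP client or the filesystem.
module Docs
  class ExampleDoc < Scraper
    self.base_url  = 'https://example.invalid/docs/' # hypothetical base URL
    self.root_path = 'index.html'

    # Doc-specific filters run on top of the defaults pushed by Scraper.
    html_filters.push 'example_doc/clean_html', 'example_doc/entries' # hypothetical filter names

    options[:skip] = %w(changelog.html) # hypothetical paths to ignore

    private

    # Stubbed transport layer; a concrete implementation would return
    # response objects exposing #body and #url, which process_response and
    # pipeline_context above rely on.
    def request_one(url)
      raise NotImplementedError, 'fetch a single URL here'
    end

    def request_all(url, &block)
      raise NotImplementedError, 'crawl URLs and yield each response to the block'
    end

    def process_response?(response)
      raise NotImplementedError, 'return true for responses worth scraping'
    end
  end
end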