# url_scraper.rb (4.3 KB)

  module Docs
    # Scraper that obtains its pages by requesting URLs through +Request+ and
    # +Requester+.
    #
    # Request settings live on the class (+params+, +headers+, +force_gzip+)
    # and are copied to each subclass when it is created.
    class UrlScraper < Scraper
      class << self
        # Class-level request settings consumed by #request_options.
        attr_accessor :params, :headers, :force_gzip

        # Hands the new subclass deep copies of +params+ and +headers+ and the
        # current +force_gzip+ value, so later changes on the subclass do not
        # write into the parent's hashes.
        def inherited(subclass)
          super
          subclass.params = params.deep_dup
          subclass.headers = headers.deep_dup
          subclass.force_gzip = force_gzip
        end
      end

      # Defaults for UrlScraper itself; subclasses start from copies of these.
      self.params = {}
      self.headers = { 'User-Agent' => 'DevDocs' }
      self.force_gzip = false

      private

      # Requests a single +url+ with the options from #request_options and
      # returns whatever +Request.run+ returns.
      def request_one(url)
        Request.run(url, request_options)
      end

      # Requests every entry of +urls+ through +Requester.run+, forwarding the
      # caller's block and the options from #request_options.
      def request_all(urls, &block)
        Requester.run(urls, request_options: request_options, &block)
      end

      # Builds the options hash passed to the request helpers.
      #
      # Always contains +:params+ and +:headers+ (the class-level objects, not
      # copies); +:accept_encoding+ is set to 'gzip' only when the class has
      # +force_gzip+ enabled.
      def request_options
        settings = self.class
        { params: settings.params, headers: settings.headers }.tap do |opts|
          opts[:accept_encoding] = 'gzip' if settings.force_gzip
        end
      end

      # Decides whether +response+ should be processed.
      #
      # Raises a RuntimeError when the response reports an error or is blank.
      # Otherwise returns a truthy value only for a successful HTML response
      # whose effective URL passes #process_url?.
      def process_response?(response)
        if response.error?
          # slice(2..-3) drops the first and last two characters of the
          # pretty-printed headers - presumably the enclosing braces and
          # their newlines.
          failure = <<~ERROR
            Error status code (#{response.code}): #{response.return_message}
            #{response.url}
            #{JSON.pretty_generate(response.headers).slice(2..-3)}
          ERROR
          raise failure
        end

        raise "Empty response body: #{response.url}" if response.blank?

        response.success? && response.html? && process_url?(response.effective_url)
      end

      # True when +url+ is contained in the scraper's base URL.
      def process_url?(url)
        base_url.contains?(url)
      end

      # Loads Capybara and Selenium on demand and registers a :chrome driver
      # started with the "headless" and "disable-gpu" arguments. That driver
      # becomes both the current and the JavaScript driver, and Capybara's
      # own server is turned off.
      #
      # Returns the Capybara module.
      def load_capybara_selenium
        require 'capybara/dsl'
        require 'selenium/webdriver'

        Capybara.register_driver(:chrome) do |app|
          chrome_options = Selenium::WebDriver::Chrome::Options.new(args: %w[headless disable-gpu])
          Capybara::Selenium::Driver.new(app, browser: :chrome, options: chrome_options)
        end

        Capybara.javascript_driver = :chrome
        Capybara.current_driver = :chrome
        Capybara.run_server = false
        Capybara
      end

      # Mixin for scrapers that crawl more than one base URL.
      #
      # Including it adds a class-level +base_urls+ accessor. The first entry
      # also becomes the regular +base_url+; the remaining entries are
      # appended to the initial URLs.
      module MultipleBaseUrls
        def self.included(base)
          base.extend(ClassMethods)
        end

        module ClassMethods
          attr_reader :base_urls

          # Stores +urls+ and uses its first entry as +base_url+.
          def base_urls=(urls)
            self.base_url = urls.first
            @base_urls = urls
          end
        end

        # The inherited initial URLs followed by copies of every base URL
        # after the first one.
        def initial_urls
          inherited_urls = super
          extra_urls = self.class.base_urls[1..-1].deep_dup
          inherited_urls + extra_urls
        end

        # The class-level base URLs parsed with +URL.parse+, memoized per
        # instance.
        def base_urls
          @base_urls ||= self.class.base_urls.map { |raw_url| URL.parse(raw_url) }
        end

        private

        # True when any of the base URLs contains +url+.
        def process_url?(url)
          base_urls.any? { |candidate| candidate.contains?(url) }
        end

        # Runs the inherited processing with +base_url+ temporarily pointing
        # at whichever base URL contains the response's effective URL.
        #
        # +base_url+ is mutated in place (scheme, host, path), and the
        # +ensure+ clause puts the saved values back even if processing
        # raises.
        def process_response(response)
          saved_scheme = base_url.scheme
          saved_host = base_url.host
          saved_path = base_url.path

          matching_base = base_urls.find { |candidate| candidate.contains?(response.effective_url) }
          base_url.scheme = matching_base.scheme
          base_url.host = matching_base.host
          base_url.path = matching_base.path

          super
        ensure
          base_url.scheme = saved_scheme
          base_url.host = saved_host
          base_url.path = saved_path
        end
      end

      # Mixin that collects redirections before pages are stored and exposes
      # them to the scraper through +additional_options[:redirections]+.
      #
      # Works both with +include+ (ClassMethods is extended onto the class)
      # and with +prepend+ (ClassMethods is prepended to its singleton class).
      module FixRedirectionsBehavior
        def self.included(base)
          base.extend(ClassMethods)
        end

        def self.prepended(base)
          base.singleton_class.prepend(ClassMethods)
        end

        module ClassMethods
          # Redirection map gathered by the current #store_pages run; nil
          # outside of one.
          attr_reader :redirections

          # Wraps the inherited +store_pages+ so that redirections are
          # fetched first and remain available while it runs.
          def store_pages(store)
            instrument('info.doc', msg: 'Fetching redirections...')
            with_redirections do
              instrument('info.doc', msg: 'Continuing...')
              super
            end
          end

          private

          # Fills @redirections from a fresh instance, yields, and always
          # resets @redirections to nil afterwards.
          def with_redirections
            @redirections = new.fetch_redirections
            yield
          ensure
            @redirections = nil
          end
        end

        # Builds pages using only the listed filters and returns a hash that
        # maps the lowercased requested path to the effective path, for every
        # page where the two differ.
        def fetch_redirections
          {}.tap do |redirects|
            with_filters('apply_base_url', 'container', 'normalize_urls', 'internal_urls') do
              build_pages do |page|
                requested_path = page[:response_path]
                effective_path = page[:response_effective_path]
                next if effective_path == requested_path

                redirects[requested_path.downcase] = effective_path
              end
            end
          end
        end

        private

        # Adds the response's requested and effective paths to the inherited
        # result (merged in place).
        def process_response(response)
          super.merge!(response_effective_path: response.effective_path, response_path: response.path)
        end

        # Adds the class-level redirection map to the inherited options
        # (merged in place).
        def additional_options
          super.merge!(redirections: self.class.redirections)
        end
      end
    end
  end