# url_scraper.rb
  module Docs
    # Scraper that fetches its pages by issuing URL requests through
    # Request (single URL) and Requester (many URLs).
    #
    # The class-level settings +params+, +headers+ and +force_gzip+ are copied
    # to every subclass at definition time, so a subclass can change its own
    # values without modifying the parent's.
    class UrlScraper < Scraper
      class << self
        # params     - value sent as the :params request option.
        # headers    - value sent as the :headers request option.
        # force_gzip - when truthy, requests set :accept_encoding to 'gzip'.
        attr_accessor :params, :headers, :force_gzip

        # Hands each new subclass its own deep copies of params and headers
        # and the current force_gzip flag.
        def inherited(subclass)
          super
          subclass.params = params.deep_dup
          subclass.headers = headers.deep_dup
          subclass.force_gzip = force_gzip
        end
      end

      self.params = {}
      self.headers = { 'User-Agent' => 'DevDocs' }
      self.force_gzip = false

      private

      # Runs a single request for +url+ with this scraper's request options.
      def request_one(url)
        Request.run(url, request_options)
      end

      # Runs requests for all +urls+, forwarding +block+ to Requester.run.
      def request_all(urls, &block)
        Requester.run(urls, request_options: request_options, &block)
      end

      # Builds the options Hash handed to Request / Requester from the
      # class-level settings. :accept_encoding is only present when
      # force_gzip is set.
      def request_options
        options = { params: self.class.params, headers: self.class.headers }
        options[:accept_encoding] = 'gzip' if self.class.force_gzip
        options
      end

      # Decides whether +response+ should be processed.
      #
      # Raises a RuntimeError when the response reports an error or is blank.
      # Otherwise returns a truthy value only for successful HTML responses
      # whose effective URL passes process_url?.
      def process_response?(response)
        if response.error?
          raise "Error status code (#{response.code}): #{response.return_message}\n#{response.url}"
        elsif response.blank?
          raise "Empty response body: #{response.url}"
        end

        response.success? && response.html? && process_url?(response.effective_url)
      end

      # True when +url+ is contained in the scraper's base URL.
      def process_url?(url)
        base_url.contains?(url)
      end

      # Loads Capybara and Selenium on demand (the requires are deliberately
      # inside the method), registers a :chrome driver started with the
      # `headless` and `disable-gpu` arguments, makes it both the JavaScript
      # and the current driver, turns off Capybara's own server, and returns
      # the Capybara module.
      def load_capybara_selenium
        require 'capybara/dsl'
        require 'selenium/webdriver'

        Capybara.register_driver :chrome do |app|
          options = Selenium::WebDriver::Chrome::Options.new(args: %w[headless disable-gpu])
          Capybara::Selenium::Driver.new(app, browser: :chrome, options: options)
        end

        Capybara.javascript_driver = :chrome
        Capybara.current_driver = :chrome
        Capybara.run_server = false
        Capybara
      end

      # Mixin for scrapers that accept pages from several base URLs.
      # Including it adds the class-level +base_urls+ setting.
      module MultipleBaseUrls
        def self.included(base)
          base.extend ClassMethods
        end

        module ClassMethods
          attr_reader :base_urls

          # Stores +urls+ and uses its first entry as the class's base_url.
          def base_urls=(urls)
            self.base_url = urls.first
            @base_urls = urls
          end
        end

        # Appends every base URL except the first to the initial URLs
        # returned by super. `drop(1)` yields [] for an empty list, where
        # `[1..-1]` would yield nil and make the `+` raise TypeError.
        def initial_urls
          super + self.class.base_urls.drop(1).deep_dup
        end

        # Memoized list of the class-level base URLs, each run through
        # URL.parse.
        def base_urls
          @base_urls ||= self.class.base_urls.map { |url| URL.parse(url) }
        end

        private

        # True when any of the base URLs contains +url+.
        def process_url?(url)
          base_urls.any? { |candidate| candidate.contains?(url) }
        end

        # Temporarily rewrites base_url's scheme, host and path to those of
        # the base URL containing the response's effective URL, delegates to
        # super, and always restores the original values afterwards.
        def process_response(response)
          original_scheme = base_url.scheme
          original_host = base_url.host
          original_path = base_url.path

          # NOTE(review): assumes some base URL contains the effective URL
          # (presumably guaranteed by process_url? being checked first --
          # confirm in Scraper); otherwise this is nil and the assignments
          # below raise NoMethodError.
          effective_base_url = base_urls.find { |candidate| candidate.contains?(response.effective_url) }

          base_url.scheme = effective_base_url.scheme
          base_url.host = effective_base_url.host
          base_url.path = effective_base_url.path
          super
        ensure
          base_url.scheme = original_scheme
          base_url.host = original_host
          base_url.path = original_path
        end
      end

      # Mixin that collects a map of path redirections in a preliminary pass
      # before pages are stored, and exposes that map through
      # additional_options. It can be either included or prepended.
      module FixRedirectionsBehavior
        def self.included(base)
          base.extend ClassMethods
        end

        def self.prepended(base)
          class << base
            prepend ClassMethods
          end
        end

        module ClassMethods
          # The redirections Hash while store_pages is running, nil otherwise.
          attr_reader :redirections

          # Fetches the redirections first, then runs the inherited
          # store_pages with them available.
          def store_pages(store)
            instrument 'info.doc', msg: 'Fetching redirections...'
            with_redirections do
              instrument 'info.doc', msg: 'Continuing...'
              super
            end
          end

          private

          # Sets @redirections from a fresh instance for the duration of the
          # block and clears it afterwards, even if the block raises.
          def with_redirections
            @redirections = new.fetch_redirections
            yield
          ensure
            @redirections = nil
          end
        end

        # Builds every page with a reduced filter set and returns a Hash that
        # maps the downcased :response_path to the :response_effective_path,
        # for pages where the two differ.
        def fetch_redirections
          result = {}
          with_filters 'apply_base_url', 'container', 'normalize_urls', 'internal_urls' do
            build_pages do |page|
              next if page[:response_effective_path] == page[:response_path]

              result[page[:response_path].downcase] = page[:response_effective_path]
            end
          end
          result
        end

        private

        # Adds the response's path and effective path to the page data
        # returned by super, so fetch_redirections can compare them.
        def process_response(response)
          super.merge! response_effective_path: response.effective_path, response_path: response.path
        end

        # Adds the class-level redirections map to the options from super.
        def additional_options
          super.merge! redirections: self.class.redirections
        end
      end
    end
  end