url_scraper.rb 5.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210
  1. module Docs
  2. class UrlScraper < Scraper
  class << self
    # Class-level request settings. Each subclass receives its own values
    # through the inherited hook below.
    attr_accessor :params, :headers, :force_gzip

    # Subclass hook: seeds +subclass+ with this class's current settings.
    # params and headers are deep-duplicated so that changes made on the
    # subclass do not leak back into the parent's hashes; force_gzip is
    # copied as-is.
    def inherited(subclass)
      super
      %i[params headers].each do |setting|
        subclass.public_send(:"#{setting}=", public_send(setting).deep_dup)
      end
      subclass.force_gzip = force_gzip
    end
  end
  # Default request settings for this class; subclasses get their own copies
  # through the inherited hook defined in the singleton block of this class.
  self.force_gzip = false
  self.headers = { 'User-Agent' => 'DevDocs' }
  self.params = {}

  # Limiter shared by this class and all subclasses (class variable).
  # Starts out unset; request_all installs one when a scraper declares a
  # :rate_limit option.
  @@rate_limiter = nil
  18. private
  # Requests a single URL by handing it to Request.run together with the
  # options built by request_options. Returns whatever Request.run returns.
  def request_one(url)
    Request.run(url, request_options)
  end
  # Requests every URL in +urls+ through Requester.run, forwarding +block+.
  #
  # Rate limiting: the limiter lives in a class variable and, once created,
  # stays registered as a Typhoeus "before" callback for the rest of the
  # process. Its limit is therefore refreshed on every call, including for
  # scrapers that declare no :rate_limit option. Without that refresh, such a
  # scraper would keep being throttled at whatever limit an earlier scraper
  # installed.
  def request_all(urls, &block)
    limit = options[:rate_limit]

    if @@rate_limiter
      # Float::INFINITY is never reached by the limiter's counter, so an
      # unlimited scraper is never put to sleep.
      @@rate_limiter.limit = limit || Float::INFINITY
    elsif limit
      @@rate_limiter = RateLimiter.new(limit)
      Typhoeus.before(&@@rate_limiter.to_proc)
    end

    Requester.run urls, request_options: request_options, &block
  end
  # Builds the options hash passed along with each request: the class-level
  # params and headers, plus accept_encoding: 'gzip' when the class has
  # force_gzip enabled. A new hash is returned on every call; the params and
  # headers values are the class's own objects, not copies.
  def request_options
    settings = self.class
    { params: settings.params, headers: settings.headers }.tap do |request_opts|
      request_opts[:accept_encoding] = 'gzip' if settings.force_gzip
    end
  end
  # Decides whether +response+ should be processed.
  #
  # Raises a RuntimeError when the response reports an error (the message
  # carries the status code, return message, URL and pretty-printed headers)
  # or when the response is blank. Otherwise the result is truthy only for a
  # successful HTML response whose effective URL passes process_url?.
  def process_response?(response)
    if response.error?
      # slice(2..-3) drops the opening "{\n" and closing "\n}" of the
      # pretty-printed JSON so that only the header lines remain.
      raise <<~ERROR
        Error status code (#{response.code}): #{response.return_message}
        #{response.url}
        #{JSON.pretty_generate(response.headers).slice(2..-3)}
      ERROR
    end

    raise "Empty response body: #{response.url}" if response.blank?

    response.success? && response.html? && process_url?(response.effective_url)
  end
  # A URL is eligible for processing when base_url reports that it contains
  # that URL. Returns whatever base_url.contains? returns.
  def process_url?(url)
    base_url.contains? url
  end
  # Loads Capybara and Selenium on demand and configures Capybara to use a
  # :chrome driver started with the "headless" and "disable-gpu" arguments.
  # The libraries are required here rather than at file load, so they are
  # only needed when this method is called. Returns the Capybara module.
  def load_capybara_selenium
    require 'capybara/dsl'
    require 'selenium/webdriver'

    Capybara.register_driver(:chrome) do |app|
      chrome_options = Selenium::WebDriver::Chrome::Options.new(args: %w[headless disable-gpu])
      Capybara::Selenium::Driver.new(app, browser: :chrome, options: chrome_options)
    end

    Capybara.tap do |capybara|
      capybara.javascript_driver = :chrome
      capybara.current_driver = :chrome
      capybara.run_server = false
    end
  end
  # Mixin for scrapers that crawl more than one base URL. The first entry of
  # the class-level base_urls list becomes the regular base_url; the remaining
  # entries are added to the initial URLs and are accepted by process_url?.
  module MultipleBaseUrls
    # Adds the class-level base_urls accessor to the including class.
    def self.included(base)
      base.extend(ClassMethods)
    end

    module ClassMethods
      attr_reader :base_urls

      # Stores the full list and makes its first entry the class's base_url.
      def base_urls=(urls)
        self.base_url = urls.first
        @base_urls = urls
      end
    end

    # The inherited initial URLs followed by a deep copy of every base URL
    # after the first.
    def initial_urls
      inherited_urls = super
      inherited_urls + self.class.base_urls[1..-1].deep_dup
    end

    # Parsed form of the class-level base URLs, memoized per instance.
    def base_urls
      @base_urls ||= self.class.base_urls.map { |raw_url| URL.parse(raw_url) }
    end

    private

    # A URL is eligible when any of the base URLs contains it.
    def process_url?(url)
      base_urls.any? { |candidate| candidate.contains?(url) }
    end

    # Runs the inherited processing with base_url temporarily rewritten to the
    # base URL that contains the response's effective URL. The original
    # scheme, host and path are restored afterwards, even if processing
    # raises.
    def process_response(response)
      saved_scheme = base_url.scheme
      saved_host = base_url.host
      saved_path = base_url.path

      matching_base = base_urls.find { |candidate| candidate.contains?(response.effective_url) }

      base_url.scheme = matching_base.scheme
      base_url.host = matching_base.host
      base_url.path = matching_base.path

      super
    ensure
      base_url.scheme = saved_scheme
      base_url.host = saved_host
      base_url.path = saved_path
    end
  end
  # Mixin that collects a map of redirected paths before pages are stored and
  # exposes it to the scraper through additional_options[:redirections].
  # Works with both include and prepend.
  module FixRedirectionsBehavior
    # include: the class-level methods are added by extending the class.
    def self.included(base)
      base.extend(ClassMethods)
    end

    # prepend: the class-level methods are prepended to the singleton class,
    # so they run ahead of methods defined directly on the class itself.
    def self.prepended(base)
      base.singleton_class.prepend(ClassMethods)
    end

    module ClassMethods
      # The redirection map; only set while store_pages is running.
      def redirections
        @redirections
      end

      # Fetches the redirections first, then runs the inherited store_pages
      # with the map available.
      def store_pages(store)
        instrument('info.doc', msg: 'Fetching redirections...')
        with_redirections do
          instrument('info.doc', msg: 'Continuing...')
          super
        end
      end

      private

      # Sets @redirections for the duration of the block and always clears it
      # afterwards.
      def with_redirections
        @redirections = new.fetch_redirections
        yield
      ensure
        @redirections = nil
      end
    end

    # Builds the pages with a reduced filter set and returns a hash that maps
    # each requested path (downcased) to its effective path. Pages whose
    # effective path equals the requested path are left out.
    def fetch_redirections
      redirections = {}

      with_filters(*%w[apply_base_url container normalize_urls internal_urls]) do
        build_pages do |page|
          next if page[:response_effective_path] == page[:response_path]

          requested_path = page[:response_path].downcase
          redirections[requested_path] = page[:response_effective_path]
        end
      end

      redirections
    end

    private

    # Adds the requested and effective paths to the inherited result so that
    # fetch_redirections can compare them.
    def process_response(response)
      page = super
      page.merge!(response_effective_path: response.effective_path, response_path: response.path)
    end

    # Adds the class-level redirection map to the inherited options.
    def additional_options
      inherited_options = super
      inherited_options.merge!(redirections: self.class.redirections)
    end
  end
  # Callable that throttles by clock minute: it counts calls within the
  # current minute and, once the count reaches +limit+, sleeps for the rest of
  # that minute. It always returns true.
  class RateLimiter
    attr_accessor :limit

    def initialize(limit)
      @limit = limit
      @minute = nil
      @counter = 0
    end

    # Counts one call and blocks when the limit for this minute is reached.
    # Any arguments are accepted and ignored.
    def call(*)
      start_new_window unless @minute == Time.now.min
      @counter += 1
      wait_for_next_minute if @counter >= @limit
      true
    end

    # Proc form of #call, suitable for passing with & as a callback.
    def to_proc
      method(:call).to_proc
    end

    private

    # Remembers the current minute and restarts the count.
    def start_new_window
      @minute = Time.now.min
      @counter = 0
    end

    # Sleeps for the whole seconds left in the current minute, plus one.
    # NOTE(review): Time#end_of_minute is not core Ruby; presumably supplied
    # by ActiveSupport, which must already be loaded - confirm.
    def wait_for_next_minute
      sleep(Time.now.end_of_minute.to_i - Time.now.to_i + 1)
    end
  end
  170. end
  171. end