1
0

url_scraper.rb 3.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147
  1. module Docs
  2. class UrlScraper < Scraper
  # Class-level request configuration. Both values live in class instance
  # variables, so each scraper class holds its own params and headers.
  class << self
    # Value sent as :params in the options built by #request_options.
    attr_accessor :params
    # Value sent as :headers in the options built by #request_options.
    attr_accessor :headers

    # Inheritance hook: gives the new subclass its own copies of the parent's
    # params and headers, produced with deep_dup.
    #
    # NOTE(review): deep_dup is not core Ruby — presumably ActiveSupport's
    # Object#deep_dup; confirm it is loaded before this file.
    #
    # @param subclass [Class] the class that has just inherited from this one
    def inherited(subclass)
      super
      subclass.params = params.deep_dup
      subclass.headers = headers.deep_dup
    end
  end

  # Defaults for this class: no params and a fixed User-Agent header.
  self.params = {}
  self.headers = { 'User-Agent' => 'DevDocs' }
  private

  # Requests a single URL with this class's request options.
  #
  # @param url the address to fetch; handed to Request.run unchanged
  # @return whatever Request.run returns
  def request_one(url)
    Request.run(url, request_options)
  end
  # Requests several URLs through Requester with this class's request options.
  #
  # @param urls the addresses to fetch; handed to Requester.run unchanged
  # @yield forwarded as-is to Requester.run
  # @return whatever Requester.run returns
  def request_all(urls, &block)
    Requester.run(urls, request_options: request_options, &block)
  end
  # Builds the options hash passed to Request.run / Requester.run.
  #
  # The class-level params and headers objects are returned as-is (not
  # copied), so mutating the result mutates the class configuration.
  #
  # @return [Hash] { params: <class params>, headers: <class headers> }
  def request_options
    { params: self.class.params, headers: self.class.headers }
  end
  # Decides whether a response should be processed.
  #
  # @param response an object answering error?, success?, html?, code, url
  #   and effective_url
  # @return the result of success? && html? && process_url?(effective_url)
  # @raise [RuntimeError] when response.error? is truthy; the message carries
  #   the status code and the requested URL
  def process_response?(response)
    raise "Error status code (#{response.code}): #{response.url}" if response.error?

    response.success? && response.html? && process_url?(response.effective_url)
  end
  # Tells whether a URL belongs to this scraper, by delegating to
  # base_url.contains?.
  #
  # @param url the URL to check
  # @return whatever base_url.contains?(url) returns
  def process_url?(url)
    base_url.contains?(url)
  end
  # Loads Capybara on demand and configures it to use a Selenium driver.
  #
  # The require sits inside the method on purpose, so Capybara is only loaded
  # when this method is called. Each call registers the :selenium_marionette
  # driver again, makes it the current driver and sets run_server to false.
  #
  # NOTE(review): `marionette: true` is passed straight to
  # Capybara::Selenium::Driver — presumably selecting Firefox's Marionette
  # backend; confirm against the installed Capybara/Selenium versions.
  #
  # @return [Module] the Capybara module
  def load_capybara_selenium
    require 'capybara/dsl'

    Capybara.register_driver(:selenium_marionette) do |app|
      Capybara::Selenium::Driver.new(app, marionette: true)
    end

    Capybara.current_driver = :selenium_marionette
    Capybara.run_server = false
    Capybara
  end
  # Mixin for scrapers that cover more than one base URL. The first URL is
  # stored as the class's base_url; the remaining ones are appended to the
  # initial URLs, and a URL is accepted when any of the base URLs contains it.
  module MultipleBaseUrls
    # Adds the class-level API (base_urls / base_urls=) to the including class.
    def self.included(base)
      base.extend ClassMethods
    end

    module ClassMethods
      # The list last assigned through base_urls=.
      attr_reader :base_urls

      # Stores the list and makes its first entry the class's base_url.
      #
      # @param urls the base URLs; the first one becomes base_url
      def base_urls=(urls)
        self.base_url = urls.first
        @base_urls = urls
      end
    end

    # The inherited initial URLs followed by copies of every base URL except
    # the first.
    #
    # NOTE(review): deep_dup is not core Ruby — presumably ActiveSupport's;
    # confirm it is loaded.
    def initial_urls
      super + self.class.base_urls.drop(1).deep_dup
    end

    # The class's base URLs parsed with URL.parse, memoized per instance.
    def base_urls
      @base_urls ||= self.class.base_urls.map { |url| URL.parse(url) }
    end

    private

    # True when at least one of the parsed base URLs contains the given URL.
    def process_url?(url)
      base_urls.any? { |candidate| candidate.contains?(url) }
    end

    # Runs the inherited process_response while base_url temporarily carries
    # the scheme, host and path of the base URL that contains the response's
    # effective URL. base_url is mutated in place and its original values are
    # put back in `ensure`, so they are restored even when processing raises.
    #
    # @param response an object answering effective_url
    # @return whatever the inherited process_response returns
    def process_response(response)
      original_scheme, original_host, original_path = base_url.scheme, base_url.host, base_url.path

      matching = base_urls.find { |candidate| candidate.contains?(response.effective_url) }
      base_url.scheme = matching.scheme
      base_url.host = matching.host
      base_url.path = matching.path

      super
    ensure
      base_url.scheme = original_scheme
      base_url.host = original_host
      base_url.path = original_path
    end
  end
  # Mixin that collects redirections (requested path => effective path) in a
  # preliminary pass and exposes them, through additional_options, while the
  # pages are being stored.
  module FixRedirectionsBehavior
    # Adds the class-level API (redirections / store_pages) to the including class.
    def self.included(base)
      base.extend ClassMethods
    end

    module ClassMethods
      # The redirection map; only set while store_pages is running.
      attr_reader :redirections

      # Fetches the redirections, then runs the inherited store_pages while
      # they are available. An 'info.doc' message is instrumented before each
      # of the two phases.
      #
      # @param store passed on to the inherited store_pages
      # @return whatever the inherited store_pages returns
      def store_pages(store)
        instrument 'info.doc', msg: 'Fetching redirections...'

        with_redirections do
          instrument 'info.doc', msg: 'Building pages...'
          super
        end
      end

      private

      # Sets @redirections from a fresh instance's fetch_redirections for the
      # duration of the block, and resets it to nil afterwards — also when the
      # block raises.
      def with_redirections
        @redirections = new.fetch_redirections
        yield
      ensure
        @redirections = nil
      end
    end

    # Builds the redirection map by running build_pages inside
    # with_filters 'container', 'normalize_urls', 'internal_urls'.
    # NOTE(review): presumably with_filters limits the pipeline to those
    # filters — confirm in Scraper.
    #
    # Pages whose effective path equals the requested path are skipped; for
    # the others the lower-cased requested path maps to the effective path.
    #
    # @return [Hash] lower-cased response path => effective response path
    def fetch_redirections
      result = {}

      with_filters 'container', 'normalize_urls', 'internal_urls' do
        build_pages do |page|
          next if page[:response_effective_path] == page[:response_path]

          result[page[:response_path].downcase] = page[:response_effective_path]
        end
      end

      result
    end

    private

    # Extends the inherited result (mutated in place with merge!) with the
    # response's requested and effective paths, which fetch_redirections reads.
    def process_response(response)
      super.merge!(response_effective_path: response.effective_path, response_path: response.path)
    end

    # Exposes the class-level redirection map under the :redirections key.
    # NOTE(review): presumably merged into the options given to the filters —
    # confirm in Scraper.
    def additional_options
      { redirections: self.class.redirections }
    end
  end
  117. end
  118. end