# url_scraper.rb (2.3 KB)
  module Docs
    # Scraper whose requests are issued through Request / Requester using
    # class-level `params` and `headers`.
    class UrlScraper < Scraper
      class << self
        # Class-level request settings; forwarded to every request through
        # #request_options.
        attr_accessor :params, :headers

        # Hands each subclass its own deep copy of the parent's params and
        # headers, so a subclass can change them without touching the parent's.
        def inherited(subclass)
          super
          subclass.params = params.deep_dup
          subclass.headers = headers.deep_dup
        end
      end

      # Defaults for the base class: no params, and a 'DevDocs' User-Agent header.
      self.params = {}
      self.headers = { 'User-Agent' => 'DevDocs' }

      private

      # Runs a single request for +url+ with the class-level request options.
      def request_one(url)
        Request.run(url, request_options)
      end

      # Runs requests for +urls+ with the class-level request options,
      # forwarding +block+ to Requester.run.
      def request_all(urls, &block)
        Requester.run(urls, request_options: request_options, &block)
      end

      # Options hash built from the class-level params and headers.
      def request_options
        scraper_class = self.class
        { params: scraper_class.params, headers: scraper_class.headers }
      end

      # Decides whether +response+ should be processed.
      #
      # Raises a RuntimeError when the response reports an error. Otherwise the
      # result is truthy only if the response is successful, is HTML, and its
      # effective URL is contained in base_url.
      def process_response?(response)
        raise "Error status code (#{response.code}): #{response.url}" if response.error?

        response.success? && response.html? && base_url.contains?(response.effective_url)
      end

      # Lazily loads Capybara, registers a :selenium_marionette driver (Selenium
      # driver created with `marionette: true`), makes it the current driver,
      # turns off Capybara's own server, and returns the Capybara module.
      def load_capybara_selenium
        require 'capybara/dsl'

        Capybara.register_driver(:selenium_marionette) { |app| Capybara::Selenium::Driver.new(app, marionette: true) }
        Capybara.current_driver = :selenium_marionette
        Capybara.run_server = false
        Capybara
      end

      # Mixin that records path redirections before pages are stored and exposes
      # them through #additional_options.
      module FixRedirectionsBehavior
        # Extends the including class with ClassMethods.
        def self.included(base)
          base.extend(ClassMethods)
        end

        module ClassMethods
          # Redirections gathered for the current store_pages run; nil outside of it.
          attr_reader :redirections

          # Fetches the redirections first, then delegates to the inherited
          # store_pages while they are available.
          def store_pages(store)
            instrument('info.doc', msg: 'Fetching redirections...')
            with_redirections do
              instrument('info.doc', msg: 'Building pages...')
              super
            end
          end

          private

          # Populates @redirections from a fresh instance for the duration of
          # the block and always resets it to nil afterwards, even on error.
          def with_redirections
            @redirections = new.fetch_redirections
            yield
          ensure
            @redirections = nil
          end
        end

        # Builds the pages with only the 'container', 'normalize_urls' and
        # 'internal_urls' filters and returns a hash mapping each downcased
        # requested path to its effective path, for pages where the two differ.
        def fetch_redirections
          {}.tap do |redirections|
            with_filters('container', 'normalize_urls', 'internal_urls') do
              build_pages do |page|
                effective_path = page[:response_effective_path]
                requested_path = page[:response_path]
                redirections[requested_path.downcase] = effective_path unless effective_path == requested_path
              end
            end
          end
        end

        private

        # Adds the response's effective path and requested path to the
        # inherited result (merged in place).
        def process_response(response)
          super.merge!(
            response_effective_path: response.effective_path,
            response_path: response.path
          )
        end

        # Exposes the class-level redirections under the :redirections key.
        # NOTE(review): presumably read by a filter downstream; confirm against the caller.
        def additional_options
          { redirections: self.class.redirections }
        end
      end
    end
  end