url_scraper.rb 2.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293
  module Docs
    # Scraper whose request helpers delegate to Request / Requester, handing
    # them the class-level +params+ hash as request options.
    class UrlScraper < Scraper
      class << self
        # Class-level hash sent with every request under the :params key.
        attr_accessor :params
      end

      # Gives each subclass its own deep copy of the parent's params so the
      # two hashes are not shared.
      def self.inherited(subclass)
        super
        subclass.params = params.deep_dup
      end

      self.params = {}

      private

      # Runs a single request for +url+ with this scraper's request options.
      def request_one(url)
        Request.run(url, request_options)
      end

      # Runs requests for all +urls+, forwarding +block+ to Requester.
      def request_all(urls, &block)
        Requester.run(urls, request_options: request_options, &block)
      end

      # Options passed to Request / Requester: the class-level params.
      def request_options
        { params: self.class.params }
      end

      # A response is processed only when it is successful, is HTML, and its
      # effective URL is contained in base_url.
      def process_response?(response)
        response.success? &&
          response.html? &&
          base_url.contains?(response.effective_url)
      end

      # Opt-in behavior (enabled per class via +fix_redirections+) that does a
      # preliminary pass to collect redirected paths and exposes them through
      # the :redirections option while pages are stored.
      module FixRedirectionsBehavior
        # Including this module also extends the including class with
        # ClassMethods.
        def self.included(base)
          base.extend(ClassMethods)
        end

        module ClassMethods
          # Flag: when falsy, store_pages defers to super unchanged.
          attr_accessor :fix_redirections
          # Redirection map; only set while with_redirections is running,
          # nil otherwise.
          attr_reader :redirections

          # Wraps the inherited store_pages: when fix_redirections is set,
          # the redirections are fetched first and kept available for the
          # duration of the inherited call.
          def store_pages(store)
            if fix_redirections
              instrument 'info.doc', msg: 'Fetching redirections...'
              with_redirections do
                instrument 'info.doc', msg: 'Building pages...'
                # Implicit super: forwards the +store+ argument as received.
                super
              end
            else
              super
            end
          end

          private

          # Populates @redirections from a fresh scraper instance, yields,
          # and always resets @redirections to nil afterwards (also when the
          # fetch or the block raises).
          def with_redirections
            scraper = new
            @redirections = scraper.fetch_redirections
            yield
          ensure
            @redirections = nil
          end
        end

        # Builds pages with only the 'container', 'normalize_urls' and
        # 'internal_urls' filters and returns a hash mapping each downcased
        # :response_path to its :response_effective_path, for the pages where
        # the two differ.
        def fetch_redirections
          {}.tap do |redirects|
            with_filters 'container', 'normalize_urls', 'internal_urls' do
              build_pages do |page|
                effective = page[:response_effective_path]
                requested = page[:response_path]
                next if effective == requested

                redirects[requested.downcase] = effective
              end
            end
          end
        end

        private

        # Extends the inherited result (mutated in place via merge!) with the
        # response's path and effective path.
        def process_response(response)
          page = super
          page.merge!(
            response_effective_path: response.effective_path,
            response_path: response.path
          )
        end

        # Exposes the class-level redirections under the :redirections key.
        def additional_options
          { redirections: self.class.redirections }
        end

        # Overwrites the pipeline's @filters with a frozen array built from
        # the given filter names, yields, then clears @pipeline.
        # NOTE(review): presumably clearing @pipeline makes the next
        # `pipeline` call rebuild it with the regular filters - confirm
        # against Scraper#pipeline.
        def with_filters(*filters)
          filter_stack = FilterStack.new.tap { |stack| stack.push(*filters) }
          pipeline.instance_variable_set(:@filters, filter_stack.to_a.freeze)
          yield
        ensure
          @pipeline = nil
        end
      end

      include FixRedirectionsBehavior
    end
  end