# url_scraper.rb (2.3 KB)

  module Docs
    # Scraper whose pages are fetched through +Request+ / +Requester+.
    #
    # Each class carries its own +params+ and +headers+, which are sent along
    # with every request via #request_options. Subclasses start out with deep
    # copies of their parent's values, so changing them in a subclass does not
    # affect the parent (assumes +deep_dup+ comes from ActiveSupport -- TODO confirm).
    class UrlScraper < Scraper
      class << self
        # Per-class request parameters and request headers.
        attr_accessor :params, :headers

        # Gives every new subclass its own copy of the parent's params/headers.
        def inherited(subclass)
          super
          subclass.params = params.deep_dup
          subclass.headers = headers.deep_dup
        end
      end

      self.params = {}
      self.headers = { 'User-Agent' => 'devdocs.io' }

      private

      # Runs a single request for +url+ with this class's request options.
      def request_one(url)
        Request.run url, request_options
      end

      # Runs requests for all +urls+, forwarding +block+ to Requester.run.
      def request_all(urls, &block)
        Requester.run urls, request_options: request_options, &block
      end

      # Options handed to Request/Requester: the class-level params and headers.
      def request_options
        { params: self.class.params, headers: self.class.headers }
      end

      # A response is processed only when it succeeded, is HTML, and its
      # effective URL is contained in +base_url+.
      # NOTE(review): "effective URL" is presumably the URL after redirects -- confirm.
      def process_response?(response)
        response.success? && response.html? && base_url.contains?(response.effective_url)
      end

      # Opt-in behavior (enabled per class with +fix_redirections = true+) that
      # collects a map of redirected paths before the pages are stored and
      # exposes it through #additional_options.
      module FixRedirectionsBehavior
        def self.included(base)
          base.extend ClassMethods
        end

        module ClassMethods
          # Set to a truthy value to enable the redirection pass in .store_pages.
          attr_accessor :fix_redirections
          # The redirection map; only non-nil while .store_pages is running
          # with +fix_redirections+ enabled (see #with_redirections).
          attr_reader :redirections

          # Without +fix_redirections+ this simply defers to +super+. Otherwise
          # the redirections are fetched first, then +super+ runs while they
          # are available through .redirections.
          def store_pages(store)
            return super unless fix_redirections

            instrument 'info.doc', msg: 'Fetching redirections...'
            with_redirections do
              instrument 'info.doc', msg: 'Building pages...'
              # Bare +super+ inside the block still forwards +store+.
              super
            end
          end

          private

          # Populates @redirections from a fresh instance for the duration of
          # the block, and always clears it afterwards, even on error.
          def with_redirections
            @redirections = new.fetch_redirections
            yield
          ensure
            @redirections = nil
          end
        end

        # Builds the pages with a reduced filter set and returns a Hash mapping
        # each lowercased requested path to its effective path, for every page
        # where the two differ.
        def fetch_redirections
          result = {}
          with_filters 'container', 'normalize_urls', 'internal_urls' do
            build_pages do |page|
              # Same path before and after: the page was not redirected.
              next if page[:response_effective_path] == page[:response_path]

              result[page[:response_path].downcase] = page[:response_effective_path]
            end
          end
          result
        end

        private

        # Extends the Hash returned by +super+ (in place, via +merge!+) with
        # the response's requested and effective paths, which
        # #fetch_redirections reads.
        def process_response(response)
          super.merge! response_effective_path: response.effective_path, response_path: response.path
        end

        # Exposes the class-level redirection map.
        # NOTE(review): presumably consumed by the parent scraper when it
        # builds its options -- confirm against Scraper.
        def additional_options
          { redirections: self.class.redirections }
        end

        # Temporarily replaces the pipeline's filters with +filters+ for the
        # duration of the block.
        def with_filters(*filters)
          stack = FilterStack.new
          stack.push(*filters)
          # Overwrites the pipeline's @filters directly with the frozen list.
          pipeline.instance_variable_set :@filters, stack.to_a.freeze
          yield
        ensure
          # Drop the modified pipeline; presumably +pipeline+ is memoized in
          # @pipeline and gets rebuilt with the default filters -- TODO confirm.
          @pipeline = nil
        end
      end

      include FixRedirectionsBehavior
    end
  end