
Implement initial_paths scraper option

Thibaut, 12 years ago
commit f8298181e9
2 changed files with 79 additions and 20 deletions
  1. lib/docs/core/scraper.rb (+19 -8)
  2. test/lib/docs/core/scraper_test.rb (+60 -12)

lib/docs/core/scraper.rb (+19 -8)

@@ -4,7 +4,7 @@ require 'html/pipeline'
 module Docs
   class Scraper < Doc
     class << self
-      attr_accessor :base_url, :root_path, :html_filters, :text_filters, :options
+      attr_accessor :base_url, :root_path, :initial_paths, :options, :html_filters, :text_filters
 
       def inherited(subclass)
         super
@@ -15,6 +15,7 @@ module Docs
         end
 
         subclass.root_path = root_path
+        subclass.initial_paths = initial_paths.dup
         subclass.options = options.deep_dup
         subclass.html_filters = html_filters.inheritable_copy
         subclass.text_filters = text_filters.inheritable_copy
@@ -27,11 +28,12 @@ module Docs
 
     include Instrumentable
 
+    self.initial_paths = []
+    self.options = {}
+
     self.html_filters = FilterStack.new
     self.text_filters = FilterStack.new
 
-    self.options = {}
-
     html_filters.push 'container', 'clean_html', 'normalize_urls', 'internal_urls', 'normalize_paths'
     text_filters.push 'inner_html', 'clean_text', 'attribution'
 
@@ -43,10 +45,10 @@ module Docs
     end
 
     def build_pages
-      requested_urls = Set.new [root_url.to_s.downcase]
-      instrument 'running.scraper', urls: requested_urls.to_a
+      requested_urls = Set.new initial_urls.map(&:downcase)
+      instrument 'running.scraper', urls: initial_urls
 
-      request_all root_url.to_s do |response|
+      request_all initial_urls do |response|
         next unless data = handle_response(response)
         yield data
         next unless data[:internal_urls].present?
@@ -72,6 +74,14 @@ module Docs
       root_path.present? && root_path != '/'
     end
 
+    def initial_paths
+      self.class.initial_paths
+    end
+
+    def initial_urls
+      @initial_urls ||= [root_url.to_s].concat(initial_paths.map(&method(:url_for))).freeze
+    end
+
     def pipeline
       @pipeline ||= ::HTML::Pipeline.new(self.class.filters).tap do |pipeline|
         pipeline.instrumentation_service = Docs
@@ -80,14 +90,15 @@ module Docs
 
     def options
       @options ||= self.class.options.deep_dup.tap do |options|
-        options.merge! base_url: base_url, root_url: root_url, root_path: root_path
+        options.merge! base_url: base_url, root_url: root_url,
+                       root_path: root_path, initial_paths: initial_paths
 
         if root_path?
           (options[:skip] ||= []).concat ['', '/']
         end
 
         if options[:only] || options[:only_patterns]
-          (options[:only] ||= []).concat root_path? ? [root_path] : ['', '/']
+          (options[:only] ||= []).concat initial_paths + (root_path? ? [root_path] : ['', '/'])
         end
 
         options.freeze

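
For context, a minimal sketch (not part of this commit) of how a scraper definition could use the new option, assuming the devdocs environment is loaded. The `Docs::Hypothetical` class and its URLs are made up for illustration and mirror the values used in the test file below:

```ruby
module Docs
  class Hypothetical < Scraper
    self.base_url  = 'http://example.com/'
    self.root_path = '/root'
    # New in this commit: extra entry points requested on the first
    # crawl pass, in addition to the root url.
    self.initial_paths = ['/initial']
  end
end

scraper = Docs::Hypothetical.new
scraper.initial_urls
# => roughly ['http://example.com/root', 'http://example.com/initial']
#    (a frozen, memoized Array, per #initial_urls above)
```
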
test/lib/docs/core/scraper_test.rb (+60 -12)

@@ -6,6 +6,7 @@ class DocsScraperTest < MiniTest::Spec
     self.type = 'scraper'
     self.base_url = 'http://example.com/'
     self.root_path = '/root'
+    self.initial_paths = ['/initial']
     self.html_filters = Docs::FilterStack.new
     self.text_filters = Docs::FilterStack.new
   end
@@ -37,6 +38,12 @@ class DocsScraperTest < MiniTest::Spec
       assert_equal Scraper.root_path, subclass.root_path
     end
 
+    it "duplicates .initial_paths" do
+      stub(Scraper).initial_paths { ['path'] }
+      assert_equal Scraper.initial_paths, subclass.initial_paths
+      refute_same Scraper.initial_paths, subclass.initial_paths
+    end
+
     it "duplicates .options" do
       stub(Scraper).options { { test: [] } }
       assert_equal Scraper.options, subclass.options
@@ -112,6 +119,29 @@ class DocsScraperTest < MiniTest::Spec
     end
   end
 
+  describe "#initial_urls" do
+    let :initial_urls do
+      scraper.initial_urls
+    end
+
+    it "returns a frozen, memoized Array" do
+      assert_instance_of Array, initial_urls
+      assert initial_urls.frozen?
+      assert_same initial_urls, scraper.initial_urls
+    end
+
+    it "includes the #root_url" do
+      assert_includes initial_urls, scraper.root_url.to_s
+    end
+
+    it "includes the .initial_paths converted to urls" do
+      stub(Scraper).base_url { 'http://example.com/' }
+      stub(Scraper).initial_paths { ['one', '/two'] }
+      assert_includes initial_urls, 'http://example.com/one'
+      assert_includes initial_urls, 'http://example.com/two'
+    end
+  end
+
   describe "#build_page" do
     before do
       stub(scraper).handle_response
@@ -171,8 +201,8 @@ class DocsScraperTest < MiniTest::Spec
       Proc.new {}
     end
 
-    it "requests the root url" do
-      mock(scraper).request_all(scraper.root_url.to_s)
+    it "requests the #initial_urls" do
+      mock(scraper).request_all(scraper.initial_urls)
       scraper.build_pages(&block)
     end
 
@@ -181,12 +211,14 @@ class DocsScraperTest < MiniTest::Spec
       scraper.build_pages(&block)
       assert scraper.last_instrumentation
       assert_equal 'running.scraper', scraper.last_instrumentation[:event]
-      assert_includes scraper.last_instrumentation[:payload][:urls], scraper.root_url.to_s
+      assert_equal scraper.initial_urls, scraper.last_instrumentation[:payload][:urls]
     end
 
     context "when the response is processable" do
       before do
-        stub(scraper).request_all.with_any_args { |*args| @returned = args.last.call(response) }
+        stub(scraper).request_all do |urls, block|
+          urls.each { |url| @next_urls ||= block.call(response) }
+        end
         stub(scraper).handle_response(response) { processed_response }
       end
 
@@ -203,7 +235,7 @@ class DocsScraperTest < MiniTest::Spec
 
         it "requests nothing more" do
           scraper.build_pages(&block)
-          assert_nil @returned
+          assert_nil @next_urls
         end
 
         it "doesn't instrument 'queued'" do
@@ -223,14 +255,14 @@ class DocsScraperTest < MiniTest::Spec
 
         it "requests the urls" do
           scraper.build_pages(&block)
-          assert_equal urls, @returned
+          assert_equal urls, @next_urls
         end
 
         it "doesn't request the same url twice irrespective of case" do
           stub(Scraper).root_path { 'PATH' }
-          processed_response[:internal_urls] = [scraper.root_url.to_s.swapcase]
+          processed_response[:internal_urls] = scraper.initial_urls.map(&:swapcase)
           scraper.build_pages(&block)
-          assert_empty @returned
+          assert_empty @next_urls
         end
 
         it "instruments 'queued'" do
@@ -280,6 +312,10 @@ class DocsScraperTest < MiniTest::Spec
       assert_equal '/root', options[:root_path]
     end
 
+    it "includes .initial_paths" do
+      assert_equal ['/initial'], options[:initial_paths]
+    end
+
     context "when #root_path? is false" do
       before do
         stub(scraper).root_path? { false }
@@ -300,15 +336,27 @@ class DocsScraperTest < MiniTest::Spec
           assert_includes options[:only], '/'
         end
 
+        it "adds .initial_paths" do
+          assert_includes options[:only], '/initial'
+        end
+
         it "doesn't modify the array in place" do
           assert_equal ['/path'], Scraper.options[:only]
         end
       end
 
       context "and :only_patterns is an array" do
-        it "assigns ['', '/'] to :only" do
+        before do
           stub(Scraper).options { { only_patterns: [] } }
-          assert_equal ['', '/'], options[:only]
+        end
+
+        it "adds ['', '/'] to :only" do
+          assert_includes options[:only], ''
+          assert_includes options[:only], '/'
+        end
+
+        it "adds .initial_paths to :only" do
+          assert_includes options[:only], '/initial'
         end
       end
     end
@@ -348,9 +396,9 @@ class DocsScraperTest < MiniTest::Spec
       end
 
       context "and :only_patterns is an array" do
-        it "assigns [.root_path] to :only" do
+        it "adds .root_path to :only" do
           stub(Scraper).options { { only_patterns: [] } }
-          assert_equal ['/root'], options[:only]
+          assert_includes options[:only], '/root'
         end
       end
     end
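
Similarly, a rough sketch (again not part of the commit, class name hypothetical) of how the merged instance options change when `:only_patterns` is set: the initial paths are now whitelisted in `:only` alongside the root path instead of being filtered out by the patterns.

```ruby
module Docs
  class HypotheticalFiltered < Scraper
    self.base_url  = 'http://example.com/'
    self.root_path = '/root'
    self.initial_paths = ['/initial']
    # Mutate the class-level options hash, as concrete scrapers do.
    options[:only_patterns] = [/guide/]
  end
end

options = Docs::HypotheticalFiltered.new.options
options[:initial_paths] # => ['/initial'] (now always merged into the options)
options[:only]          # => roughly ['/initial', '/root']
```
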