Просмотр исходного кода

Add two-pass redirection rewriter

... to avoid having to maintain huge lists of redirects. This works by doing a first pass to detect which internal URL is redirected where, before doing a second (normal) pass that rewrites all these URLs (links) with their final destination. There's a bit of monkey-patching I'm not proud of, but this works(tm).
Thibaut 10 лет назад
Родитель
Сommit
018628ea7d

+ 1 - 0
lib/docs/core/scraper.rb

@@ -100,6 +100,7 @@ module Docs
           (options[:only] ||= []).concat initial_paths + (root_path? ? [root_path] : ['', '/'])
         end
 
+        options.merge!(additional_options) if respond_to?(:additional_options, true)
         options.freeze
       end
     end

+ 61 - 0
lib/docs/core/scrapers/url_scraper.rb

@@ -28,5 +28,66 @@ module Docs
     def process_response?(response)
       response.success? && response.html? && base_url.contains?(response.effective_url)
     end
+
+    module FixRedirectionsBehavior
+      def self.included(base)
+        base.extend ClassMethods
+      end
+
+      module ClassMethods
+        attr_accessor :fix_redirections
+        attr_reader :redirections
+
+        def store_pages(store)
+          return super unless fix_redirections
+          instrument 'info.doc', msg: 'Fetching redirections...'
+          with_redirections do
+            instrument 'info.doc', msg: 'Building pages...'
+            super
+          end
+        end
+
+        private
+
+        def with_redirections
+          @redirections = new.fetch_redirections
+          yield
+        ensure
+          @redirections = nil
+        end
+      end
+
+      def fetch_redirections
+        result = {}
+        with_filters 'container', 'normalize_urls', 'internal_urls' do
+          build_pages do |page|
+            next if page[:response_effective_path] == page[:response_path]
+            result[page[:response_path].downcase] = page[:response_effective_path]
+          end
+        end
+        result
+      end
+
+      private
+
+      def process_response(response)
+        super.merge! response_effective_path: response.effective_path, response_path: response.path
+      end
+
+      def additional_options
+        { redirections: self.class.redirections }
+      end
+
+      def with_filters(*filters)
+        stack = FilterStack.new
+        stack.push(*filters)
+        pipeline.instance_variable_set :@filters, stack.to_a.freeze
+        yield
+      ensure
+        @pipeline = nil
+      end
+    end
+
+    include FixRedirectionsBehavior
   end
 end

+ 37 - 11
lib/docs/filters/core/normalize_urls.rb

@@ -19,8 +19,12 @@ module Docs
 
     def normalize_url(str)
       url = to_absolute_url(str)
-      fix_url(url)
-      fix_url_string(url.to_s)
+
+      while new_url = fix_url(url)
+        url = new_url
+      end
+
+      url.to_s
     rescue URI::InvalidURIError
       '#'
     end
@@ -31,18 +35,40 @@ module Docs
     end
 
     def fix_url(url)
-      return unless context[:replace_paths]
-      path = subpath_to(url)
+      if context[:redirections]
+        url = URL.parse(url)
+        path = url.path.downcase
 
-      if context[:replace_paths].has_key?(path)
-        url.path = url.path.sub %r[#{path}\z], context[:replace_paths][path]
+        if context[:redirections].key?(path)
+          url.path = context[:redirections][path]
+          return url
+        end
       end
-    end
 
-    def fix_url_string(str)
-      str = context[:replace_urls][str]  || str if context[:replace_urls]
-      str = context[:fix_urls].call(str) || str if context[:fix_urls]
-      str
+      if context[:replace_paths]
+        url = URL.parse(url)
+        path = subpath_to(url)
+
+        if context[:replace_paths].key?(path)
+          url.path = url.path.sub %r[#{path}\z], context[:replace_paths][path]
+          return url
+        end
+      end
+
+      if context[:replace_urls]
+        url = url.to_s
+
+        if context[:replace_urls].key?(url)
+          return context[:replace_urls][url]
+        end
+      end
+
+      if context[:fix_urls]
+        url = url.to_s
+        orig_url = url.dup
+        new_url = context[:fix_urls].call(url)
+        return new_url if new_url != orig_url
+      end
     end
   end
 end

+ 4 - 0
lib/docs/subscribers/doc_subscriber.rb

@@ -16,6 +16,10 @@ module Docs
       log_diff before.keys, after.keys
     end
 
+    def info(event)
+      log event.payload[:msg]
+    end
+
     private
 
     def parse_payload(event)

+ 25 - 1
test/lib/docs/filters/core/normalize_urls_test.rb

@@ -116,7 +116,7 @@ class NormalizeUrlsFilterTest < MiniTest::Spec
     end
 
     it "calls the block with each absolute url" do
-      context[:fix_urls] = ->(arg) { (@args ||= []).push(arg) }
+      context[:fix_urls] = ->(arg) { (@args ||= []).push(arg); nil }
       @body += link_to '/path?#'
       filter.call
       assert_equal ['http://example.com/path?#'] * 2, @args
@@ -139,4 +139,28 @@ class NormalizeUrlsFilterTest < MiniTest::Spec
       refute @called
     end
   end
+
+  context "when context[:redirections] is a hash" do
+    before do
+      @body = link_to 'http://example.com/path?query#frag'
+    end
+
+    it "replaces the path of matching urls, case-insensitive" do
+      @body = link_to('http://example.com/PATH?query#frag') + link_to('http://example.com/path/two')
+      context[:redirections] = { '/path' => '/fixed' }
+      expected = link_to('http://example.com/fixed?query#frag') + link_to('http://example.com/path/two')
+      assert_equal expected, filter_output_string
+    end
+
+    it "does a multi pass with context[:fix_urls]" do
+      @body = link_to('http://example.com/path')
+      context[:fix_urls] = ->(url) do
+        url.sub! 'example.com', 'example.org'
+        url.sub! '/Fixed', '/fixed'
+        url
+      end
+      context[:redirections] = { '/path' => '/Fixed' }
+      assert_equal link_to('http://example.org/fixed'), filter_output_string
+    end
+  end
 end