Bladeren bron

Refactor scrapers with multiple base URLs

Thibaut Courouble 9 jaar geleden
bovenliggende
commit
46a9ed16f6
4 gewijzigde bestanden met 53 toevoegingen en 64 verwijderingen
  1. + 2 - 4
      lib/docs/core/scraper.rb
  2. + 46 - 0
      lib/docs/core/scrapers/url_scraper.rb
  3. + 2 - 30
      lib/docs/scrapers/ember.rb
  4. + 3 - 30
      lib/docs/scrapers/meteor.rb

+ 2 - 4
lib/docs/core/scraper.rb

@@ -3,7 +3,7 @@ require 'set'
 module Docs
   class Scraper < Doc
     class << self
-      attr_accessor :base_url, :root_path, :initial_paths, :initial_urls, :options, :html_filters, :text_filters, :stubs
+      attr_accessor :base_url, :root_path, :initial_paths, :options, :html_filters, :text_filters, :stubs
 
       def inherited(subclass)
         super
@@ -16,7 +16,6 @@ module Docs
         subclass.base_url = base_url
         subclass.root_path = root_path
         subclass.initial_paths = initial_paths.dup
-        subclass.initial_urls = initial_urls.dup
         subclass.options = options.deep_dup
         subclass.html_filters = html_filters.inheritable_copy
         subclass.text_filters = text_filters.inheritable_copy
@@ -36,7 +35,6 @@ module Docs
     include Instrumentable
 
     self.initial_paths = []
-    self.initial_urls = []
     self.options = {}
     self.stubs = {}
 
@@ -105,7 +103,7 @@ module Docs
     end
 
     def initial_urls
-      @initial_urls ||= [root_url.to_s].concat(self.class.initial_urls).concat(initial_paths.map(&method(:url_for))).freeze
+      @initial_urls ||= [root_url.to_s].concat(initial_paths.map(&method(:url_for))).freeze
     end
 
     def pipeline

+ 46 - 0
lib/docs/core/scrapers/url_scraper.rb

@@ -50,6 +50,52 @@ module Docs
       Capybara
     end
 
+    module MultipleBaseUrls
+      def self.included(base)
+        base.extend ClassMethods
+      end
+
+      module ClassMethods
+        attr_reader :base_urls
+
+        def base_urls=(urls)
+          self.base_url = urls.first
+          @base_urls = urls
+        end
+      end
+
+      def initial_urls
+        super + self.class.base_urls[1..-1]
+      end
+
+      def base_urls
+        @base_urls ||= self.class.base_urls.map { |url| URL.parse(url) }
+      end
+
+      private
+
+      def process_url?(url)
+        base_urls.any? { |base_url| base_url.contains?(url) }
+      end
+
+      def process_response(response)
+        original_scheme = self.base_url.scheme
+        original_host = self.base_url.host
+        original_path = self.base_url.path
+
+        effective_base_url = self.base_urls.find { |base_url| base_url.contains?(response.effective_url) }
+
+        self.base_url.scheme = effective_base_url.scheme
+        self.base_url.host = effective_base_url.host
+        self.base_url.path = effective_base_url.path
+        super
+      ensure
+        self.base_url.scheme = original_scheme
+        self.base_url.host = original_host
+        self.base_url.path = original_path
+      end
+    end
+
     module FixRedirectionsBehavior
       def self.included(base)
         base.extend ClassMethods

+ 2 - 30
lib/docs/scrapers/ember.rb

@@ -1,16 +1,12 @@
 module Docs
   class Ember < UrlScraper
-    class << self
-      attr_accessor :guide_url
-    end
+    include MultipleBaseUrls
 
     self.name = 'Ember.js'
     self.slug = 'ember'
     self.type = 'ember'
     self.release = '2.7.0'
-    self.base_url = 'http://emberjs.com/api/'
-    self.guide_url = "https://guides.emberjs.com/v#{self.release}/"
-    self.initial_urls = [guide_url]
+    self.base_urls = ['http://emberjs.com/api/', "https://guides.emberjs.com/v#{self.release}/"]
     self.links = {
       home: 'http://emberjs.com/',
       code: 'https://github.com/emberjs/ember.js'
@@ -39,29 +35,5 @@ module Docs
       &copy; 2016 Yehuda Katz, Tom Dale and Ember.js contributors<br>
       Licensed under the MIT License.
     HTML
-
-    def guide_url
-      @guide_url ||= URL.parse(self.class.guide_url)
-    end
-
-    private
-
-    def process_url?(url)
-      base_url.contains?(url) || guide_url.contains?(url)
-    end
-
-    def process_response(response)
-      original_scheme = @base_url.scheme
-      original_host = @base_url.host
-      original_path = @base_url.path
-      @base_url.scheme = response.effective_url.scheme
-      @base_url.host = response.effective_url.host
-      @base_url.path = response.effective_url.path[/\A\/v[\d\.]+\//, 0] || '/api/'
-      super
-    ensure
-      @base_url.scheme = original_scheme
-      @base_url.host = original_host
-      @base_url.path = original_path
-    end
   end
 end

+ 3 - 30
lib/docs/scrapers/meteor.rb

@@ -1,8 +1,6 @@
 module Docs
   class Meteor < UrlScraper
-    class << self
-      attr_accessor :guide_url
-    end
+    include MultipleBaseUrls
 
     self.type = 'meteor'
     self.root_path = 'index.html'
@@ -28,37 +26,12 @@ module Docs
 
     version '1.4' do
       self.release = '1.4.0'
-      self.base_url = 'https://docs.meteor.com/'
-      self.guide_url = 'https://guide.meteor.com/'
-      self.initial_urls = [guide_url]
+      self.base_urls = ['https://docs.meteor.com/', 'https://guide.meteor.com/']
     end
 
     version '1.3' do
       self.release = '1.3.5'
-      self.base_url = "https://docs.meteor.com/v#{self.release}/"
-      self.guide_url = 'https://guide.meteor.com/v1.3/'
-      self.initial_urls = [guide_url]
-    end
-
-    def guide_url
-      @guide_url ||= URL.parse(self.class.guide_url)
-    end
-
-    private
-
-    def process_url?(url)
-      base_url.contains?(url) || guide_url.contains?(url)
-    end
-
-    def process_response(response)
-      original_host = @base_url.host
-      original_path = @base_url.path
-      @base_url.host = response.effective_url.host
-      @base_url.path = response.effective_url.path[/\A\/v[\d\.]+\//, 0] || '/'
-      super
-    ensure
-      @base_url.host = original_host
-      @base_url.path = original_path
+      self.base_urls = ["https://docs.meteor.com/v#{self.release}/", 'https://guide.meteor.com/v1.3/']
     end
   end
 end