Browse Source

Simplify file scraper setup; scrape files in the "docs/[slug]" directory

Thibaut Courouble 7 years ago
parent
commit
bf003669ba

+ 1 - 0
.gitignore

@@ -8,3 +8,4 @@ public/fonts
 public/docs/**/*
 !public/docs/docs.json
 !public/docs/**/index.json
+/docs/

+ 1 - 0
lib/docs.rb

@@ -29,6 +29,7 @@ module Docs
   self.rescue_errors = false
 
   class DocNotFound < NameError; end
+  class SetupError < StandardError; end
 
   def self.all
     Dir["#{root_path}/docs/scrapers/**/*.rb"].

+ 6 - 0
lib/docs/core/doc.rb

@@ -95,6 +95,9 @@ module Docs
             false
           end
         end
+      rescue Docs::SetupError => error
+        puts "ERROR: #{error.message}"
+        false
       end
 
       def store_pages(store)
@@ -118,6 +121,9 @@ module Docs
             false
           end
         end
+      rescue Docs::SetupError => error
+        puts "ERROR: #{error.message}"
+        false
       end
 
       private

+ 18 - 7
lib/docs/core/scrapers/file_scraper.rb

@@ -1,14 +1,13 @@
 module Docs
   class FileScraper < Scraper
+    SOURCE_DIRECTORY = File.expand_path '../../../../../docs', __FILE__
+
     Response = Struct.new :body, :url
 
     class << self
-      attr_accessor :dir
-
       def inherited(subclass)
         super
         subclass.base_url = base_url
-        subclass.dir = dir
       end
     end
 
@@ -16,13 +15,25 @@ module Docs
 
     html_filters.push 'clean_local_urls'
 
+    def source_directory
+      @source_directory ||= File.join(SOURCE_DIRECTORY, self.class.path)
+    end
+
     private
 
+    def assert_source_directory_exists
+      unless Dir.exists?(source_directory)
+        raise SetupError, "The #{self.class.name} scraper requires the original documentation files to be stored in the \"#{source_directory}\" directory."
+      end
+    end
+
     def request_one(url)
-      Response.new read_file(file_path_for(url)), URL.parse(url)
+      assert_source_directory_exists
+      Response.new read_file(url_to_path(url)), URL.parse(url)
     end
 
     def request_all(urls)
+      assert_source_directory_exists
       queue = [urls].flatten
       until queue.empty?
         result = yield request_one(queue.shift)
@@ -34,12 +45,12 @@ module Docs
       response.body.present?
     end
 
-    def file_path_for(url)
-      File.join self.class.dir, url.remove(base_url.to_s)
+    def url_to_path(url)
+      url.remove(base_url.to_s)
     end
 
     def read_file(path)
-      File.read(path)
+      File.read(File.join(source_directory, path))
     rescue
       instrument 'warn.doc', msg: "Failed to open file: #{path}"
       nil

+ 0 - 1
lib/docs/scrapers/c.rb

@@ -1,7 +1,6 @@
 module Docs
   class C < FileScraper
     self.type = 'c'
-    self.dir = '/Users/Thibaut/DevDocs/Docs/c'
     self.base_url = 'http://en.cppreference.com/w/c/'
     self.root_path = 'header.html'
 

+ 0 - 1
lib/docs/scrapers/cpp.rb

@@ -3,7 +3,6 @@ module Docs
     self.name = 'C++'
     self.slug = 'cpp'
     self.type = 'c'
-    self.dir = '/Users/Thibaut/DevDocs/Docs/cpp'
     self.base_url = 'http://en.cppreference.com/w/cpp/'
     self.root_path = 'header.html'
 

+ 0 - 2
lib/docs/scrapers/dart.rb

@@ -24,13 +24,11 @@ module Docs
 
     version '2' do
       self.release = '2.0.0'
-      self.dir = '/Users/Thibaut/DevDocs/Docs/Dart2'
       self.base_url = "https://api.dartlang.org/stable/#{release}/"
     end
 
     version '1' do
       self.release = '1.24.3'
-      self.dir = '/Users/Thibaut/DevDocs/Docs/Dart1'
       self.base_url = "https://api.dartlang.org/stable/#{release}/"
     end
   end

+ 0 - 6
lib/docs/scrapers/django.rb

@@ -36,37 +36,31 @@ module Docs
 
     version '2.1' do
       self.release = '2.1.0'
-      self.dir = '/Users/Thibaut/DevDocs/Docs/Django21'
       self.base_url = 'https://docs.djangoproject.com/en/2.1/'
     end
 
     version '2.0' do
       self.release = '2.0.7'
-      self.dir = '/Users/Thibaut/DevDocs/Docs/Django20'
       self.base_url = 'https://docs.djangoproject.com/en/2.0/'
     end
 
     version '1.11' do
       self.release = '1.11.9'
-      self.dir = '/Users/Thibaut/DevDocs/Docs/Django111'
       self.base_url = 'https://docs.djangoproject.com/en/1.11/'
     end
 
     version '1.10' do
       self.release = '1.10.8'
-      self.dir = '/Users/Thibaut/DevDocs/Docs/Django110'
       self.base_url = 'https://docs.djangoproject.com/en/1.10/'
     end
 
     version '1.9' do
       self.release = '1.9.13'
-      self.dir = '/Users/Thibaut/DevDocs/Docs/Django19'
       self.base_url = 'https://docs.djangoproject.com/en/1.9/'
     end
 
     version '1.8' do
       self.release = '1.8.18'
-      self.dir = '/Users/Thibaut/DevDocs/Docs/Django18'
       self.base_url = 'https://docs.djangoproject.com/en/1.8/'
     end
   end

+ 0 - 4
lib/docs/scrapers/erlang.rb

@@ -42,22 +42,18 @@ module Docs
 
     version '21' do
       self.release = '21.0'
-      self.dir = '/Users/Thibaut/DevDocs/Docs/Erlang21'
     end
 
     version '20' do
       self.release = '20.3'
-      self.dir = '/Users/Thibaut/DevDocs/Docs/Erlang20'
     end
 
     version '19' do
       self.release = '19.3'
-      self.dir = '/Users/Thibaut/DevDocs/Docs/Erlang19'
     end
 
     version '18' do
       self.release = '18.3'
-      self.dir = '/Users/Thibaut/DevDocs/Docs/Erlang18'
     end
   end
 end

+ 0 - 8
lib/docs/scrapers/gnu/gcc.rb

@@ -48,13 +48,11 @@ module Docs
 
     version '7' do
       self.release = '7.3.0'
-      self.dir = '/Users/Thibaut/DevDocs/Docs/gcc7'
       self.base_url = "https://gcc.gnu.org/onlinedocs/gcc-#{release}/gcc/"
     end
 
     version '7 CPP' do
       self.release = '7.3.0'
-      self.dir = '/Users/Thibaut/DevDocs/Docs/gcpp7'
       self.base_url = "https://gcc.gnu.org/onlinedocs/gcc-#{release}/cpp/"
 
       options[:replace_paths] = CPP_PATHS
@@ -62,7 +60,6 @@ module Docs
 
     version '6' do
       self.release = '6.4.0'
-      self.dir = '/Users/Thibaut/DevDocs/Docs/gcc6'
       self.base_url = "https://gcc.gnu.org/onlinedocs/gcc-#{release}/gcc/"
 
       options[:root_title] = 'Using the GNU Compiler Collection (GCC)'
@@ -70,7 +67,6 @@ module Docs
 
     version '6 CPP' do
       self.release = '6.4.0'
-      self.dir = '/Users/Thibaut/DevDocs/Docs/gcpp6'
       self.base_url = "https://gcc.gnu.org/onlinedocs/gcc-#{release}/cpp/"
 
       options[:replace_paths] = CPP_PATHS
@@ -78,7 +74,6 @@ module Docs
 
     version '5' do
       self.release = '5.4.0'
-      self.dir = '/Users/Thibaut/DevDocs/Docs/gcc5'
       self.base_url = "https://gcc.gnu.org/onlinedocs/gcc-#{release}/gcc/"
 
       options[:root_title] = 'Using the GNU Compiler Collection (GCC)'
@@ -86,7 +81,6 @@ module Docs
 
     version '5 CPP' do
       self.release = '5.4.0'
-      self.dir = '/Users/Thibaut/DevDocs/Docs/gcpp5'
       self.base_url = "https://gcc.gnu.org/onlinedocs/gcc-#{release}/cpp/"
 
       options[:replace_paths] = CPP_PATHS
@@ -94,7 +88,6 @@ module Docs
 
     version '4' do
       self.release = '4.9.3'
-      self.dir = '/Users/Thibaut/DevDocs/Docs/gcc4'
       self.base_url = "https://gcc.gnu.org/onlinedocs/gcc-#{release}/gcc/"
 
       options[:root_title] = 'Using the GNU Compiler Collection (GCC)'
@@ -102,7 +95,6 @@ module Docs
 
     version '4 CPP' do
       self.release = '4.9.3'
-      self.dir = '/Users/Thibaut/DevDocs/Docs/gcpp4'
       self.base_url = "https://gcc.gnu.org/onlinedocs/gcc-#{release}/cpp/"
 
       options[:replace_paths] = CPP_PATHS

+ 0 - 4
lib/docs/scrapers/gnu/gnu_fortran.rb

@@ -8,25 +8,21 @@ module Docs
 
     version '7' do
       self.release = '7.3.0'
-      self.dir = '/Users/Thibaut/DevDocs/Docs/gfortran7'
       self.base_url = "https://gcc.gnu.org/onlinedocs/gcc-#{release}/gfortran/"
     end
 
     version '6' do
       self.release = '6.4.0'
-      self.dir = '/Users/Thibaut/DevDocs/Docs/gfortran6'
       self.base_url = "https://gcc.gnu.org/onlinedocs/gcc-#{release}/gfortran/"
     end
 
     version '5' do
       self.release = '5.4.0'
-      self.dir = '/Users/Thibaut/DevDocs/Docs/gfortran5'
       self.base_url = "https://gcc.gnu.org/onlinedocs/gcc-#{release}/gfortran/"
     end
 
     version '4' do
       self.release = '4.9.3'
-      self.dir = '/Users/Thibaut/DevDocs/Docs/gfortran4'
       self.base_url = "https://gcc.gnu.org/onlinedocs/gcc-#{release}/gfortran/"
     end
   end

+ 0 - 1
lib/docs/scrapers/nokogiri2.rb

@@ -3,7 +3,6 @@ module Docs
     self.name = 'Nokogiri'
     self.slug = 'nokogiri'
     self.release = '1.8.1'
-    self.dir = '/Users/Thibaut/DevDocs/Docs/RDoc/Nokogiri'
 
     html_filters.replace 'rdoc/entries', 'nokogiri2/entries'
 

+ 0 - 1
lib/docs/scrapers/numpy.rb

@@ -2,7 +2,6 @@ module Docs
   class Numpy < FileScraper
     self.name = 'NumPy'
     self.type = 'sphinx'
-    self.dir = '/Users/Thibaut/DevDocs/Docs/numpy/reference/'
     self.root_path = 'index.html'
     self.links = {
       home: 'http://www.numpy.org/',

+ 2 - 3
lib/docs/scrapers/openjdk.rb

@@ -1,11 +1,10 @@
 module Docs
   class Openjdk < FileScraper
+    # Downloaded from packages.debian.org/sid/openjdk-8-doc
+    # Extracting subdirectory /usr/share/doc/openjdk-8-jre-headless/api
     self.name = 'OpenJDK'
     self.type = 'openjdk'
     self.root_path = 'overview-summary.html'
-    # Downloaded from packages.debian.org/sid/openjdk-8-doc
-    # Extracting subdirectory /usr/share/doc/openjdk-8-jre-headless/api
-    self.dir = '/Users/Thibaut/DevDocs/Docs/OpenJDK'
 
     html_filters.insert_after 'internal_urls', 'openjdk/clean_urls'
     html_filters.push 'openjdk/entries', 'openjdk/clean_html'

+ 0 - 1
lib/docs/scrapers/perl.rb

@@ -2,7 +2,6 @@ module Docs
   class Perl < FileScraper
     self.name = 'Perl'
     self.type = 'perl'
-    self.dir = '/Users/Thibaut/DevDocs/Docs/Perl'
     self.root_path = 'index.html'
     self.links = {
       home: 'https://www.perl.org/'

+ 2 - 3
lib/docs/scrapers/php.rb

@@ -1,5 +1,7 @@
 module Docs
   class Php < FileScraper
+    # Downloaded from php.net/download-docs.php
+
     include FixInternalUrlsBehavior
 
     self.name = 'PHP'
@@ -23,9 +25,6 @@ module Docs
       code: 'https://git.php.net/?p=php-src.git;a=summary'
     }
 
-    # Downloaded from php.net/download-docs.php
-    self.dir = '/Users/Thibaut/DevDocs/Docs/PHP'
-
     html_filters.push 'php/internal_urls', 'php/entries', 'php/clean_html', 'title'
     text_filters.push 'php/fix_urls'
 

+ 4 - 8
lib/docs/scrapers/python.rb

@@ -23,33 +23,29 @@ module Docs
       Licensed under the PSF License.
     HTML
 
-    version '3.7' do
+    version '3.7' do # docs.python.org/3.7/download.html
       self.release = '3.7.0'
-      self.dir = '/Users/Thibaut/DevDocs/Docs/Python37' # docs.python.org/3.7/download.html
       self.base_url = 'https://docs.python.org/3.7/'
 
       html_filters.push 'python/entries_v3', 'sphinx/clean_html', 'python/clean_html'
     end
 
-    version '3.6' do
+    version '3.6' do # docs.python.org/3.6/download.html
       self.release = '3.6.6'
-      self.dir = '/Users/Thibaut/DevDocs/Docs/Python36' # docs.python.org/3.6/download.html
       self.base_url = 'https://docs.python.org/3.6/'
 
       html_filters.push 'python/entries_v3', 'sphinx/clean_html', 'python/clean_html'
     end
 
-    version '3.5' do
+    version '3.5' do # docs.python.org/3.5/download.html
       self.release = '3.5.3'
-      self.dir = '/Users/Thibaut/DevDocs/Docs/Python35' # docs.python.org/3.5/download.html
       self.base_url = 'https://docs.python.org/3.5/'
 
       html_filters.push 'python/entries_v3', 'sphinx/clean_html', 'python/clean_html'
     end
 
-    version '2.7' do
+    version '2.7' do # docs.python.org/2.7/download.html
       self.release = '2.7.13'
-      self.dir = '/Users/Thibaut/DevDocs/Docs/Python27' # docs.python.org/2.7/download.html
       self.base_url = 'https://docs.python.org/2.7/'
 
       html_filters.push 'python/entries_v2', 'sphinx/clean_html', 'python/clean_html'

+ 1 - 1
lib/docs/scrapers/rdoc/minitest.rb

@@ -1,9 +1,9 @@
 module Docs
   class Minitest < Rdoc
+    # Run "rake docs" in the gem directory
     self.name = 'Ruby / Minitest'
     self.slug = 'minitest'
     self.release = '5.10.3'
-    self.dir = '/Users/Thibaut/DevDocs/Docs/RDoc/Minitest' # rake docs
     self.links = {
       code: 'https://github.com/seattlerb/minitest'
     }

+ 0 - 1
lib/docs/scrapers/rdoc/rails.rb

@@ -4,7 +4,6 @@ module Docs
 
     self.name = 'Ruby on Rails'
     self.slug = 'rails'
-    self.dir = '/Users/Thibaut/DevDocs/Docs/RDoc/Rails'
     self.initial_paths = %w(guides/index.html)
     self.links = {
       home: 'http://rubyonrails.org/',

+ 0 - 4
lib/docs/scrapers/rdoc/ruby.rb

@@ -78,22 +78,18 @@ module Docs
 
     version '2.5' do
       self.release = '2.5.0'
-      self.dir = '/Users/Thibaut/DevDocs/Docs/RDoc/Ruby25'
     end
 
     version '2.4' do
       self.release = '2.4.3'
-      self.dir = '/Users/Thibaut/DevDocs/Docs/RDoc/Ruby24'
     end
 
     version '2.3' do
       self.release = '2.3.6'
-      self.dir = '/Users/Thibaut/DevDocs/Docs/RDoc/Ruby23'
     end
 
     version '2.2' do
       self.release = '2.2.9'
-      self.dir = '/Users/Thibaut/DevDocs/Docs/RDoc/Ruby22'
     end
   end
 end

+ 0 - 1
lib/docs/scrapers/sqlite.rb

@@ -3,7 +3,6 @@ module Docs
     self.name = 'SQLite'
     self.type = 'sqlite'
     self.release = '3.25.2'
-    self.dir = '/Users/Thibaut/DevDocs/Docs/sqlite/'
     self.base_url = 'https://sqlite.org/'
     self.root_path = 'docs.html'
     self.initial_paths = %w(keyword_index.html)

+ 89 - 47
test/lib/docs/core/scrapers/file_scraper_test.rb

@@ -2,16 +2,23 @@ require 'test_helper'
 require 'docs'
 
 class FileScraperTest < MiniTest::Spec
+  ROOT_PATH = File.expand_path('../../../../../../', __FILE__)
+
   class Scraper < Docs::FileScraper
-    self.dir = '/'
     self.html_filters = Docs::FilterStack.new
     self.text_filters = Docs::FilterStack.new
+
+    version 'version' do; end
   end
 
   let :scraper do
     Scraper.new
   end
 
+  let :versioned_scraper do
+    Scraper.versions.first.new
+  end
+
   let :response do
     OpenStruct.new body: 'body', url: Docs::URL.parse(Scraper.base_url)
   end
@@ -22,9 +29,16 @@ class FileScraperTest < MiniTest::Spec
     end
   end
 
+  describe "#source_directory" do
+    it "returns the directory at docs/[slug]" do
+      assert_equal File.join(ROOT_PATH, 'docs', 'scraper'), scraper.source_directory
+      assert_equal File.join(ROOT_PATH, 'docs', 'scraper~version'), versioned_scraper.source_directory
+    end
+  end
+
   describe "#request_one" do
     let :path do
-      File.join(Scraper.dir, 'path')
+      'path'
     end
 
     let :result do
@@ -35,20 +49,34 @@ class FileScraperTest < MiniTest::Spec
       stub(scraper).read_file
     end
 
-    it "reads a file" do
-      mock(scraper).read_file(path)
-      result
+    context "when the source directory doesn't exist" do
+      it "raises an error" do
+        assert_raises Docs::SetupError do
+          result
+        end
+      end
     end
 
-    describe "the returned response object" do
-      it "has a #body" do
-        stub(scraper).read_file { 'body' }
-        assert_equal 'body', result.body
+    context "when the source directory exists" do
+      before do
+        stub(scraper).assert_source_directory_exists
+      end
+
+      it "reads a file" do
+        mock(scraper).read_file(path)
+        result
       end
 
-      it "has a #url" do
-        assert_equal path, result.url.to_s
-        assert_instance_of Docs::URL, result.url
+      describe "the returned response object" do
+        it "has a #body" do
+          stub(scraper).read_file { 'body' }
+          assert_equal 'body', result.body
+        end
+
+        it "has a #url" do
+          assert_equal path, result.url.to_s
+          assert_instance_of Docs::URL, result.url
+        end
       end
     end
   end
@@ -58,49 +86,63 @@ class FileScraperTest < MiniTest::Spec
       %w(one two)
     end
 
-    it "requests the given url" do
-      mock(scraper).request_one('url')
-      scraper.send(:request_all, 'url') {}
-    end
-
-    it "requests the given urls" do
-      requests = []
-      stub(scraper).request_one { |url| requests << url; nil }
-      scraper.send(:request_all, urls) {}
-      assert_equal urls, requests
-    end
-
-    it "yields the responses" do
-      responses = []
-      stub(scraper).request_one { |url| urls.index(url) }
-      scraper.send(:request_all, urls) { |response| responses << response; nil }
-      assert_equal (0...urls.length).to_a, responses
+    context "when the source directory doesn't exist" do
+      it "raises an error" do
+        assert_raises Docs::SetupError do
+          scraper.send(:request_all, urls) {}
+        end
+      end
     end
 
-    context "when the block returns an array" do
-      let :next_urls do
-        %w(three four)
+    context "when the source directory exists" do
+      before do
+        stub(scraper).assert_source_directory_exists
       end
 
-      let :all_urls do
-        urls + %w(three four)
+      it "requests the given url" do
+        mock(scraper).request_one('url')
+        scraper.send(:request_all, 'url') {}
       end
 
-      it "requests the returned urls" do
+      it "requests the given urls" do
         requests = []
-        stub(scraper).request_one { |url| requests << url; url }
-        scraper.send(:request_all, urls) { [next_urls.shift].compact }
-        assert_equal all_urls, requests
+        stub(scraper).request_one { |url| requests << url; nil }
+        scraper.send(:request_all, urls) {}
+        assert_equal urls, requests
       end
 
-      it "yields their responses" do
+      it "yields the responses" do
         responses = []
-        stub(scraper).request_one { |url| all_urls.index(url) }
-        scraper.send :request_all, urls do |response|
-          responses << response
-          [next_urls.shift].compact
+        stub(scraper).request_one { |url| urls.index(url) }
+        scraper.send(:request_all, urls) { |response| responses << response; nil }
+        assert_equal (0...urls.length).to_a, responses
+      end
+
+      context "when the block returns an array" do
+        let :next_urls do
+          %w(three four)
+        end
+
+        let :all_urls do
+          urls + %w(three four)
+        end
+
+        it "requests the returned urls" do
+          requests = []
+          stub(scraper).request_one { |url| requests << url; url }
+          scraper.send(:request_all, urls) { [next_urls.shift].compact }
+          assert_equal all_urls, requests
+        end
+
+        it "yields their responses" do
+          responses = []
+          stub(scraper).request_one { |url| all_urls.index(url) }
+          scraper.send :request_all, urls do |response|
+            responses << response
+            [next_urls.shift].compact
+          end
+          assert_equal (0...all_urls.length).to_a, responses
         end
-        assert_equal (0...all_urls.length).to_a, responses
       end
     end
   end
@@ -126,13 +168,13 @@ class FileScraperTest < MiniTest::Spec
       scraper.send :read_file, 'file'
     end
 
-    it "returns the file's content when the file exists" do
-      stub(File).read('file') { 'content' }
+    it "returns the file's content when the file exists in the source directory" do
+      stub(File).read(File.join(ROOT_PATH, 'docs', 'scraper', 'file')) { 'content' }
       assert_equal 'content', result
     end
 
     it "returns nil when the file doesn't exist" do
-      stub(File).read('file') { raise }
+      stub(File).read(File.join(ROOT_PATH, 'docs', 'scraper', 'file')) { raise }
       assert_nil result
     end
   end