瀏覽代碼

Update and improve Sphinx scrapers

Thibaut Courouble 9 年之前
父節點
當前提交
562463b112

+ 1 - 0
assets/javascripts/views/pages/simple.coffee

@@ -36,6 +36,7 @@ app.views.RethinkdbPage =
 app.views.RubydocPage =
 app.views.SinonPage =
 app.views.SocketioPage =
+app.views.SphinxPage =
 app.views.SphinxSimplePage =
 app.views.TensorflowPage =
 app.views.TypescriptPage =

+ 0 - 8
assets/javascripts/views/pages/sphinx.coffee

@@ -1,8 +0,0 @@
-#= require views/pages/base
-
-class app.views.SphinxPage extends app.views.BasePage
-  prepare: ->
-    @highlightCode @findAll('pre.python'), 'python'
-    @highlightCode @findAll('pre.markup'), 'markup'
-    @highlightCode @findAll('pre.php'), 'php'
-    return

+ 16 - 9
assets/stylesheets/pages/_sphinx.scss

@@ -2,15 +2,24 @@
   h2, h3 { @extend %block-heading; }
   h4 { font-size: 1em; }
   > dl:not(.docutils) > dt { @extend %block-label, %label-blue; }
-  dl > dl > dt { @extend %block-label; }
+  dd > dl:not(.docutils) > dt { @extend %block-label; }
   dt + dt { margin-top: -.5em; }
 
-  .note, .admonition, .versionadded, .versionchanged, .deprecated-removed { @extend %note; }
+  .note, .admonition, div.versionadded, div.versionchanged, .deprecated-removed, .deprecated { @extend %note; }
+
   .important { @extend %note-orange; }
-  .warning, .deprecated-removed { @extend %note-red; }
-  .versionmodified { font-weight: bold; }
+  .warning, .deprecated-removed, .deprecated { @extend %note-red; }
+
+  .versionmodified, span.title {
+    display: block;
+    font-weight: bold;
+  }
+
+  p > code, li > code, dd > code, .docutils > dt > code { @extend %label; }
 
-  p > code, li > code, dd > code { @extend %label; }
+  ul.simple { margin: 1em 0; }
+
+  h2 > a, h3 > a, dt[id] > a.external { float: right; }
 
   .admonition-title {
     float: left;
@@ -20,15 +29,13 @@
     &:after { content: ':'; }
   }
 
-  .admonition > dl {
+  .admonition > dl, .admonition > ul {
     clear: left;
     margin: 0;
   }
   .admonition-title + dl { padding-top: .5em; }
 
-  ul.simple { margin: 1em 0; }
-
-  h2 > a, h3 > a, dt[id] > a.external { float: right; }
+  td > div { margin: 0 !important; }
 }
 
 ._sphinx {

+ 0 - 8
lib/docs/filters/ansible/clean_html.rb

@@ -4,14 +4,6 @@ module Docs
       def call
         @doc = at_css('#page-content')
 
-        css('blockquote > div > pre:first-child:last-child', 'blockquote > div > ul:first-child:last-child').each do |node|
-          node.ancestors('blockquote').first.before(node).remove
-        end
-
-        css('a > em').each do |node|
-          node.before(node.children).remove
-        end
-
         doc
       end
     end

+ 0 - 34
lib/docs/filters/cmake/clean_html.rb

@@ -2,8 +2,6 @@ module Docs
   class Cmake
     class CleanHtmlFilter < Filter
       def call
-        css('.headerlink', '#contents .topic-title').remove
-
         if root_page?
           css('#release-notes', '#index-and-search').remove
 
@@ -12,38 +10,6 @@ module Docs
           end
         end
 
-        css('.contents > ul.simple > li:first-child:last-child').each do |node|
-          node.parent.before(node.at_css('> ul'))
-          node.remove
-        end
-
-        css('.toc-backref', '.toctree-wrapper', '.contents', 'span.pre', 'pre a > code').each do |node|
-          node.before(node.children).remove
-        end
-
-        css('div[class*="highlight-"]').each do |node|
-          pre = node.at_css('pre')
-          pre.content = pre.content
-          node.replace(pre)
-        end
-
-        css('span[id]:empty').each do |node|
-          node.next_element['id'] = node['id']
-          node.remove
-        end
-
-        css('.section').each do |node|
-          if node['id']
-            if node.first_element_child['id']
-              node.element_children[1]['id'] = node['id']
-            else
-              node.first_element_child['id'] = node['id']
-            end
-          end
-
-          node.before(node.children).remove
-        end
-
         doc
       end
     end

+ 0 - 31
lib/docs/filters/codeigniter/clean_html.rb

@@ -1,31 +0,0 @@
-module Docs
-  class Codeigniter
-    class CleanHtmlFilter < Filter
-      def call
-        css('.headerlink').remove
-
-        css('h1', 'h2', 'h3', 'h4', 'h5', 'pre').each do |node|
-          node.content = node.content
-        end
-
-        css('div[class^="highlight-"]').each do |node|
-          node.content = node.content.strip
-          node.name = 'pre'
-          node['class'] = 'php' if node['class'].include?('highlight-ci')
-        end
-
-        css('table').each do |node|
-          node.remove_attribute 'border'
-          node.remove_attribute 'cellpadding'
-        end
-
-        css('.section').each do |node|
-          node.first_element_child['id'] = node['id'] if node['id']
-          node.before(node.children).remove
-        end
-
-        doc
-      end
-    end
-  end
-end

+ 3 - 1
lib/docs/filters/codeigniter/entries.rb

@@ -2,7 +2,9 @@ module Docs
   class Codeigniter
     class EntriesFilter < Docs::EntriesFilter
       def get_name
-        at_css('h1').content.strip
+        name = at_css('h1').content.strip
+        name.remove! "\u{00B6}"
+        name
       end
 
       def get_type

+ 0 - 37
lib/docs/filters/django/clean_html.rb

@@ -4,43 +4,6 @@ module Docs
       def call
         @doc = at_css('.yui-g')
 
-        css('.section', 'a > em').each do |node|
-          node.before(node.children).remove
-        end
-
-        css('tt', 'span.pre').each do |node|
-          node.name = 'code'
-          node.content = node.content
-          node.remove_attribute 'class'
-        end
-
-        css('.headerlink').each do |node|
-          id = node['href'][1..-1]
-          node.parent['id'] ||= id
-          doc.at_css("span##{id}").try(:remove)
-          node.remove
-        end
-
-        css('h1', 'h2', 'h3', 'dt').each do |node|
-          links = node.css('a').remove
-          node.content = node.content
-          node << links
-        end
-
-        css('div[class^="highlight-"]').each do |node|
-          node.name = 'pre'
-          node['class'] = node['data-language'] = case node['class']
-            when 'highlight-python', 'highlight-default' then 'python'
-            when 'highlight-html+django' then 'markup'
-            else ''
-          end
-          node.content = node.at_css('pre').content
-        end
-
-        css('code > code').each do |node|
-          node.before(node.children).remove
-        end
-
         doc
       end
     end

+ 0 - 67
lib/docs/filters/matplotlib/clean_html.rb

@@ -1,67 +0,0 @@
-module Docs
-  class Matplotlib
-    class CleanHtmlFilter < Filter
-      def call
-        css('.headerlink', 'hr').remove
-
-        css('.toc-backref', '.toctree-wrapper', '.contents', 'span.pre', 'pre a > code').each do |node|
-          node.before(node.children).remove
-        end
-
-        css('div[class*="highlight-"]').each do |node|
-          pre = node.at_css('pre')
-          pre.content = pre.content
-          pre['data-language'] = node['class'][/highlight\-(\w+)/, 1]
-          node.replace(pre)
-        end
-
-        css('span[id]:empty').each do |node|
-          node.next_element['id'] = node['id']
-          node.remove
-        end
-
-        css('.section').each do |node|
-          if node['id']
-            if node.first_element_child['id']
-              node.element_children[1]['id'] = node['id']
-            else
-              node.first_element_child['id'] = node['id']
-            end
-          end
-
-          node.before(node.children).remove
-        end
-
-        css('h2 > a > code').each do |node|
-          node.parent.before(node.content).remove
-        end
-
-        css('dt[id]').each do |node|
-          node.inner_html = "<code>#{node.content.strip}</code>"
-        end
-
-        css('li > p:first-child:last-child').each do |node|
-          node.before(node.children).remove
-        end
-
-        css('table[border]').each do |node|
-          node.remove_attribute 'border'
-        end
-
-        css('code[class]').each do |node|
-          node.remove_attribute 'class'
-        end
-
-        css('h1').each do |node|
-          node.content = node.content
-        end
-
-        css('p.rubric').each do |node|
-          node.name = 'h4'
-        end
-
-        doc
-      end
-    end
-  end
-end

+ 0 - 54
lib/docs/filters/numpy/clean_html.rb

@@ -4,60 +4,6 @@ module Docs
       def call
         @doc = at_css('#spc-section-body')
 
-        css('colgroup').remove
-
-        css('.section', 'a > em', 'dt > tt', 'dt > em', 'dt > big', 'tbody').each do |node|
-          node.before(node.children).remove
-        end
-
-        css('.headerlink').each do |node|
-          id = node['href'][1..-1]
-          node.parent['id'] ||= id
-          doc.at_css("span##{id}").try(:remove)
-          node.remove
-        end
-
-        css('tt', 'span.pre').each do |node|
-          node.name = 'code'
-          node.content = node.content
-          node.remove_attribute 'class'
-        end
-
-        css('h1', 'h2', 'h3').each do |node|
-          node.content = node.content
-        end
-
-        css('p.rubric').each do |node|
-          node.name = 'h4'
-        end
-
-        css('blockquote > div:first-child:last-child').each do |node|
-          node.parent.before(node.parent.children).remove
-          node.before(node.children).remove
-        end
-
-        css('.admonition-example').each do |node|
-          title = node.at_css('.admonition-title')
-          title.name = 'h4'
-          title.remove_attribute 'class'
-          node.before(node.children).remove
-        end
-
-        css('em.xref').each do |node|
-          node.name = 'code'
-        end
-
-        css('div[class*="highlight-"]').each do |node|
-          node.content = node.content.strip
-          node.name = 'pre'
-          node['data-language'] = node['class'][/highlight\-(\w+)/, 1]
-          node['class'] = node['data-language'] # tmp
-        end
-
-        css('table[border]').each do |node|
-          node.remove_attribute 'border'
-        end
-
         doc
       end
     end

+ 1 - 46
lib/docs/filters/python/clean_html.rb

@@ -4,21 +4,6 @@ module Docs
       def call
         @doc = at_css '.body'
 
-        css('> .section').each do |node|
-          node.before(node.children).remove
-        end
-
-        # Clean inline code elements
-
-        css('tt.literal').each do |node|
-          node.before(node.children).remove
-        end
-
-        css('tt', 'span.pre').each do |node|
-          node.name = 'code'
-          node.remove_attribute 'class'
-        end
-
         root_page? ? root : other
 
         doc
@@ -26,14 +11,9 @@ module Docs
 
       def root
         at_css('h1').content = 'Python'
-        css('> p').remove
       end
 
       def other
-        css('.headerlink', 'hr').remove
-
-        # Clean headings
-
         css('h1').each do |node|
           node.content = node.content.sub!(/\A[\d\.]+/) do |str|
             rgx = /\A#{str}/
@@ -43,32 +23,7 @@ module Docs
         end
 
         css('h2', 'h3', 'h4').each do |node|
-          node.css('a').each do |link|
-            link.before(link.children).remove
-          end
-          node.child.content = node.child.content.remove @levelRegexp
-        end
-
-        css('dt').each do |node|
-          node.content = node.content
-        end
-
-        # Remove blockquotes
-        css('blockquote').each do |node|
-          node.before(node.children).remove
-        end
-
-        # Remove code highlighting
-        css('[class*="highlight-python"]').each do |node|
-          pre = node.at_css('pre')
-          pre.content = pre.content
-          pre['class'] = 'python'
-          node.replace(pre)
-        end
-
-        # Remove <table> border attribute
-        css('table[border]').each do |node|
-          node.remove_attribute 'border'
+          node.inner_html = node.inner_html.remove @levelRegexp
         end
       end
     end

+ 97 - 0
lib/docs/filters/sphinx/clean_html.rb

@@ -0,0 +1,97 @@
+module Docs
+  class Sphinx
+    class CleanHtmlFilter < Filter  # Shared clean-up filter pushed as 'sphinx/clean_html' by the ansible, cmake, codeigniter, django, matplotlib, numpy and python scrapers.
+      def call  # Rewrites the parsed page in place and returns `doc`.
+        css('.headerlink', 'hr', '#contents .topic-title', '#topics .topic-title', 'colgroup').remove  # Drop .headerlink anchors, <hr>, TOC topic titles and <colgroup>.
+
+        css('.contents > ul:first-child:last-child.simple > li:first-child:last-child').each do |node|  # A .contents list holding a single item.
+          node.parent.before(node.at_css('> ul')) if node.at_css('> ul')  # Hoist the item's nested list (when present) in front of the parent list.
+          node.remove
+        end
+
+        css('em.xref', 'tt').each do |node|  # Normalize inline code markup to <code>.
+          node.name = 'code'
+        end
+
+        css('.toc-backref', '.toctree-wrapper', '.contents', 'span.pre', 'pre a > code', 'tbody', 'code > code', 'a > em').each do |node|
+          node.before(node.children).remove  # Unwrap: keep the children, drop the wrapper element.
+        end
+
+        css('div[class*="highlight-"]').each do |node|  # Replace each highlight wrapper with its bare <pre>.
+          pre = node.at_css('pre')
+          pre.content = pre.content  # Re-assigning the text content strips the highlighting markup inside the <pre>.
+          pre['data-language'] = node['class'][/highlight\-(\w+)/, 1]  # Language comes from the wrapper's "highlight-<lang>" class.
+          pre['data-language'] = 'php' if pre['data-language'] == 'ci'
+          pre['data-language'] = 'markup' if pre['data-language'] == 'html+django'  # NOTE(review): \w+ above stops at "+", so "highlight-html+django" is captured as "html" and this branch looks unreachable; confirm.
+          pre['data-language'] = 'python' if pre['data-language'] == 'default' || pre['data-language'].start_with?('python')
+          node.replace(pre)
+        end
+
+        css('span[id]:empty').each do |node|  # Empty <span id> anchors.
+          (node.next_element || node.previous_element)['id'] ||= node['id'] if node.next_element || node.previous_element  # Give the id to the next (else previous) sibling element unless it already has one.
+          node.remove
+        end
+
+        css('.section').each do |node|  # Unwrap .section containers while keeping their ids addressable.
+          if node['id']
+            if node.first_element_child['id']
+              node.element_children[1]['id'] = node['id'] if node.element_children[1]  # First child already has an id: use the second child, if there is one.
+            else
+              node.first_element_child['id'] = node['id']
+            end
+          end
+
+          node.before(node.children).remove
+        end
+
+        css('h2 > a > code').each do |node|
+          node.parent.before(node.content).remove  # Insert the code's content before the enclosing link, then remove the link.
+        end
+
+        css('dt').each do |node|
+          next unless node['id'] || node.at_css('code')  # Only definition terms that have an id or contain code.
+          links = []
+          links << node.children.last.remove while node.children.last.try(:name) == 'a'  # Peel off trailing <a> children, last one first.
+          node.inner_html = "<code>#{node.content.strip}</code> "  # Collapse what remains into a single <code> element.
+          links.reverse_each { |link| node << link }  # Re-append the links in their original order.
+        end
+
+        css('li > p:first-child:last-child').each do |node|  # A <p> that is the only element child of its <li>.
+          node.before(node.children).remove
+        end
+
+        css('blockquote > div:first-child:last-child').each do |node|
+          node.parent.before(node.parent.children).remove  # Unwrap the <blockquote>...
+          node.before(node.children).remove  # ...then unwrap the <div> itself.
+        end
+
+        css('.admonition-example').each do |node|
+          title = node.at_css('.admonition-title')
+          title.name = 'h4'  # The admonition title becomes a plain <h4> heading.
+          title.remove_attribute 'class'
+          node.before(node.children).remove
+        end
+
+        css('table[border]').each do |node|  # Only tables carrying a border attribute are matched.
+          node.remove_attribute 'border'
+          node.remove_attribute 'cellpadding'
+          node.remove_attribute 'cellspacing'
+        end
+
+        css('code[class]').each do |node|
+          node.remove_attribute 'class'
+        end
+
+        css('h1').each do |node|
+          node.content = node.content  # Flatten the heading to plain text.
+        end
+
+        css('p.rubric').each do |node|
+          node.name = 'h4'
+        end
+
+        doc
+      end
+    end
+  end
+end

+ 1 - 1
lib/docs/scrapers/ansible.rb

@@ -9,7 +9,7 @@ module Docs
       code: 'https://github.com/ansible/ansible'
     }
 
-    html_filters.push 'ansible/entries', 'ansible/clean_html', 'codeigniter/clean_html'
+    html_filters.push 'ansible/entries', 'ansible/clean_html', 'sphinx/clean_html'
 
     options[:skip] = %w(
       glossary.html

+ 1 - 1
lib/docs/scrapers/cmake.rb

@@ -7,7 +7,7 @@ module Docs
       code: 'https://cmake.org/gitweb?p=cmake.git;a=summary'
     }
 
-    html_filters.push 'cmake/clean_html', 'cmake/entries', 'title'
+    html_filters.push 'cmake/clean_html', 'sphinx/clean_html', 'cmake/entries', 'title'
 
     options[:container] = '.body'
     options[:title] = false

+ 2 - 2
lib/docs/scrapers/codeigniter.rb

@@ -9,7 +9,7 @@ module Docs
       code: 'https://github.com/bcit-ci/CodeIgniter'
     }
 
-    html_filters.push 'codeigniter/clean_html', 'codeigniter/entries'
+    html_filters.push 'codeigniter/entries', 'sphinx/clean_html'
 
     options[:container] = '.document'
 
@@ -36,7 +36,7 @@ module Docs
     HTML
 
     version '3.0' do
-      self.release = '3.0.4'
+      self.release = '3.0.6'
     end
   end
 end

+ 1 - 1
lib/docs/scrapers/django.rb

@@ -8,7 +8,7 @@ module Docs
       code: 'https://github.com/django/django'
     }
 
-    html_filters.push 'django/entries', 'django/clean_html'
+    html_filters.push 'django/entries', 'sphinx/clean_html', 'django/clean_html'
     text_filters.push 'django/fix_urls'
 
     options[:container] = '#bd'

+ 1 - 1
lib/docs/scrapers/matplotlib.rb

@@ -8,7 +8,7 @@ module Docs
       code: 'https://github.com/matplotlib/matplotlib'
     }
 
-    html_filters.push 'matplotlib/entries', 'matplotlib/clean_html'
+    html_filters.push 'matplotlib/entries', 'sphinx/clean_html'
 
     options[:container] = '.body'
     options[:skip] = %w(api_changes.html)

+ 7 - 2
lib/docs/scrapers/numpy.rb

@@ -9,7 +9,7 @@ module Docs
       code: 'https://github.com/numpy/numpy'
     }
 
-    html_filters.push 'numpy/entries', 'numpy/clean_html'
+    html_filters.push 'numpy/entries', 'numpy/clean_html', 'sphinx/clean_html'
 
     # .main contains more than the page's content alone, but we need something
     # that includes the navigation bar as well in order to guess the type of
@@ -26,8 +26,13 @@ module Docs
       Licensed under the NumPy License.
     HTML
 
+    version '1.11' do
+      self.release = '1.11.0'
+      self.base_url = "https://docs.scipy.org/doc/numpy-#{self.release}/reference/"
+    end
+
     version '1.10' do
-      self.release = '1.10.1'
+      self.release = '1.10.4'
       self.base_url = "https://docs.scipy.org/doc/numpy-#{self.release}/reference/"
     end
   end

+ 4 - 4
lib/docs/scrapers/python.rb

@@ -20,19 +20,19 @@ module Docs
     HTML
 
     version '3.5' do
-      self.release = '3.5.1'
+      self.release = '3.5.2'
       self.dir = '/Users/Thibaut/DevDocs/Docs/Python35' # docs.python.org/3.5/download.html
       self.base_url = 'https://docs.python.org/3.5/'
 
-      html_filters.push 'python/entries_v3', 'python/clean_html'
+      html_filters.push 'python/entries_v3', 'sphinx/clean_html', 'python/clean_html'
     end
 
     version '2.7' do
-      self.release = '2.7.10'
+      self.release = '2.7.12'
       self.dir = '/Users/Thibaut/DevDocs/Docs/Python27' # docs.python.org/2.7/download.html
       self.base_url = 'https://docs.python.org/2.7/'
 
-      html_filters.push 'python/entries_v2', 'python/clean_html'
+      html_filters.push 'python/entries_v2', 'sphinx/clean_html', 'python/clean_html'
     end
   end
 end

+ 5 - 0
lib/docs/scrapers/sphinx.rb

@@ -0,0 +1,5 @@
+module Docs
+  class Sphinx < Scraper  # The class that Sphinx::CleanHtmlFilter ('sphinx/clean_html') is nested under; it defines no documentation of its own.
+    self.abstract = true  # NOTE(review): presumably keeps this class out of the list of scrapable docs; confirm against Scraper.
+  end
+end