瀏覽代碼

Finish NumPy scraper

Thibaut Courouble 9 年之前
父節點
當前提交
6d36b339e0

二進制
assets/images/icons.png


二進制
assets/images/icons@2x.png


+ 1 - 1
assets/javascripts/news.json

@@ -1,7 +1,7 @@
 [
   [
     "2016-04-24",
-    "New documentation: <a href=\"/apache_pig/\">Apache Pig</a>"
+    "New documentations: <a href=\"/numpy/\">NumPy</a> and <a href=\"/apache_pig/\">Apache Pig</a>"
   ], [
     "2016-04-17",
     "New documentation: <a href=\"/perl/\">Perl</a>"

+ 5 - 0
assets/javascripts/templates/pages/about_tmpl.coffee

@@ -335,6 +335,11 @@ credits = [
     'npm, Inc. and Contributors<br>npm is a trademark of npm, Inc.',
     'npm',
     'https://raw.githubusercontent.com/npm/npm/master/LICENSE'
+  ], [
+    'NumPy',
+    '2008-2016 NumPy Developers',
+    'NumPy',
+    'https://raw.githubusercontent.com/numpy/numpy/master/LICENSE.txt'
   ], [
     'OpenTSDB',
     '2010-2016 The OpenTSDB Authors',

+ 1 - 0
assets/stylesheets/global/_icons.scss

@@ -138,3 +138,4 @@
 ._icon-gcc:before           { background-position: -2rem -11rem; }
 ._icon-perl:before          { background-position: -3rem -11rem; }
 ._icon-apache_pig:before    { background-position: -4rem -11rem; }
+._icon-numpy:before         { background-position: -5rem -11rem; }

+ 2 - 0
assets/stylesheets/pages/_sphinx.scss

@@ -24,6 +24,8 @@
   }
 
   ul.simple { margin: 1em 0; }
+
+  dt > a.external { float: right; }
 }
 
 ._sphinx {

+ 51 - 4
lib/docs/filters/numpy/clean_html.rb

@@ -4,11 +4,58 @@ module Docs
       def call
         @doc = at_css('#spc-section-body')
 
-        css('.headerlink').remove  # remove permalinks
+        css('colgroup').remove
 
-        # Add class for correct syntax highlighting
-        css('pre').each do |pre|
-          pre['class'] = 'python'
+        css('.section', 'a > em', 'dt > tt', 'dt > em', 'dt > big', 'tbody').each do |node|
+          node.before(node.children).remove
+        end
+
+        css('.headerlink').each do |node|
+          id = node['href'][1..-1]
+          node.parent['id'] ||= id
+          doc.at_css("span##{id}").try(:remove)
+          node.remove
+        end
+
+        css('tt', 'span.pre').each do |node|
+          node.name = 'code'
+          node.content = node.content
+          node.remove_attribute 'class'
+        end
+
+        css('h1', 'h2', 'h3').each do |node|
+          node.content = node.content
+        end
+
+        css('p.rubric').each do |node|
+          node.name = 'h4'
+        end
+
+        css('blockquote > div:first-child:last-child').each do |node|
+          node.parent.before(node.parent.children).remove
+          node.before(node.children).remove
+        end
+
+        css('.admonition-example').each do |node|
+          title = node.at_css('.admonition-title')
+          title.name = 'h4'
+          title.remove_attribute 'class'
+          node.before(node.children).remove
+        end
+
+        css('em.xref').each do |node|
+          node.name = 'code'
+        end
+
+        css('div[class*="highlight-"]').each do |node|
+          node.content = node.content.strip
+          node.name = 'pre'
+          node['data-language'] = node['class'][/highlight\-(\w+)/, 1]
+          node['class'] = node['data-language'] # tmp
+        end
+
+        css('table[border]').each do |node|
+          node.remove_attribute 'border'
         end
 
         doc

+ 26 - 21
lib/docs/filters/numpy/entries.rb

@@ -2,38 +2,36 @@ module Docs
   class Numpy
     class EntriesFilter < Docs::EntriesFilter
       def get_name
-        dt = at_css('dt')
-        if dt
-          name = dt.content
-          name.sub! /\(.*/, '()'
-          name.sub! /[\=\[].*/, ''
-          name.remove! 'class '
-          name.remove! 'classmethod '
-          name.remove! 'exception '
+        if dt = at_css('dt')
+          name = dt.content.strip
+          name.sub! %r{\(.*}, '()'
+          name.remove! %r{[\=\[].*}
+          name.remove! %r{\A(class(method)?|exception) }
+          name.remove! %r{\s—.*}
         else
           name = at_css('h1').content.strip
         end
-        name.remove! '¶' # remove permalinks from title
+        name.remove! "\u{00B6}"
         name
       end
 
       def get_type
-        type = name.dup
-        nav_items = at_css('.nav.nav-pills.pull-left').children
-        if nav_items[7]
-          # Infer type from navigation item if possible...
-          type = nav_items[7].content
+        nav_items = css('.nav.nav-pills.pull-left > li')
+
+        if nav_items[3]
+          type = nav_items[3].content
+        elsif nav_items[2] && nav_items[2].content !~ /Manual|Reference/
+          type = nav_items[2].content
         else
-          # ... or the page is probably an overview, so use its title.
-          type = at_css('h1').content
-          type.remove! '¶' # remove permalinks from type
+          type = at_css('h1').content.strip
+          type.remove! "\u{00B6}"
 
           # Handle some edge cases that aren't properly categorized in the docs
-          if type[0..16] == 'numpy.polynomial.'
+          if type.start_with?('numpy.polynomial.')
             type = 'Polynomials'
-          elsif type[0..11] == 'numpy.ufunc.'
-            type = 'Universal functions (ufunc)'
-          elsif type[0..12] == 'numpy.nditer.'
+          elsif type.start_with?('numpy.ufunc.')
+            type = 'Universal functions'
+          elsif type.start_with?('numpy.nditer.')
             type = 'Indexing routines'
           elsif type == 'numpy.core.defchararray.chararray.argsort'
             type = 'String operations'
@@ -43,6 +41,13 @@ module Docs
             type = 'Polynomials'
           end
         end
+
+        type.remove! ' with automatic domain'
+        type.remove! %r{\s*\(.*}
+        type.capitalize!
+        type.sub! 'c-api', 'C API'
+        type.sub! 'Numpy', 'NumPy'
+        type.sub! 'swig', 'Swig'
         type
       end
     end

+ 10 - 12
lib/docs/scrapers/numpy.rb

@@ -2,7 +2,8 @@ module Docs
   class Numpy < FileScraper
     self.name = 'NumPy'
     self.type = 'sphinx'
-    self.root_path = 'routines.html'
+    self.dir = '/Users/Thibaut/DevDocs/Docs/numpy/reference/'
+    self.root_path = 'index.html'
     self.links = {
       home: 'http://www.numpy.org/',
       code: 'https://github.com/numpy/numpy'
@@ -15,22 +16,19 @@ module Docs
     # most pages.
     options[:container] = '.main'
 
-    # "generated" pages seem to be autogenerated from python docstrings.
-    # "routines" are mostly lists that help organize the generated pages.
-    # Everything else is manual-like and probably not desired in Devdocs.
-    options[:only_patterns] = [
-      /routines\.?.*\.html/,
-      /generated.*/]
+    options[:skip_patterns] = [
+      /.*(?<!\.html)\z/,
+      /\Agenerated\/numpy\.chararray\.[\w\-]+.html\z/ # duplicate
+    ]
 
     options[:attribution] = <<-HTML
-      &copy; Copyright 2008-2015, The Scipy community.<br>
-      Licensed under a BSD-new License.
+      &copy; 2008&ndash;2016 NumPy Developers<br>
+      Licensed under the NumPy License.
     HTML
 
     version '1.10' do
-      self.release = '1.10'
-      self.dir = '/vagrant/numpy-html/reference/'
-      # self.base_url = 'http://docs.scipy.org/doc/numpy/reference/'
+      self.release = '1.10.1'
+      self.base_url = "https://docs.scipy.org/doc/numpy-#{self.release}/reference/"
     end
   end
 end

二進制
public/icons/docs/numpy/16.png


二進制
public/icons/docs/numpy/16@2x.png


+ 1 - 1
public/icons/docs/numpy/SOURCE

@@ -1 +1 @@
-https://www.scipy.org/_static/images/numpylogo_med.png
+https://github.com/numpy/numpy/blob/master/branding/icons/numpylogoicon.svg