浏览代码

Improve C and C++ scrapers

Fixes #138.
Thibaut 11 年之前
父节点
当前提交
31d5b9d1e0

+ 20 - 4
assets/stylesheets/pages/_c.scss

@@ -1,7 +1,13 @@
 ._c {
   > h2, > h3 { @extend %block-heading; }
   > h4 { @extend %block-label, %label-blue; }
-  > p > code { @extend %label; }
+  .fmbox { @extend %note; }
+  code, .t-mark, .t-mark-rev { @extend %label; }
+
+  .t-mark, .t-mark-rev {
+    white-space: nowrap;
+    @extend %label-green;
+  }
 
   .t-dcl-begin pre {
     margin: 0;
@@ -19,9 +25,19 @@
   }
   .t-sdsc-nopad dl, .t-sdsc-nopad dd { margin: 0; }
 
-  td > h5 {
-    margin: 0;
-    line-height: inherit;
+  td {
+    > h3, > h5 {
+      margin: 0;
+      line-height: inherit;
+    }
+
+    > ul { margin: 0; }
+
+    > .t-dsc-member-div > div { // utility/functional
+      float: left;
+
+      + div { margin-left: .5em; }
+    }
   }
 
   .t-inheritance-diagram {

+ 16 - 7
lib/docs/filters/c/clean_html.rb

@@ -2,13 +2,11 @@ module Docs
   class C
     class CleanHtmlFilter < Filter
       def call
-        if root_page?
-          doc.inner_html = ' '
-          return doc
-        end
+        css('h1').remove if root_page?
 
-        css('#siteSub', '#contentSub', '.printfooter', '.t-navbar', '.editsection', '#toc', '.t-dsc-sep', '.t-dcl-sep',
-            '#catlinks', '.ambox-notice', '.mw-cite-backlink', '.t-sdsc-sep:first-child:last-child').remove
+        css('#siteSub', '#contentSub', '.printfooter', '.t-navbar', '.editsection', '#toc',
+            '.t-dsc-sep', '.t-dcl-sep', '#catlinks', '.ambox-notice', '.mw-cite-backlink',
+            '.t-sdsc-sep:first-child:last-child', '.t-example-live-link').remove
 
         css('#bodyContent', '.mw-content-ltr', 'span[style]').each do |node|
           node.before(node.children).remove
@@ -26,10 +24,16 @@ module Docs
           node.content = ' ' if node.content.empty?
         end
 
-        css('tt').each do |node|
+        css('tt', 'span > span.source-cpp').each do |node|
           node.name = 'code'
         end
 
+        css('div > span.source-cpp').each do |node|
+          node.name = 'pre'
+          node.inner_html = node.inner_html.gsub('<br>', "\n")
+          node.content = node.content
+        end
+
         css('div > a > img[alt="About this image"]').each do |node|
           node.parent.parent.remove
         end
@@ -38,6 +42,11 @@ module Docs
           node['href'] = node['href'].remove('.html')
         end
 
+        css('h1 ~ .fmbox').each do |node|
+          node.name = 'div'
+          node.content = node.content
+        end
+
         doc
       end
     end

+ 2 - 0
lib/docs/filters/cpp/entries.rb

@@ -22,6 +22,8 @@ module Docs
       def get_type
         if at_css('#firstHeading').content.include?('C++ keyword')
           'Keywords'
+        elsif subpath.start_with?('experimental')
+          'Experimental libraries'
         elsif type = at_css('.t-navbar > div:nth-child(4) > :first-child').try(:content)
           type.strip!
           type.remove! ' library'

+ 5 - 0
lib/docs/scrapers/c.rb

@@ -14,6 +14,11 @@ module Docs
     options[:root_title] = 'C Programming Language'
     options[:skip] = %w(language/history.html)
 
+    options[:fix_urls] = ->(url) do
+      url.sub! %r{\A.+/http%3A/}, "http://"
+      url
+    end
+
     options[:attribution] = <<-HTML
       &copy; cppreference.com<br>
       Licensed under the Creative Commons Attribution-ShareAlike Unported License v3.0.

+ 5 - 0
lib/docs/scrapers/cpp.rb

@@ -22,6 +22,11 @@ module Docs
     )
     options[:only_patterns] = [/\.html\z/]
 
+    options[:fix_urls] = ->(url) do
+      url.sub! %r{\A.+/http%3A/}, "http://"
+      url
+    end
+
     options[:attribution] = <<-HTML
       &copy; cppreference.com<br>
       Licensed under the Creative Commons Attribution-ShareAlike Unported License v3.0.