浏览代码

Improve Erlang scraper

Closes #432.
Thibaut Courouble 9 年之前
父节点
当前提交
2346300cee

+ 1 - 0
assets/stylesheets/pages/_erlang.scss

@@ -2,6 +2,7 @@
   @extend %simple;
 
   h3.code { @extend %code; }
+  code.code { @extend %label; }
   .note { @extend %note; }
   .warning { @extend %note, %note-red; }
   .note .label, .warning .label { font-weight: bold; }

+ 3 - 7
lib/docs/filters/erlang/clean_html.rb

@@ -45,7 +45,7 @@ module Docs
           node.content = content.capitalize if content == content.upcase
         end
 
-        css('p > span.bold_code:first-child ~ br:last-child').each do |node|
+        css('p > .bold_code:first-child ~ br:last-child').each do |node|
           parent = node.parent
           parent.name = 'h3'
           parent['class'] = 'code'
@@ -54,15 +54,11 @@ module Docs
           parent.inner_html = parent.inner_html.strip
         end
 
-        css('span.code').each do |node|
-          node.name = 'code'
-        end
-
-        css('pre *:not(a)').each do |node|
+        css('pre:not(.REFTYPES) *:not(a)', 'a[href^=javascript]').each do |node|
           node.before(node.children).remove
         end
 
-        css('pre').each do |node|
+        css('pre:not(.REFTYPES)').each do |node|
           node.inner_html = node.inner_html.strip_heredoc
         end
 

+ 50 - 11
lib/docs/filters/erlang/entries.rb

@@ -3,14 +3,32 @@ module Docs
     class EntriesFilter < Docs::EntriesFilter
       def get_name
         name = at_css('h1').content.strip
-        name.prepend 'Guide: ' if doc.inner_html.include?('<strong>User\'s Guide</strong>')
+        name << " (#{type.remove('Guide: ')})" if name == '1 Introduction'
         name
       end
 
       def get_type
-        type = subpath[/lib\/(.+?)[\-\/]/, 1]
-        type << "/#{name}" if type == 'stdlib' && entry_nodes.length >= 10
-        type
+        if subpath.start_with?('lib/')
+          type = subpath[/lib\/(.+?)[\-\/]/, 1]
+          type << "/#{name}" if type == 'stdlib' && entry_nodes.length >= 10
+          type
+        elsif subpath.start_with?('doc/')
+          type = subpath[/doc\/(.+?)\//, 1]
+          type.capitalize!
+          type.sub! '_', ' '
+          type.sub! 'Oam', 'OAM'
+          type.remove! ' Guide'
+          type.prepend 'Guide: '
+          type
+        elsif subpath.start_with?('erts')
+          type = 'ERTS'
+          if name =~ /\A\d/
+            type.prepend 'Guide: '
+          elsif entry_nodes.length > 0
+            type << "/#{name}"
+          end
+          type
+        end
       end
 
       def include_default_entry?
@@ -18,17 +36,38 @@ module Docs
       end
 
       def additional_entries
-        entry_nodes.map do |node|
-          id = node['name']
-          name = id.gsub %r{\-(?<arity>.*)\z}, '/\k<arity>'
-          name.remove! 'Module:'
-          name.prepend "#{self.name}:"
-          [name, id]
+        return [] unless include_default_entry?
+
+        if subpath.start_with?('lib/')
+          entry_nodes.map do |node|
+            id = node['name']
+            name = id.gsub %r{\-(?<arity>.*)\z}, '/\k<arity>'
+            name.remove! 'Module:'
+            name.prepend "#{self.name}:"
+            [name, id]
+          end
+        elsif subpath.start_with?('doc/')
+          []
+        elsif subpath.start_with?('erts')
+          return [] if type.start_with?('Guide')
+          entry_nodes.map do |node|
+            id = node['href'][/#(.+)/, 1]
+            name = node.content.strip
+            name.remove! 'Module:'
+            name.prepend "#{self.name}:"
+            [name, id]
+          end
         end
       end
 
       def entry_nodes
-        @entry_nodes ||= css('div.REFBODY + p > a')
+        @entry_nodes ||= if subpath.start_with?('lib/')
+          css('div.REFBODY + p > a')
+        elsif subpath.start_with?('erts')
+          link = at_css(".flipMenu a[href='#{File.basename(subpath, '.html')}']")
+          list = link.parent.parent
+          list['class'] == 'flipMenu' ? [] : list.css('a').to_a.tap { |a| a.delete(link); }
+        end
       end
     end
   end

+ 11 - 1
lib/docs/filters/erlang/pre_clean_html.rb

@@ -2,7 +2,17 @@ module Docs
   class Erlang
     class PreCleanHtmlFilter < Filter
       def call
-        css('.flipMenu li[title] > a').remove
+        css('.flipMenu li[title] > a').remove unless subpath.start_with?('erts') # perf
+
+        css('.REFTYPES').each do |node|
+          node.name = 'pre'
+        end
+
+        css('span.bold_code', 'span.code').each do |node|
+          node.name = 'code'
+          node.inner_html = node.inner_html.strip.gsub(/\s+/, ' ')
+        end
+
         doc
       end
     end

+ 6 - 1
lib/docs/scrapers/erlang.rb

@@ -10,13 +10,18 @@ module Docs
     html_filters.insert_after 'container', 'erlang/pre_clean_html'
     html_filters.push 'erlang/entries', 'erlang/clean_html'
 
-    options[:only_patterns] = [/\Alib/]
+    options[:only_patterns] = [
+      /\Alib/,
+      /\Adoc\/\w+\//,
+      /\Aerts.+\/html/
+    ]
 
     options[:skip_patterns] = [
       /pdf/,
       /release_notes/,
       /result/,
       /java/,
+      /\.erl\z/,
       /\/html\/.*_app\.html\z/,
       /_examples\.html\z/,
       /\Alib\/edoc/,