Pārlūkot izejas kodu

Finish OpenJDK scraper

Thibaut Courouble 8 gadi atpakaļ
vecāks
revīzija
2efce74521

BIN
assets/images/docs.png


BIN
assets/images/docs@2x.png


+ 3 - 0
assets/javascripts/news.json

@@ -1,5 +1,8 @@
 [
   [
+    "2017-04-30",
+    "New documentation: <a href=\"/openjdk/\">OpenJDK</a>"
+  ], [
     "2017-02-26",
     "Refreshed design.",
     "Added <a href=\"/settings\">Preferences</a>."

+ 5 - 0
assets/javascripts/templates/pages/about_tmpl.coffee

@@ -413,6 +413,11 @@ credits = [
     '2008-2017 NumPy Developers',
     'NumPy',
     'https://raw.githubusercontent.com/numpy/numpy/master/LICENSE.txt'
+  ], [
+    'OpenJDK',
+    '1993-2017, Oracle and/or its affiliates. All rights reserved.<br>Licensed under the GNU General Public License, version 2, with the Classpath Exception.<br>Various third party code in OpenJDK is licensed under different licenses.<br>Java and OpenJDK are trademarks or registered trademarks of Oracle and/or its affiliates.',
+    'GPLv2',
+    'http://openjdk.java.net/legal/gplv2+ce.html'
   ], [
     'OpenTSDB',
     '2010-2016 The OpenTSDB Authors',

+ 5 - 0
assets/stylesheets/global/_base.scss

@@ -123,6 +123,11 @@ table {
   border-radius: 3px;
 }
 
+caption {
+  font-weight: $boldFontWeight;
+  padding: 0 .7em .3em;
+}
+
 th, td {
   vertical-align: top;
   padding: .3em .7em;

+ 1 - 0
assets/stylesheets/global/_icons.scss

@@ -43,6 +43,7 @@
 %icon-clipboard-white       { background-position: -1rem -2rem; }
 %icon-close-white           { background-position: -2rem -2rem; }
 
+._icon-openjdk:before       { background-position: -2rem 0; }
 ._icon-codeceptjs:before    { background-position: -3rem 0; }
 ._icon-codeception:before   { background-position: -4rem 0; }
 ._icon-sqlite:before        { background-position: -5rem 0; @extend %darkIconFix !optional; }

+ 4 - 19
assets/stylesheets/pages/_openjdk.scss

@@ -1,22 +1,7 @@
 ._openjdk {
-  > ul.inheritance {
-    @extend %note, %note-blue;
-    li {
-      list-style: none;
-    }
-  }
+  @extend %simple;
 
-  ul.blockList, ul.blockListLast {
-    padding-left: 0;
-    li.blockList {
-      list-style: none;
-    }
-  }
-
-  h3 {
-    @extend %block-heading;
-  }
-  h4 {
-    @extend %block-label, %label-blue;
-  }
+  ul.inheritance { list-style: none; }
+  > ul.inheritance { @extend %note, %note-blue; }
+  > ul.inheritance ul.inheritance { margin: 0; }
 }

+ 4 - 0
lib/docs/core/filter.rb

@@ -42,6 +42,10 @@ module Docs
       context[:version]
     end
 
+    def release
+      context[:release]
+    end
+
     def subpath
       @subpath ||= subpath_to(current_url)
     end

+ 1 - 1
lib/docs/core/scraper.rb

@@ -116,7 +116,7 @@ module Docs
       @options ||= self.class.options.deep_dup.tap do |options|
         options.merge! base_url: base_url, root_url: root_url,
                        root_path: root_path, initial_paths: initial_paths,
-                       version: self.class.version
+                       version: self.class.version, release: self.class.release
 
         if root_path?
           (options[:skip] ||= []).concat ['', '/']

+ 102 - 56
lib/docs/filters/openjdk/clean_html.rb

@@ -1,7 +1,11 @@
+# frozen_string_literal: true
+
 module Docs
   class Openjdk
     class CleanHtmlFilter < Filter
       def call
+        css('.topNav', '.subNav', '.bottomNav', '.legalCopy', 'noscript', '.subTitle').remove
+
         # Preserve internal fragment links
         # Transform <a name="foo"><!-- --></a><bar>text</bar>
         #      into <bar id="foo">text</bar>
@@ -12,75 +16,117 @@ module Docs
           end
         end
 
-        # Find the main container
-        # Root page have three containers, we use the second one
-        container = at_css('.contentContainer' + (root_page? ? ':nth-of-type(2)' : ''))
-
-        # Move description to the container top
-        if description_link = at_css('a[href$=".description"]')
-          target = description_link['href'][1..-1]
-          description_nodes = xpath("//*[@id='#{target}'] | //*[@id='#{target}']/following-sibling::*")
-          container.prepend_child(description_nodes)
-          description_nodes.at_css('h2:contains("Description")')&.remove
-          description_link.parent.remove
-        end
-
-        # Remove superfluous and duplicated content
-        css('.subTitle', '.docSummary', '.summary caption', 'caption span.tabEnd').remove
-        css('table[class$="Summary"] > tr > th').each do |th|
-          th.parent.remove
-        end
-        css('h3[id$=".summary"]').each do |header|
-          # Keep only a minimal list of annotation required/optional elements
-          # as with "Methods inherited from class"
-          if header['id'].match? %r{\.element\.summary$}
-            table_summary = header.next_element
-            code_summary = header.document.create_element 'code'
-            table_summary.css('.memberNameLink a').each_with_index do |element, index|
-              code_summary << header.document.create_text_node(', ') if index > 0
-              code_summary << element
-            end
-            table_summary.replace(code_summary)
-          # Remove summary element if detail exists
-          elsif detail_header = at_css("h3[id='#{header['id'].sub('summary','detail')}']")
-            header.next_element.remove
-            header.replace(detail_header.parent.children)
-          end
-        end
-        at_css('.details')&.remove unless at_css('.details h3')
-        css('h3[id$=".summary"]', 'h3[id$=".detail"]', 'caption span').each do |header|
-          header.name = 'h3' if header.name == 'span'
-          content = header.content
-          content.remove! ' Summary'
-          content.remove! ' Detail'
-          header.content = content.pluralize
+        # Remove superfluous content on package pages
+        css('h2:contains("Package Specification")').each do |node|
+          node.next.remove while node.next
+          node.remove
         end
-        css('h4').each do |entry_header|
-          entry_pre = entry_header.next_element
-          entry_header.children = entry_pre.children
-          entry_pre.remove
+
+        # Replace summary tables with their detail content
+        css('h3[id$=".summary"]').each do |node|
+          id = node['id'].sub('summary', 'detail')
+          detail = at_css("h3[id='#{id}']") || at_css("h3[id='#{id.remove('optional.').remove('required.')}']")
+          node.parent.children = detail.parent.children if detail
         end
 
-        # Keep only header and container
-        container.prepend_child(at_css('.header'))
-        @doc = container
+        css('h3[id$=".summary"]', 'h3[id$=".detail"]').each do |node|
+          node.content = node.content.remove(' Summary').remove(' Detail').pluralize
+        end
 
-        # Remove packages not belonging to this version
         if root_page?
-          at_css('.overviewSummary caption h3').content =
-            version + ' ' +
-            at_css('.overviewSummary caption h3').content
-          css('.overviewSummary td.colFirst a').each do |node|
-            unless context[:only_patterns].any? { |pattern| node['href'].match? pattern }
+          css('.header')[1].remove
+          css('.contentContainer')[0].remove
+          css('.contentContainer')[-1].remove
+
+          # Remove packages not belonging to this version
+          css('td.colFirst a').each do |node|
+            unless context[:only_patterns].any? { |pattern| pattern =~ node['href'] }
               node.parent.parent.remove
             end
           end
+
+          at_css('h1').content = "OpenJDK #{release} Documentation" + (version != release ? " (#{version.split(' ').last})" : '')
+        end
+
+        css('table').each do |node|
+          node.remove_attribute 'summary'
+          node.remove_attribute 'cellspacing'
+          node.remove_attribute 'cellpadding'
+          node.remove_attribute 'border'
+        end
+
+        css('span.deprecatedLabel').each { |node| node.name = 'strong' }
+
+        css('.contentContainer', '.docSummary', 'div.header', 'div.description', 'div.summary', 'span', 'tbody').each do |node|
+          node.before(node.children).remove
+        end
+
+        css('tt').each { |node| node.name = 'code' }
+        css('div.block').each { |node| node.name = 'p' unless node.at_css('.block, p') }
+
+        # Create paragraphs
+        css('div > p:first-of-type').each do |node|
+          node.before('<p></p>')
+          node = node.previous
+          node.prepend_child(node.previous) while node.previous
+        end
+
+        css('ul > li > table:only-child').each do |node|
+          node.parent.parent.before(node)
+        end
+
+        css('blockquote > table:only-child', 'blockquote > dl:only-child').each do |node|
+          node.parent.before(node).remove
+        end
+
+        css('blockquote > pre:only-child').each do |node|
+          node.content = node.content.strip_heredoc
+          node.parent.before(node).remove
+        end
+
+        css('blockquote > code').each do |node|
+          node.parent.name = 'pre'
+          node.content = node.content.strip.gsub(/\s+/, ' ')
+        end
+
+        css('dt > cite').each do |node| # remove "See The Java™ Language Specification"
+          node.parent.next_element.remove
+          node.parent.remove
+        end
+
+        css('dt:contains("See Also")').each do |node|
+          unless node.next_element.at_css('a')
+            node.next_element.remove
+            node.remove
+          end
         end
 
-        # Syntax highlighter
+        css('ul.blockList li.blockList:only-child').each do |node|
+          node.first_element_child['id'] ||= node.parent['id'] if node.parent['id']
+          node.parent.before(node.children).remove
+        end
+
+        css('hr + br', 'p + br', 'div + br', 'hr').remove
+
         css('pre').each do |node|
+          node.content = node.content.strip
           node['data-language'] = 'java'
         end
+
+        css('.title').each do |node|
+          node.name = 'h1'
+        end
+
+        css('h3, h4').each do |node|
+          node.name = node.name.sub(/\d/) { |i| i.to_i - 1 }
+        end
+
+        css('*[title]').remove_attr('title')
+
+        css('*[class]').each do |node|
+          node.remove_attribute('class') unless node['class'] == 'inheritance'
+        end
+
         doc
       end
     end

+ 6 - 4
lib/docs/filters/openjdk/clean_urls.rb

@@ -1,3 +1,5 @@
+# frozen_string_literal: true
+
 module Docs
   class Openjdk
     class CleanUrlsFilter < Filter
@@ -8,10 +10,10 @@ module Docs
 
           # The following code ignores most options that InternalUrlsFilter accepts,
           # only the currently used options are considered here.
-          self.class.parent.versions.each do |v|
-            if v.options[:only_patterns].any? { |pattern| path.match? pattern } &&
-               v.options[:skip_patterns].none? { |pattern| path.match? pattern }
-              node['href'] = "/#{v.slug}/#{path}"
+          self.class.parent.versions.each do |version|
+            if version.options[:only_patterns].any? { |pattern| path.match?(pattern) } &&
+               version.options[:skip_patterns].none? { |pattern| path.match?(pattern) }
+              node['href'] = "/#{version.slug}/#{path}"
               break
             end
           end

+ 14 - 8
lib/docs/filters/openjdk/entries.rb

@@ -1,3 +1,5 @@
+# frozen_string_literal: true
+
 module Docs
   class Openjdk
     class EntriesFilter < Docs::EntriesFilter
@@ -13,24 +15,28 @@ module Docs
       end
 
       def get_type
+        return 'Packages' if slug.end_with?('package-summary')
+
         if subtitle = at_css('.header > .subTitle:last-of-type')
-          subtitle.content.strip
+          type = subtitle.content.strip
         else
-          at_css('.header > .title').content.strip.remove 'Package '
+          type = at_css('.header > .title').content.strip.remove 'Package '
         end
+        type = type.split('.')[0..2].join('.')
+        type
       end
 
       def additional_entries
         # Only keep the first found entry with a unique name,
         # i.e. overloaded methods are skipped in index
         css('a[name$=".summary"]').each_with_object({}) do |summary, entries|
-          next if summary['name'] == 'nested.class.summary'
+          next if summary['name'].include?('nested') || summary['name'].include?('constructor') ||
+                  summary['name'].include?('field') || summary['name'].include?('constant')
           summary.parent.css('.memberNameLink a').each do |node|
-            entry_name = node.parent.parent.content.strip
-            entry_name.sub! %r{\(.+?\)}m, '()'
-            id = node['href']
-            id.remove! %r{.*#}
-            entries[entry_name] ||= [name + '.' + entry_name, id]
+            name = node.parent.parent.content.strip
+            name.sub! %r{\(.+?\)}m, '()'
+            id = node['href'].remove(%r{.*#})
+            entries[name] ||= ["#{self.name}.#{name}", id]
           end
         end.values
       end

+ 16 - 13
lib/docs/scrapers/openjdk.rb

@@ -3,17 +3,12 @@ module Docs
     self.name = 'OpenJDK'
     self.type = 'openjdk'
     self.root_path = 'overview-summary.html'
-    self.links = {
-      home: 'http://openjdk.java.net/',
-      code: 'http://hg.openjdk.java.net/jdk8u'
-    }
-    self.release = '8'
     # Downloaded from packages.debian.org/sid/openjdk-8-doc
-    # extracting subdirectoy /usr/share/doc/openjdk-8-jre-headless/api
-    self.dir = '/Users/Thibaut/DevDocs/Docs/Java'
+    # Extracting subdirectory /usr/share/doc/openjdk-8-jre-headless/api
+    self.dir = '/Users/Thibaut/DevDocs/Docs/OpenJDK'
 
-    html_filters.push 'openjdk/entries', 'openjdk/clean_html'
     html_filters.insert_after 'internal_urls', 'openjdk/clean_urls'
+    html_filters.push 'openjdk/entries', 'openjdk/clean_html'
 
     options[:skip_patterns] = [
       /compact[123]-/,
@@ -25,11 +20,15 @@ module Docs
 
     options[:attribution] = <<-HTML
       &copy; 1993&ndash;2017, Oracle and/or its affiliates. All rights reserved.<br>
-      Use is subject to <a href="http://download.oracle.com/otndocs/jcp/java_se-8-mrel-spec/license.html">license terms</a>.<br>
-      We are not endorsed by or affiliated with Oracle.
+      Documentation extracted from Debian's OpenJDK Development Kit package.<br>
+      Licensed under the GNU General Public License, version 2, with the Classpath Exception.<br>
+      Various third party code in OpenJDK is licensed under different licenses (see Debian package).<br>
+      Java and OpenJDK are trademarks or registered trademarks of Oracle and/or its affiliates.
     HTML
 
-    version 'Core' do
+    version '8' do
+      self.release = '8'
+
       options[:only_patterns] = [
         /\Ajava\/beans\//,
         /\Ajava\/io\//,
@@ -55,13 +54,17 @@ module Docs
         /\Ajavax\/tools\//]
     end
 
-    version 'GUI' do
+    version '8 GUI' do
+      self.release = '8'
+
       options[:only_patterns] = [
         /\Ajava\/awt\//,
         /\Ajavax\/swing\//]
     end
 
-    version 'Web' do
+    version '8 Web' do
+      self.release = '8'
+
       options[:only_patterns] = [
         /\Ajava\/applet\//,
         /\Ajava\/rmi\//,

BIN
public/icons/docs/openjdk/16.png


BIN
public/icons/docs/openjdk/16@2x.png