Bladeren bron

Merge pull request #1708 from Nicolapps/scala-3-scraper

Add a scraper for Scala 3
Simon Legner 3 jaren geleden
bovenliggende
commit
39d3696efe

+ 2 - 2
assets/javascripts/templates/pages/about_tmpl.coffee

@@ -794,9 +794,9 @@ credits = [
     'https://raw.githubusercontent.com/sass/sass/stable/MIT-LICENSE'
   ], [
     'Scala',
-    '2002-2019 EPFL, with contributions from Lightbend',
+    '2002-2022 EPFL, with contributions from Lightbend',
     'Apache',
-    'https://raw.githubusercontent.com/scala/scala-lang/master/license.md'
+    'https://www.scala-lang.org/license/'
   ], [
     'scikit-image',
     '2019 the scikit-image team',

+ 52 - 2
assets/javascripts/vendor/prism.js

@@ -1,5 +1,5 @@
-/* PrismJS 1.26.0
-https://prismjs.com/download.html#themes=prism&languages=markup+css+clike+javascript+bash+c+cpp+cmake+coffeescript+crystal+d+dart+diff+django+elixir+erlang+go+groovy+java+json+julia+kotlin+latex+lua+markup-templating+matlab+nginx+nim+ocaml+perl+php+python+qml+r+jsx+ruby+rust+scss+shell-session+sql+typescript+yaml+zig */
+/* PrismJS 1.27.0
+https://prismjs.com/download.html#themes=prism&languages=markup+css+clike+javascript+bash+c+cpp+cmake+coffeescript+crystal+d+dart+diff+django+elixir+erlang+go+groovy+java+json+julia+kotlin+latex+lua+markup-templating+matlab+nginx+nim+ocaml+perl+php+python+qml+r+jsx+ruby+rust+scss+scala+shell-session+sql+typescript+yaml+zig */
 /// <reference lib="WebWorker"/>
 
 var _self = (typeof window !== 'undefined')
@@ -4660,6 +4660,56 @@ Prism.languages.insertBefore('scss', 'function', {
 
 Prism.languages.scss['atrule'].inside.rest = Prism.languages.scss;
 
+Prism.languages.scala = Prism.languages.extend('java', { // Scala grammar, created by extending the 'java' grammar with the tokens below
+	'triple-quoted-string': {
+		pattern: /"""[\s\S]*?"""/,
+		greedy: true,
+		alias: 'string' // reported under the 'string' alias
+	},
+	'string': {
+		pattern: /("|')(?:\\.|(?!\1)[^\\\r\n])*\1/,
+		greedy: true
+	},
+	'keyword': /<-|=>|\b(?:abstract|case|catch|class|def|do|else|extends|final|finally|for|forSome|if|implicit|import|lazy|match|new|null|object|override|package|private|protected|return|sealed|self|super|this|throw|trait|try|type|val|var|while|with|yield)\b/,
+	'number': /\b0x(?:[\da-f]*\.)?[\da-f]+|(?:\b\d+(?:\.\d*)?|\B\.\d+)(?:e\d+)?[dfl]?/i,
+	'builtin': /\b(?:Any|AnyRef|AnyVal|Boolean|Byte|Char|Double|Float|Int|Long|Nothing|Short|String|Unit)\b/,
+	'symbol': /'[^\d\s\\]\w*/
+});
+
+Prism.languages.insertBefore('scala', 'triple-quoted-string', { // inserted into the 'scala' grammar before 'triple-quoted-string'
+	'string-interpolation': {
+		pattern: /\b[a-z]\w*(?:"""(?:[^$]|\$(?:[^{]|\{(?:[^{}]|\{[^{}]*\})*\}))*?"""|"(?:[^$"\r\n]|\$(?:[^{]|\{(?:[^{}]|\{[^{}]*\})*\}))*")/i,
+		greedy: true,
+		inside: {
+			'id': { // the word characters at the very start of the match
+				pattern: /^\w+/,
+				greedy: true,
+				alias: 'function'
+			},
+			'escape': {
+				pattern: /\\\$"|\$[$"]/,
+				greedy: true,
+				alias: 'symbol'
+			},
+			'interpolation': { // $name or ${...}, with at most one level of nested braces
+				pattern: /\$(?:\w+|\{(?:[^{}]|\{[^{}]*\})*\})/,
+				greedy: true,
+				inside: {
+					'punctuation': /^\$\{?|\}$/,
+					'expression': {
+						pattern: /[\s\S]+/,
+						inside: Prism.languages.scala // the expression is tokenized with the Scala grammar itself
+					}
+				}
+			},
+			'string': /[\s\S]+/ // whatever is left over is plain string content
+		}
+	}
+});
+
+delete Prism.languages.scala['class-name']; // drop the 'class-name' token (presumably inherited from the 'java' grammar; confirm)
+delete Prism.languages.scala['function']; // drop the 'function' token as well
+
 (function (Prism) {
 
 	// CAREFUL!

+ 39 - 0
assets/stylesheets/pages/_scala.scss

@@ -1,4 +1,43 @@
 ._scala {
   @extend %simple;
+  
   .deprecated { @extend %label-red; }
+
+  .attributes dl,
+  .attributes pre { 
+    margin: 0;
+  }
+  
+  .related-types { // extends %pre, but with normal white-space handling and no margin
+    @extend %pre;
+    margin: 0;
+    white-space: normal;
+  }
+
+  .links { // centered box, shifted 1rem to the left
+    @extend %box;
+    margin-left: -1rem;
+    text-align: center;
+    padding: .5em;
+
+    a { padding: .4em }
+
+    @include print { // not displayed inside the print mixin
+      display: none;
+    }
+  }
+
+  .source-link { // small link-colored label floated to the right
+    float: right;
+    font-size: .75rem;
+    color: var(--linkColor);
+    cursor: pointer;
+    @extend %user-select-none;
+
+    &:hover { text-decoration: underline; }
+
+    @include print { // not displayed inside the print mixin
+      display: none;
+    }
+  }
 }

+ 1 - 1
lib/docs/filters/scala/clean_html.rb → lib/docs/filters/scala/clean_html_v2.rb

@@ -1,6 +1,6 @@
 module Docs
   class Scala
-    class CleanHtmlFilter < Filter
+    class CleanHtmlV2Filter < Filter
       def call
         @doc = at_css('#content')
 

+ 253 - 0
lib/docs/filters/scala/clean_html_v3.rb

@@ -0,0 +1,253 @@
+# frozen_string_literal: true
+
+module Docs
+  class Scala
+    class CleanHtmlV3Filter < Filter
+      def call
+        # Remove unneeded elements
+        css('.documentableFilter, .documentableAnchor, .documentableBrief').remove
+
+        format_title
+        format_signature
+        format_top_links
+        format_metadata
+
+        # Remove the redundant long descriptions on the main page
+        if slug == 'index'
+          css('.contents').remove
+        else
+          format_members
+        end
+        
+        simplify_html
+
+        doc
+      end
+
+      private
+
+      # Formats the title of the page
+      def format_title
+        cover_header = at_css('.cover-header')
+        return if cover_header.nil?
+
+        # Add the kind of page to the title
+        icon = cover_header.at_css('.micon')
+        types = {
+          cl: 'Class',
+          ob: 'Object',
+          tr: 'Trait',
+          en: 'Enum',
+          ty: 'Type',
+          pa: 'Package',
+        }
+        type_id = cover_header.at_css('.micon')['class']
+        type_id.remove!('micon ')
+        type_id.remove!('-wc')
+        type = types[type_id.to_sym]
+        name = CGI.escapeHTML cover_header.at_css('h1').text
+
+        # Add the package name
+        package = at_css('.breadcrumbs a:nth-of-type(3)').text
+        package = package + '.' unless name.empty? || package.empty?
+
+        # Replace the title
+        title = root_page? ? 'Package root' : "#{type} #{package}#{name}".strip
+        cover_header.replace "<h1>#{title}</h1>"
+      end
+
+      # Formats the signature block at the top of the page
+      def format_signature
+        signature = at_css('.signature')
+        signature_annotations = signature.at_css('.annotations')
+        signature_annotations.name = 'small' unless signature_annotations.nil?
+        signature.replace "<h2 id=\"signature\">#{signature.inner_html}</h2>"
+      end
+
+      # Formats the top links (companion page, source code)
+      def format_top_links
+        # Companion page (e.g. List ↔ List$)
+        links = []
+        at_css('.attributes').css('dt').each do |dt|
+          next if dt.content.strip != 'Companion:'
+          dd = dt.next_sibling
+
+          companion_link = dd.at_css('a')
+          companion_link.content = "Companion #{companion_link.content}"
+          links.append(companion_link.to_html)
+
+          dt.remove
+          dd.remove
+        end
+
+        # Source code
+        at_css('.attributes').css('dt').each do |dt|
+          next if dt.content.strip != 'Source:'
+          dd = dt.next_sibling
+          
+          source_link = dd.at_css('a')
+          source_link.content = 'Source code'
+          links.append(source_link.to_html)
+
+          dt.remove
+          dd.remove
+        end
+
+        # Format the links
+        title = at_css('h1')
+        title.add_next_sibling("<div class=\"links\">#{links.join(' • ')}</div>")
+      end
+
+      # Metadata about the whole file (e.g. supertypes)
+      def format_metadata
+        # Format the values
+        css('.tabs.single .monospace').each do |node|
+          node.css('> div').each do |div|
+            div['class'] = 'member'
+          end
+
+          node['class'] = 'related-types'
+
+          if node.children.count > 15 # Hide too large lists
+            node.replace "<details>
+              <summary>#{node.children.count} types</summary>
+              #{node.to_html}
+            </details>"
+          end
+        end
+
+        attributes = at_css('.attributes')
+
+        # Change the HTML structure
+        tabs_names = css('.tabs.single .names .tab')
+        tabs_contents = css('.tabs.single .contents .tab')
+        tabs_names.zip(tabs_contents).each do |name, contents|
+          next if name.content == "Graph"
+
+          attributes.add_child("<dt>#{name.content}</dt>")
+          attributes.add_child("<dd>#{contents.inner_html.strip}</dd>")
+        end
+
+        convert_dl_to_table(attributes)
+
+        tabs = at_css('.tabs')
+        tabs.remove unless tabs.nil? || tabs.parent['class'] == 'membersList'
+      end
+
+      # Format the members (methods, values…)
+      def format_members
+        # Section headings
+        css('.cover h2').each do |node|
+          node.name = 'h3'
+        end
+        css('h2:not(#signature)').remove
+        css(
+          '.membersList h3',
+
+          # Custom group headers for which Scaladoc generates invalid HTML
+          # (<h3><p>…</p></h3>)
+          '.documentableList > h3:empty + p'
+        ).each do |node|
+          node.name = 'h2'
+          node.content = node.content
+        end
+
+        # Individual members
+        css('.documentableElement').each do |element|
+          header = element.at_css('.header')
+          header.name = 'h3'
+
+          id = element['id']
+          element.remove_attribute('id')
+          header['id'] = id unless id.nil?
+
+          annotations = element.at_css('.annotations')
+          annotations.name = 'small'
+          header.prepend_child(annotations)
+
+          # View source
+          element.css('dt').each do |dt|
+            next if dt.content.strip != 'Source:'
+            dd = dt.next_sibling
+            
+            source_link = dd.at_css('a')
+            source_link.content = 'Source'
+            source_link['class'] = 'source-link'
+            header.prepend_child(source_link)
+
+            dt.remove
+            dd.remove
+          end
+
+          # Format attributes as a table
+          dl = element.at_css('.attributes')
+          convert_dl_to_table(dl) unless dl.nil?
+
+          # Remove the unnecessary wrapper element
+          element.replace(element.inner_html)
+        end
+
+        # Remove deprecated sections
+        css('.documentableList').each do |list|
+          header = list.at_css('.groupHeader')
+          list.remove if (header.text.downcase.include? 'deprecate' rescue false)
+        end
+
+        # Code blocks
+        css('pre > code').each do |code|
+          pre = code.parent
+          pre['data-language'] = 'scala'
+          pre.inner_html = code.inner_html
+        end
+      end
+
+      # Simplify the HTML structure by removing useless elements
+      def simplify_html
+        # Remove unneeded parts of the document
+        @doc = at_css('#content > div')
+
+        # Remove the useless elements around members
+        css('.documentableList > *').each do |element|
+          element.parent = doc
+        end
+        at_css('.membersList').remove
+
+        # Remove useless classes
+        css('.header, .groupHeader, .cover, .documentableName').each do |element|
+          element.remove_attribute('class')
+        end
+
+        # Remove useless attributes
+        css('[t]').each do |element|
+          element.remove_attribute('t')
+        end
+
+        # Remove useless wrapper elements
+        css('.docs, .doc, .memberDocumentation, span, div:not([class])').each do |element|
+          element.replace(element.children)
+        end
+      end
+
+      def convert_dl_to_table(dl)
+        table = Nokogiri::XML::Node.new('table', doc)
+        table['class'] = 'attributes'
+
+        dl.css('> dt').each do |dt|
+          dd = dt.next_element
+          has_dd = dd.name == 'dd' rescue false
+
+          tr = Nokogiri::XML::Node.new('tr', doc)
+          colspan = has_dd ? '' : ' colspan="2"' # handle <dt> without following <dt>
+          tr.add_child("<th#{colspan}>#{dt.inner_html.sub(/:$/, '')}</th>")
+
+          tr.add_child("<td>#{dd.inner_html}</td>") if has_dd
+
+          table.add_child(tr)
+        end
+
+        dl.replace(table)
+      end
+
+    end
+  end
+end

+ 3 - 3
lib/docs/filters/scala/entries.rb → lib/docs/filters/scala/entries_v2.rb

@@ -1,6 +1,6 @@
 module Docs
   class Scala
-    class EntriesFilter < Docs::EntriesFilter
+    class EntriesV2Filter < Docs::EntriesFilter
       REPLACEMENTS = {
         '$eq' => '=',
         '$colon' => ':',
@@ -75,12 +75,12 @@ module Docs
       # include the companion object.
       def package_name
         name = package_drop_last(slug_parts)
-        name.empty? ? '_root_' : name
+        name.empty? ? 'scala' : name
       end
 
       def parent_package
         parent = package_drop_last(package_name.split('.'))
-        parent.empty? ? '_root_' : parent
+        parent.empty? ? 'scala' : parent
       end
 
       def package_drop_last(parts)

+ 105 - 0
lib/docs/filters/scala/entries_v3.rb

@@ -0,0 +1,105 @@
+# frozen_string_literal: true
+
+module Docs
+  class Scala
+    class EntriesV3Filter < Docs::EntriesFilter
+      REPLACEMENTS = {
+        '$eq' => '=',
+        '$colon' => ':',
+        '$less' => '<',
+      }
+
+      def get_name
+        if is_package?
+          at_css('.cover-header h1').text
+        else
+          name = slug.split('/').last
+
+          # Some objects have inner objects, show ParentObject$.ChildObject$ instead of ParentObject$$ChildObject$
+          name = name.gsub('$$', '$.')
+
+          REPLACEMENTS.each do |key, value|
+            name = name.gsub(key, value)
+          end
+
+          # If a dollar sign is used as separator between two characters, replace it with a dot
+          name.gsub(/([^$.])\$([^$.])/, '\1.\2')
+        end
+      end
+
+      def get_type
+        # if this entry is for a package, we group the package under the parent package
+        if is_package?
+          parent_package
+        # otherwise, group it under the regular package name
+        else
+          package_name
+        end
+      end
+
+      def include_default_entry?
+        # Ignore package pages
+        at_css('.cover-header .micon.pa').nil?
+      end
+
+      def additional_entries
+        entries = []
+        titles = []
+
+        css(".documentableElement").each do |node|
+          # Ignore elements without IDs
+          id = node['id']
+          next if id.nil?
+
+          # Ignore deprecated and inherited members
+          next unless node.at_css('.deprecated').nil?
+
+          member_name = node.at_css('.documentableName').content
+          title = "#{name}.#{member_name}"
+          
+          # Add () to methods that take parameters, i.e. methods who have (…)
+          # in their signature, ignoring occurrences of (implicit …) and (using …)
+          signature = node.at_css('.signature').content
+          title += '()' if signature =~ /\((?!implicit)(?!using ).*\)/
+
+          next if titles.include?(title) # Ignore duplicates (function overloading)
+        
+          entries << [title, id]
+          titles.push(title)
+        end
+
+        entries
+      end
+
+      private
+
+      # For the package name, we use the slug rather than parsing the package
+      # name from the HTML because companion object classes may be broken out into
+      # their own entries (by the source documentation). When that happens,
+      # we want to group these classes (like `scala.reflect.api.Annotations.Annotation`)
+      # under the package name, and not the fully-qualfied name which would
+      # include the companion object.
+      def package_name
+        name = package_drop_last(slug_parts)
+        name.empty? ? 'scala' : name
+      end
+
+      def parent_package
+        parent = package_drop_last(package_name.split('.'))
+        parent.empty? ? 'scala' : parent
+      end
+
+      def package_drop_last(parts)
+        parts[0...-1].join('.')
+      end
+
+      def slug_parts
+        slug.split('/')
+      end
+
+      def is_package?
+        !at_css('.cover-header .micon.pa').nil?
+      end
+    end
+  end
+end

+ 38 - 9
lib/docs/scrapers/scala.rb

@@ -3,24 +3,50 @@ module Docs
     self.name = 'Scala'
     self.type = 'scala'
     self.links = {
-      home: 'http://www.scala-lang.org/',
+      home: 'https://www.scala-lang.org/',
       code: 'https://github.com/scala/scala'
     }
 
-    options[:container] = '#content-container'
     options[:attribution] = <<-HTML
-        &copy; 2002-2019 EPFL, with contributions from Lightbend.<br>
+        &copy; 2002-2022 EPFL, with contributions from Lightbend.<br>
         Licensed under the Apache License, Version 2.0.
     HTML
 
+    # For Scala 3, there is no official download link for the documentation
+    # (see https://contributors.scala-lang.org/t/5537).
+    #
+    # We currently need to build the docs ourselves. To do so:
+    # 1. Make sure that Scala 3 and sbt are installed
+    #    (https://www.scala-lang.org/download/scala3.html)
+    # 2. Clone the Scala 3 (Dotty) repository (https://github.com/lampepfl/dotty)
+    # 3. From the Dotty folder, run this command in the terminal:
+    #    $ sbt scaladoc/generateScalaDocumentation
+    # 4. Extract scaladoc/output/scala3/api/ into docs/scala~3.1
+    version '3.1' do
+      self.release = '3.1.1'
+      self.base_url = 'https://scala-lang.org/api/3.1.1/'
+      self.root_path = 'index.html'
+
+      options[:skip_patterns] = [
+        # Ignore class names that include “#”, which cause issues with the scraper
+        /%23/,
+
+        # Ignore local links to the Java documentation created by a Scaladoc bug
+        /java\/lang/,
+      ]
+
+      html_filters.push 'scala/entries_v3', 'scala/clean_html_v3'
+    end
+
     # https://downloads.lightbend.com/scala/2.13.0/scala-docs-2.13.0.zip
     # Extract api/scala-library into docs/scala~2.13_library
     version '2.13 Library' do
       self.release = '2.13.0'
       self.base_url = 'https://www.scala-lang.org/api/2.13.0/'
       self.root_path = 'index.html'
+      options[:container] = '#content-container'
 
-      html_filters.push 'scala/entries', 'scala/clean_html'
+      html_filters.push 'scala/entries_v2', 'scala/clean_html_v2'
     end
 
     # https://downloads.lightbend.com/scala/2.13.0/scala-docs-2.13.0.zip
@@ -29,8 +55,9 @@ module Docs
       self.release = '2.13.0'
       self.base_url = 'https://www.scala-lang.org/api/2.13.0/scala-reflect/'
       self.root_path = 'index.html'
+      options[:container] = '#content-container'
 
-      html_filters.push 'scala/entries', 'scala/clean_html'
+      html_filters.push 'scala/entries_v2', 'scala/clean_html_v2'
     end
 
     # https://downloads.lightbend.com/scala/2.12.9/scala-docs-2.12.9.zip
@@ -39,8 +66,9 @@ module Docs
       self.release = '2.12.9'
       self.base_url = 'https://www.scala-lang.org/api/2.12.9/'
       self.root_path = 'index.html'
+      options[:container] = '#content-container'
 
-      html_filters.push 'scala/entries', 'scala/clean_html'
+      html_filters.push 'scala/entries_v2', 'scala/clean_html_v2'
     end
 
     # https://downloads.lightbend.com/scala/2.12.9/scala-docs-2.12.9.zip
@@ -49,13 +77,14 @@ module Docs
       self.release = '2.12.9'
       self.base_url = 'https://www.scala-lang.org/api/2.12.9/scala-reflect/'
       self.root_path = 'index.html'
+      options[:container] = '#content-container'
 
-      html_filters.push 'scala/entries', 'scala/clean_html'
+      html_filters.push 'scala/entries_v2', 'scala/clean_html_v2'
     end
 
     def get_latest_version(opts)
-      doc = fetch_doc('https://www.scala-lang.org/api/current/', opts)
-      doc.at_css('#doc-version').content
+      doc = fetch_doc('https://www.scala-lang.org/api/3.x/', opts)
+      doc.at_css('.projectVersion').content
     end
   end
 end