Browse Source

New scraper: graphviz (also known as DOT language)

Closes: https://trello.com/c/Jaa1vC24

This is my second scraper. It's almost in good shape, but I left a
TODO comment because I couldn't figure out how to make UrlScraper
reliably scrape the website. I keep getting random errors.

Otherwise, it's ready for review.
Denilson Sá Maia 1 tháng trước cách đây
mục cha
commit
28748b9e3a

+ 78 - 1
assets/javascripts/vendor/prism.js

@@ -1,5 +1,5 @@
 /* PrismJS 1.30.0
-https://prismjs.com/download.html#themes=prism&languages=markup+css+clike+javascript+bash+c+cpp+cmake+coffeescript+crystal+d+dart+diff+django+elixir+erlang+go+groovy+java+json+julia+kotlin+latex+lua+markdown+markup-templating+matlab+nginx+nim+nix+ocaml+perl+php+python+qml+r+jsx+ruby+rust+scss+scala+shell-session+sql+tcl+typescript+yaml+zig */
+https://prismjs.com/download.html#themes=prism&languages=markup+css+clike+javascript+bash+c+cpp+cmake+coffeescript+crystal+d+dart+diff+django+dot+elixir+erlang+go+groovy+java+json+julia+kotlin+latex+lua+markdown+markup-templating+matlab+nginx+nim+nix+ocaml+perl+php+python+qml+r+jsx+ruby+rust+scss+scala+shell-session+sql+tcl+typescript+yaml+zig */
 /// <reference lib="WebWorker"/>
 
 var _self = (typeof window !== 'undefined')
@@ -2929,6 +2929,83 @@ Prism.languages.insertBefore('d', 'function', {
 
 }(Prism));
 
+// Grammar for the Graphviz DOT language. Reference: https://www.graphviz.org/doc/info/lang.html
+
+(function (Prism) {
+
+	var ID = '(?:' + [ // regex source for one DOT ID: a non-capturing group of the four alternatives below
+		// an identifier
+		/[a-zA-Z_\x80-\uFFFF][\w\x80-\uFFFF]*/.source,
+		// a number
+		/-?(?:\.\d+|\d+(?:\.\d*)?)/.source,
+		// a double-quoted string
+		/"[^"\\]*(?:\\[\s\S][^"\\]*)*"/.source,
+		// HTML-like string
+		/<(?:[^<>]|(?!<!--)<(?:[^<>"']|"[^"]*"|'[^']*')+>|<!--(?:[^-]|-(?!->))*-->)*>/.source
+	].join('|') + ')';
+
+	var IDInside = { // nested grammar shared by every token below that is matched through ID
+		'markup': { // the text between a leading '<' and a trailing '>' (the '<' itself is a lookbehind)
+			pattern: /(^<)[\s\S]+(?=>$)/,
+			lookbehind: true,
+			alias: ['language-markup', 'language-html', 'language-xml'],
+			inside: Prism.languages.markup
+		}
+	};
+
+	/**
+	 * Builds a RegExp from a source string in which every `<ID>` placeholder is replaced by the ID pattern.
+	 *
+	 * @param {string} source regex source containing `<ID>` placeholders
+	 * @param {string} flags flags passed to the RegExp constructor (omitted by some callers below)
+	 * @returns {RegExp}
+	 */
+	function withID(source, flags) {
+		return RegExp(source.replace(/<ID>/g, function () { return ID; }), flags); // a replacer function, so ID is inserted literally
+	}
+
+	Prism.languages.dot = {
+		'comment': { // '//' line comments, '/* */' block comments, and lines starting with '#'
+			pattern: /\/\/.*|\/\*[\s\S]*?\*\/|^#.*/m,
+			greedy: true
+		},
+		'graph-name': { // an ID following digraph/graph/subgraph and whitespace, matched case-insensitively
+			pattern: withID(/(\b(?:digraph|graph|subgraph)[ \t\r\n]+)<ID>/.source, 'i'),
+			lookbehind: true,
+			greedy: true,
+			alias: 'class-name',
+			inside: IDInside
+		},
+		'attr-value': { // an ID after '=' and optional whitespace
+			pattern: withID(/(=[ \t\r\n]*)<ID>/.source),
+			lookbehind: true,
+			greedy: true,
+			inside: IDInside
+		},
+		'attr-name': { // an ID followed by '=', and preceded by '[', ';', ',' or whitespace
+			pattern: withID(/([\[;, \t\r\n])<ID>(?=[ \t\r\n]*=)/.source),
+			lookbehind: true,
+			greedy: true,
+			inside: IDInside
+		},
+		'keyword': /\b(?:digraph|edge|graph|node|strict|subgraph)\b/i,
+		'compass-point': { // e, w, c, _, n, s, ne, nw, se or sw after a ':'
+			pattern: /(:[ \t\r\n]*)(?:[ewc_]|[ns][ew]?)(?![\w\x80-\uFFFF])/,
+			lookbehind: true,
+			alias: 'builtin'
+		},
+		'node': { // an ID at the start of the text or after a character other than '-', '.', '\', \w or \x80-\uFFFF
+			pattern: withID(/(^|[^-.\w\x80-\uFFFF\\])<ID>/.source),
+			lookbehind: true,
+			greedy: true,
+			inside: IDInside
+		},
+		'operator': /[=:]|-[->]/,
+		'punctuation': /[\[\]{};,]/
+	};
+
+	Prism.languages.gv = Prism.languages.dot; // 'gv' refers to the same grammar object as 'dot'
+
+}(Prism));
+
 Prism.languages.elixir = {
 	'doc': {
 		pattern: /@(?:doc|moduledoc)\s+(?:("""|''')[\s\S]*?\1|("|')(?:\\(?:\r\n|[\s\S])|(?!\2)[^\\\r\n])*\2)/,

+ 44 - 0
lib/docs/filters/graphviz/clean_html.rb

@@ -0,0 +1,44 @@
+module Docs
+  class Graphviz
+    class CleanHtmlFilter < Filter
+      def call
+        css('[tabindex]').remove_attribute('tabindex')
+
+        content = at_css('.td-content')
+        @doc = content if content
+
+        css('pre:has(code)').each do |node|
+          pre = Nokogiri::XML::Node.new('pre', @doc)
+          code = node.at_css('code')
+
+          if code['data-lang']
+            # Syntax highlighting is embedded into this HTML markup.
+            pre['data-language'] = code['data-lang']
+          else
+            # Plain example source-code without highlighting.
+            # Let's guess the language.
+            sourcecode = code.content.strip
+            if sourcecode =~ /^\$/
+              # Starts with '$'? Probably a shell session.
+              pre['data-language'] = 'shell-session'
+            elsif sourcecode =~ /^cmd /
+              # Command line example. No highlighting needed.
+              pre['data-language'] = ''
+            elsif sourcecode =~ /^void /
+              # C language.
+              pre['data-language'] = 'c'
+            else
+              # Nothing else? Let's guess DOT.
+              pre['data-language'] = 'dot'
+            end
+          end
+          pre.content = code.content
+
+          node.replace(pre)
+        end
+
+        doc
+      end
+    end
+  end
+end

+ 28 - 0
lib/docs/filters/graphviz/entries.rb

@@ -0,0 +1,28 @@
+module Docs
+  class Graphviz
+    class EntriesFilter < Docs::EntriesFilter
+
+      def get_name
+        name = at_css('h1').content.strip
+      end
+
+      def get_type
+        breadcrumbs = css('nav ol.breadcrumb li.breadcrumb-item')
+        category = breadcrumbs[1]&.content&.strip
+
+        # These categories have several sub-pages.
+        return category if [
+          'Attribute Types',
+          'Attributes',
+          'Command Line',
+          'Layout Engines',
+          'Output Formats',
+        ].include?(category)
+
+        # Several categories have only one page each. Let's group them together.
+        return 'Documentation'
+      end
+
+    end
+  end
+end

+ 55 - 0
lib/docs/scrapers/graphviz.rb

@@ -0,0 +1,55 @@
+module Docs
+  class Graphviz < UrlScraper
+    self.name = 'Graphviz'
+    self.slug = 'graphviz'
+    self.type = 'simple'
+
+    self.links = {
+      home: 'https://www.graphviz.org/',
+      code: 'https://gitlab.com/graphviz/graphviz'
+    }
+
+    options[:container] = 'main'
+
+    # These images are too large:
+    # 980KB https://www.graphviz.org/doc/info/plugins.png
+    # 650KB https://www.graphviz.org/Gallery/twopi/twopi2.svg
+    # All other files are under 100KB
+    options[:max_image_size] = 100_000
+
+    # TODO: the UrlScraper is very unreliable on this website.
+    # I often get several errors:
+    # - SSL connect error
+    # - Failure when receiving data from the peer
+    # - was slow to process (30s)
+    # Setting a :rate_limit doesn't help.
+    # We have to figure out a more reliable solution.
+    #options[:rate_limit] = 100
+
+    options[:attribution] = <<-HTML
+      &copy; 2025 The Graphviz Authors<br>
+      Licensed under the Eclipse Public License 1.0.
+    HTML
+
+    html_filters.push 'graphviz/entries', 'graphviz/clean_html'
+
+    self.release = '14.01'
+    self.base_url = 'https://www.graphviz.org/'
+    self.root_path = 'documentation/'
+    options[:only_patterns] = [
+      /^documentation\//,
+      /^doc\//,
+      /^docs\//,
+    ]
+    options[:replace_paths] = {
+      # Redirections:
+      'docs/outputs/cmap/' => 'docs/outputs/imap/',
+      'doc/info/output.html' => 'docs/outputs/',
+    }
+
+    def get_latest_version(opts)
+      tags = get_gitlab_tags('gitlab.com', 'graphviz', 'graphviz', opts)
+      tags[0]['name']
+    end
+  end
+end

BIN
public/icons/docs/graphviz/16.png


BIN
public/icons/docs/graphviz/16@2x.png


+ 1 - 0
public/icons/docs/graphviz/SOURCE

@@ -0,0 +1 @@
+https://gitlab.com/graphviz/graphviz.gitlab.io/-/blob/main/static/Resources/favicon.png