Browse Source

duckdb docs (v1.1) - scrape v1

Scott Goley 1 year ago
parent
commit
eaec6ec43f

+ 41 - 0
lib/docs/filters/duckdb/clean_html.rb

@@ -0,0 +1,41 @@
+module Docs
+  class Duckdb
+    class CleanHtmlFilter < Filter
+      def call
+        # First extract the main content
+        @doc = at_css('main')
+        return doc if @doc.nil?
+
+        # Remove navigation and header elements
+        css('.headerline', '.landingmenu', '.search_icon', '#sidebar', '.pagemeta', '.toc_menu', '.section-nav').remove
+
+        # Clean up code blocks
+        css('pre').each do |node|
+          # Detect language from class or parent div
+          if node['class']&.include?('sql') || node.at_css('code.sql')
+            node['data-language'] = 'sql'
+          elsif node['class']&.include?('language-sql')
+            node['data-language'] = 'sql'
+          end
+          node.content = node.content.strip
+        end
+
+        # Remove unnecessary attributes but keep essential ones
+        css('div, span, p').each do |node|
+          node.remove_attribute('style')
+          node.remove_attribute('class') unless node['class'] =~ /highlight/
+        end
+
+        # Remove empty elements
+        css('div, span').each do |node|
+          node.remove if node.content.strip.empty?
+        end
+
+        # Remove script tags
+        css('script').remove
+
+        doc
+      end
+    end
+  end
+end

+ 45 - 0
lib/docs/filters/duckdb/entries.rb

@@ -0,0 +1,45 @@
+module Docs
+  class Duckdb
+    class EntriesFilter < Docs::EntriesFilter
+      def get_name
+        at_css('h1')&.content || 'DuckDB'
+      end
+
+      def get_type
+        case subpath
+        when /\Asql\//
+          'SQL Reference'
+        when /\Aapi\//
+          'Client APIs'
+        when /\Aguides\//
+          'How-to Guides'
+        when /\Adata\//
+          'Data Import'
+        when /\Aoperations_manual\//
+          'Operations Manual'
+        when /\Adev\//
+          'Development'
+        when /\Ainternals\//
+          'Internals'
+        when /\Aextensions\//
+          'Extensions'
+        when /\Aarchive\//
+          'Archive'
+        else
+          'Documentation'
+        end
+      end
+
+      def additional_entries
+        entries = []
+        css('h2[id]', 'h3[id]').each do |node|
+          name = node.content.strip
+          # Clean up the name
+          name = name.gsub(/[\r\n\t]/, ' ').squeeze(' ')
+          entries << [name, node['id'], get_type]
+        end
+        entries
+      end
+    end
+  end
+end

+ 69 - 0
lib/docs/scrapers/duckdb.rb

@@ -0,0 +1,69 @@
+module Docs
+  class Duckdb < UrlScraper
+    self.name = 'DuckDB'
+    self.type = 'duckdb'
+    self.root_path = 'index.html'
+    self.links = {
+      home: 'https://duckdb.org/',
+      code: 'https://github.com/duckdb/duckdb'
+    }
+
+    html_filters.push 'duckdb/entries', 'duckdb/clean_html'
+
+    options[:container] = '.documentation'
+    
+    options[:skip_patterns] = [
+      /installation/,
+      /archive/,
+      /reference/,
+    ]
+
+    options[:skip] = %w(
+      docs/archive/
+      docs/installation/
+      docs/api/
+    )
+
+    options[:attribution] = <<-HTML
+      &copy; Copyright 2018&ndash;2024 Stichting DuckDB Foundation<br>
+      Licensed under the MIT License.
+    HTML
+
+    version '1.1' do
+      self.release = '1.1.x'
+      self.base_url = 'http://localhost:8000/docs/'
+    end
+
+    # version '1.0' do
+    #     self.release = '1.0.x'
+    #     self.base_url = "https://duckdb.org/docs/archive/#{self.version}/"
+
+    #     html_filters.push 'duckdb/clean_html'
+    # end
+
+    # version '0.9' do
+    #     self.release = '0.9.x'
+    #     self.base_url = "https://duckdb.org/docs/archive/#{self.version}/"
+
+    #     html_filters.push 'duckdb/clean_html'
+    # end
+
+    # version '0.8' do
+    #     self.release = '0.8.x'
+    #     self.base_url = "https://duckdb.org/docs/archive/#{self.version}/"
+
+    #     html_filters.push 'duckdb/clean_html'
+    # end
+
+    # version '0.7' do
+    #     self.release = '0.7.x'
+    #     self.base_url = "https://duckdb.org/docs/archive/#{self.version}/"
+
+    #     html_filters.push 'duckdb/clean_html'
+    # end
+
+    def get_latest_version(opts)
+      get_github_tags('duckdb', 'duckdb', opts)
+    end
+  end
+end