Browse Source

mariadb: update scraper and filters to work with the official website

Jasper van Merle 6 years ago
parent
commit
93582d3b82

+ 1 - 1
assets/javascripts/templates/pages/about_tmpl.coffee

@@ -439,7 +439,7 @@ credits = [
     'http://www.gnu.org/copyleft/fdl.html'
     'http://www.gnu.org/copyleft/fdl.html'
   ], [
   ], [
     'MariaDB',
     'MariaDB',
-    '2018 MariaDB',
+    '2019 MariaDB',
     'CC BY-SA & GFDL',
     'CC BY-SA & GFDL',
     'https://mariadb.com/kb/en/library/documentation/+license/'
     'https://mariadb.com/kb/en/library/documentation/+license/'
   ], [
   ], [

+ 1 - 1
assets/stylesheets/pages/_mariadb.scss

@@ -1,7 +1,7 @@
 ._mariadb {
 ._mariadb {
   @extend %simple;
   @extend %simple;
 
 
-  .graybox {
+  .graybox, .product {
     @extend %note;
     @extend %note;
   }
   }
 }
 }

+ 16 - 25
lib/docs/filters/mariadb/clean_html.rb

@@ -1,11 +1,10 @@
-require 'net/http'
-
 module Docs
 module Docs
   class Mariadb
   class Mariadb
     class CleanHtmlFilter < Filter
     class CleanHtmlFilter < Filter
-      @@known_urls = Hash.new
-
       def call
       def call
+        # Return the empty doc if the EraseInvalidPagesFilter detected this page shouldn't be scraped
+        return doc if doc.inner_html == ''
+
         # Extract main content
         # Extract main content
         @doc = at_css('#content')
         @doc = at_css('#content')
 
 
@@ -21,19 +20,6 @@ module Docs
           node['data-language'] = 'sql'
           node['data-language'] = 'sql'
         end
         end
 
 
-        # Fix links like http://kb-mirror.mariadb.com/kb/en/bitwise-or/ to not redirect to an external page
-        css('a').each do |node|
-          url = node['href']
-
-          if /^http:\/\/kb-mirror\.mariadb\.com\/kb\/en\/[^\/]+\/(#[^\/]+)?$/.match?(url)
-            final_url = get_final_url(url)
-
-            if !final_url.nil? && final_url.start_with?('/kb/en/library/documentation/')
-              node['href'] = "#{'../' * subpath.count('/')}#{final_url[29..-1]}index"
-            end
-          end
-        end
-
         # Fix images
         # Fix images
         css('img').each do |node|
         css('img').each do |node|
           node['src'] = node['src'].sub('http:', 'https:')
           node['src'] = node['src'].sub('http:', 'https:')
@@ -46,11 +32,11 @@ module Docs
           end
           end
         end
         end
 
 
-        # Convert listings (pages like http://kb-mirror.mariadb.com/kb/en/library/documentation/sql-statements-structure/) into tables
+        # Convert listings (pages like https://mariadb.com/kb/en/library/documentation/sql-statements-structure/) into tables
         css('ul.listing').each do |node|
         css('ul.listing').each do |node|
           rows = []
           rows = []
 
 
-          node.css('li').each do |li|
+          node.css('li:not(.no_data)').each do |li|
             name = li.at_css('.media-heading').content
             name = li.at_css('.media-heading').content
             description = li.at_css('.blurb').content
             description = li.at_css('.blurb').content
             url = li.at_css('a')['href']
             url = li.at_css('a')['href']
@@ -61,15 +47,20 @@ module Docs
           node.replace(table)
           node.replace(table)
         end
         end
 
 
-        doc
-      end
+        # Turn note titles into <strong> tags
+        css('.product_title').each do |node|
+          node.name = 'strong'
+        end
 
 
-      def get_final_url(url)
-        unless @@known_urls.has_key?(url)
-          @@known_urls[url] = Net::HTTP.get_response(URI(url))['location']
+        # Remove comments and questions
+        css('.related_questions, #comments').remove
+        css('h2').each do |node|
+          if node.content == 'Comments'
+            node.remove
+          end
         end
         end
 
 
-        @@known_urls[url]
+        doc
       end
       end
     end
     end
   end
   end

+ 13 - 3
lib/docs/filters/mariadb/entries.rb

@@ -2,12 +2,22 @@ module Docs
   class Mariadb
   class Mariadb
     class EntriesFilter < Docs::EntriesFilter
     class EntriesFilter < Docs::EntriesFilter
       def get_name
       def get_name
-        at_css('.container > h1').content.strip
+        return 'Name' if doc.inner_html == ''
+
+        at_css('#content > h1').content.strip
       end
       end
 
 
       def get_type
       def get_type
-        link = at_css('#breadcrumbs > a:nth-child(6)')
-        link.nil? ? at_css('#breadcrumbs > a:nth-child(5)').content : link.content
+        return 'Type' if doc.inner_html == ''
+
+        link = at_css('#breadcrumbs > a:nth-child(4)')
+        link.nil? ? at_css('#breadcrumbs > a:nth-child(3)').content : link.content
+      end
+
+      def entries
+        # Don't add an entry for this page if the EraseInvalidPagesFilter detected this page shouldn't be scraped
+        return [] if doc.inner_html == ''
+        super
       end
       end
     end
     end
   end
   end

+ 34 - 0
lib/docs/filters/mariadb/erase_invalid_pages.rb

@@ -0,0 +1,34 @@
+module Docs
+  class Mariadb
+    class EraseInvalidPagesFilter < Filter
+      @@seen_urls = Hash.new
+
+      def call
+        # The MariaDB documentation uses urls like mariadb.com/kb/en/*
+        # This means there is no way to detect if a page should be scraped based on its url
+        # We run this filter before the internal_urls filter scrapes all internal urls
+        # If this page should not be scraped, we erase its contents in here so that the internal urls are not picked up
+        # The entries filter will make sure that no entry is saved for this page
+
+        if at_css('a.crumb[href="https://mariadb.com/kb/en/documentation/"]').nil?
+          doc.inner_html = ''
+        end
+
+        current_page = at_css('a.crumb.node_link')
+        unless current_page.nil?
+          url = current_page['href']
+
+          # Some links lead to the same page
+          # Only parse the page one time
+          if @@seen_urls.has_key?(url)
+            doc.inner_html = ''
+          end
+
+          @@seen_urls[url] = true
+        end
+
+        doc
+      end
+    end
+  end
+end

+ 19 - 6
lib/docs/scrapers/mariadb.rb

@@ -2,21 +2,34 @@ module Docs
   class Mariadb < UrlScraper
   class Mariadb < UrlScraper
     self.name = 'MariaDB'
     self.name = 'MariaDB'
     self.type = 'mariadb'
     self.type = 'mariadb'
-    self.release = '10.3.8'
-    self.base_url = 'http://kb-mirror.mariadb.com/kb/en/library/documentation/'
+    self.release = '10.4.7'
+    self.base_url = 'https://mariadb.com/kb/en/'
+    self.root_path = 'library/documentation/'
     self.links = {
     self.links = {
       home: 'https://mariadb.com/',
       home: 'https://mariadb.com/',
       code: 'https://github.com/MariaDB/server'
       code: 'https://github.com/MariaDB/server'
     }
     }
 
 
-    html_filters.push 'mariadb/entries', 'mariadb/clean_html', 'title'
+    html_filters.insert_before 'internal_urls', 'mariadb/erase_invalid_pages'
+    html_filters.push 'mariadb/entries', 'mariadb/clean_html'
 
 
-    options[:download_images] = false
-    options[:root_title] = 'MariaDB'
+    options[:skip_patterns] = [
+      /\+/,
+      /\/ask\//,
+      /-release-notes\//,
+      /-changelog\//,
+      /^documentation\//,
+      /^mariadb-server-documentation\//,
+    ]
 
 
     options[:attribution] = <<-HTML
     options[:attribution] = <<-HTML
-      &copy; 2018 MariaDB<br>
+      &copy; 2019 MariaDB<br>
       Licensed under the Creative Commons Attribution 3.0 Unported License and the GNU Free Documentation License.
       Licensed under the Creative Commons Attribution 3.0 Unported License and the GNU Free Documentation License.
     HTML
     HTML
+
+    def get_latest_version(opts)
+      doc = fetch_doc('https://mariadb.com/downloads/', opts)
+      doc.at_css('[data-version-id="mariadb_server-versions"] option').content.split('-')[0]
+    end
   end
   end
 end
 end