Browse Source

Add POC scraper/cleaner for Angular v17+ docs

Add version numbers; add redirect handler.

Add Angular v20.

Move HTML filters out of base Angular config.

Add CLI section.
spamguy 7 months ago
parent
commit
72564a8f9e

+ 54 - 0
lib/docs/filters/angular/clean_html_v18.rb

@@ -0,0 +1,54 @@
+module Docs
+  class Angular
+    class CleanHtmlV18Filter < Filter
+      def call
+        @doc = at_css('.docs-viewer') if at_css('.docs-viewer')
+
+        # Extract <h1> from decorative header.
+        @doc.prepend_child(at_css('h1'))
+        css('h1[class]').remove_attr('class')
+
+        css(
+          '.docs-breadcrumb',
+          '.docs-github-links',
+          'docs-table-of-contents',
+          '.docs-reference-category',
+          '.docs-reference-title',
+          '#jump-msg'
+        ).remove
+
+        # Strip anchor links from headers.
+        css('h2', 'h3', 'h4').each do |node|
+          node.content = node.inner_text
+        end
+
+        # Make every <code> block a <pre>.
+        css('.docs-code > pre > code').each do |code|
+          code.name = 'pre'
+          code['data-language'] = 'ts'
+          code.content = code.css('.line').map(&:content).join("\n")
+          code.parent.parent.replace(code)
+        end
+
+        # Better format content in CLI reference.
+        css('.docs-ref-content').each do |ref|
+          option = ref.at_css('.docs-reference-option code')
+          option.name = 'h3'
+          option.parent.replace(option)
+        end
+
+        css('.docs-reference-type-and-default', '.docs-reference-option-aliases').each do |node|
+          labels = node.css('span')
+          values = node.css('code')
+          labels.each do |l|
+            l.name = 'h4'
+          end
+        end
+
+        css('footer').remove
+
+        doc
+      end
+    end
+  end
+end

+ 2 - 0
lib/docs/filters/angular/entries.rb

@@ -10,6 +10,8 @@ module Docs
       def get_type
         if slug.start_with?('guide')
           'Guide'
+        elsif slug.start_with?('cli')
+          'CLI'
         elsif slug.start_with?('tutorial')
           'Tutorial'
         elsif slug.start_with?('api/platform-browser-dynamic')

+ 92 - 19
lib/docs/scrapers/angular.rb

@@ -4,32 +4,30 @@ module Docs
   class Angular < UrlScraper
     self.type = 'angular'
     self.links = {
-      home: 'https://angular.io/',
+      home: 'https://angular.dev/',
       code: 'https://github.com/angular/angular'
     }
     self.base_url = 'https://angular.io/'
     self.root_path = 'docs'
 
-    html_filters.push 'angular/clean_html', 'angular/entries'
-
     options[:max_image_size] = 256_000
 
     options[:attribution] = <<-HTML
-      &copy; 2010&ndash;2023 Google, Inc.<br>
-      Licensed under the Creative Commons Attribution License 4.0.
+      Super-powered by Google &copy;2010&ndash;2025.<br />
+      Code licensed under an MIT-style License. Documentation licensed under CC BY 4.0.
     HTML
 
     options[:follow_links] = false
-    options[:only_patterns] = [/\Aguide/, /\Atutorial/, /\Aapi/]
+    options[:only_patterns] = [/\Aguide/, /\Aapi/, /\Acli/]
     options[:fix_urls_before_parse] = ->(url) do
       url.sub! %r{\Aguide/}, '/guide/'
-      url.sub! %r{\Atutorial/}, '/tutorial/'
       url.sub! %r{\Aapi/}, '/api/'
+      url.sub! %r{\cli/}, '/cli/'
       url.sub! %r{\Agenerated/}, '/generated/'
       url
     end
 
-    module Common
+    module JsonNavigation
       private
 
       def initial_urls
@@ -79,91 +77,166 @@ module Docs
         path = path.gsub(/[A-Z_]/) {|s| s.downcase + '_'}
         super
       end
-      include Docs::Angular::Common
+      include Docs::Angular::JsonNavigation
+    end
+
+    module Since18
+      def self.handle_redirects(version)
+        lambda do |url|
+          url.sub! '/guide/templates/reference-variables', '/guide/templates/variables#template-reference-variables'
+          url.sub! '/guide/signals/inputs', '/guide/components/inputs'
+          url.sub! '/guide/defer', '/guide/templates/defer'
+          url.sub! '/guide/templates/class-binding', '/guide/templates/binding#css-class-and-style-property-bindings'
+          url.sub! %r{/guide/components$}, '/guide/components/anatomy-of-components'
+          url.sub! '/guide/templates/property-binding', '/guide/templates/binding#binding-dynamic-properties-and-attributes'
+          url.sub! %r{/guide/ngmodules$}, '/guide/ngmodules/overview'
+          url.sub! '/guide/components/importing', '/guide/components/anatomy-of-components#using-components'
+
+          url.sub! '/guide/components/anatomy-of-components', '/guide/components' if version == '20'
+
+          url
+        end
+      end
+    end
+
+    version '20' do # Scraped from angular.dev and cleaned with the clean_html_v18 filter.
+      self.release = '20.3.4'
+      self.base_url = 'https://angular.dev/'
+      self.root_path = 'overview'
+
+      options[:follow_links] = true
+      options[:container] = '.docs-app-main-content'
+      options[:fix_urls] = Since18.handle_redirects(self.version) # handle_redirects adds one extra rewrite when the version is '20'
+
+      html_filters.push 'angular/entries', 'angular/clean_html_v18'
+
+      include Docs::Angular::Since18 # NOTE(review): Since18 as shown only defines self.handle_redirects, so this include (also in the 19 and 18 blocks) looks like a no-op — confirm
+    end
+
+    version '19' do # Same setup as version '20', scraped from v19.angular.dev.
+      self.release = '19.2.15'
+      self.base_url = 'https://v19.angular.dev/'
+      self.root_path = 'overview'
+
+      options[:follow_links] = true
+      options[:container] = '.docs-app-main-content'
+      options[:fix_urls] = Since18.handle_redirects(self.version)
+
+      html_filters.push 'angular/entries', 'angular/clean_html_v18'
+
+      include Docs::Angular::Since18
+    end
+
+    version '18' do # Same setup as version '20', scraped from v18.angular.dev.
+      self.release = '18.2.14'
+      self.base_url = 'https://v18.angular.dev/'
+      self.root_path = 'overview'
+
+      options[:follow_links] = true
+      options[:container] = '.docs-app-main-content'
+      options[:fix_urls] = Since18.handle_redirects(self.version)
+
+      html_filters.push 'angular/entries', 'angular/clean_html_v18'
+
+      include Docs::Angular::Since18
     end
 
-    version do
+    version '17' do
       self.release = '17.0.8'
-      self.base_url = 'https://angular.io/'
+      self.base_url = 'https://v17.angular.io/'
+      html_filters.push 'angular/clean_html', 'angular/entries'
       include Docs::Angular::Since12
     end
 
     version '16' do
       self.release = '16.2.12'
       self.base_url = 'https://v16.angular.io/'
+      html_filters.push 'angular/clean_html', 'angular/entries'
       include Docs::Angular::Since12
     end
 
     version '15' do
       self.release = '15.2.9'
       self.base_url = 'https://v15.angular.io/'
+      html_filters.push 'angular/clean_html', 'angular/entries'
       include Docs::Angular::Since12
     end
 
     version '14' do
       self.release = '14.2.12'
       self.base_url = 'https://v14.angular.io/'
+      html_filters.push 'angular/clean_html', 'angular/entries'
       include Docs::Angular::Since12
     end
 
     version '13' do
       self.release = '13.3.8'
       self.base_url = 'https://v13.angular.io/'
+      html_filters.push 'angular/clean_html', 'angular/entries'
       include Docs::Angular::Since12
     end
 
     version '12' do
       self.release = '12.2.13'
       self.base_url = 'https://v12.angular.io/'
+      html_filters.push 'angular/clean_html', 'angular/entries'
       include Docs::Angular::Since12
     end
 
     version '11' do
       self.release = '11.2.14'
       self.base_url = 'https://v11.angular.io/'
-      include Docs::Angular::Common
+      html_filters.push 'angular/clean_html', 'angular/entries'
+      include Docs::Angular::JsonNavigation
     end
 
     version '10' do
       self.release = '10.2.3'
       self.base_url = 'https://v10.angular.io/'
-      include Docs::Angular::Common
+      html_filters.push 'angular/clean_html', 'angular/entries'
+      include Docs::Angular::JsonNavigation
     end
 
     version '9' do
       self.release = '9.1.12'
       self.base_url = 'https://v9.angular.io/'
-      include Docs::Angular::Common
+      html_filters.push 'angular/clean_html', 'angular/entries'
+      include Docs::Angular::JsonNavigation
     end
 
     version '8' do
       self.release = '8.2.14'
       self.base_url = 'https://v8.angular.io/'
-      include Docs::Angular::Common
+      html_filters.push 'angular/clean_html', 'angular/entries'
+      include Docs::Angular::JsonNavigation
     end
 
     version '7' do
       self.release = '7.2.15'
       self.base_url = 'https://v7.angular.io/'
-      include Docs::Angular::Common
+      html_filters.push 'angular/clean_html', 'angular/entries'
+      include Docs::Angular::JsonNavigation
     end
 
     version '6' do
       self.release = '6.1.10'
       self.base_url = 'https://v6.angular.io/'
-      include Docs::Angular::Common
+      html_filters.push 'angular/clean_html', 'angular/entries'
+      include Docs::Angular::JsonNavigation
     end
 
     version '5' do
       self.release = '5.2.11'
       self.base_url = 'https://v5.angular.io/'
-      include Docs::Angular::Common
+      html_filters.push 'angular/clean_html', 'angular/entries'
+      include Docs::Angular::JsonNavigation
     end
 
     version '4' do
       self.release = '4.4.6'
       self.base_url = 'https://v4.angular.io/'
-      include Docs::Angular::Common
+      html_filters.push 'angular/clean_html', 'angular/entries'
+      include Docs::Angular::JsonNavigation
     end
 
     version '2' do