Browse Source

feat(pytorch): Add PyTorch 2.8 and 2.9 documentation support

This commit updates the PyTorch scraper for documentation versions 2.8
and 2.9, addressing changes in the theme and HTML structure.

Key changes:
- Correctly identifies the main content area in the newer docs.
- Supports the new breadcrumb navigation structure.
- Restores truncated entry names in newer docs using the full page title,
to keep naming conventions consistent.
Wei Zhang 2 weeks ago
parent
commit
190ffb57b2

+ 5 - 3
lib/docs/filters/pytorch/clean_html.rb

@@ -2,9 +2,11 @@ module Docs
   class Pytorch
     class CleanHtmlFilter < Filter
       def call
-        @doc = at_css('.pytorch-article')
-        # Show katex-mathml nodes and remove katex-html nodes
-        css('.katex-html').remove
+        if root = at_css('#pytorch-article')
+          @doc = root
+          # Show katex-mathml nodes and remove katex-html nodes
+          css('.katex-html').remove
+        end
         doc
       end
     end

+ 26 - 4
lib/docs/filters/pytorch/entries.rb

@@ -2,9 +2,23 @@ module Docs
   class Pytorch
     class EntriesFilter < Docs::EntriesFilter
       def get_breadcrumbs
-        css('.pytorch-breadcrumbs > li').map {
-          |node| node.content.delete_suffix(' >').strip
-        }.reject { |item| item.nil? || item.empty? }
+        breadcrumbs = if at_css('.pytorch-breadcrumbs')
+          css('.pytorch-breadcrumbs > li').map { |node|
+            node.content.delete_suffix(' >').strip
+          }
+        else
+          css('.bd-breadcrumbs > li').map { |node|
+            text = node.content.strip
+            text.empty? && node.at_css('.fa-home') ? 'Docs' : text
+          }
+        end.reject { |item| item.nil? || item.empty? }
+
+        if breadcrumbs.last&.end_with?('.')
+          resolved_name = at_css('h1').content.delete_suffix('#').strip
+          breadcrumbs[-1] = resolved_name
+        end
+
+        breadcrumbs
       end
 
       def get_name
@@ -12,7 +26,15 @@ module Docs
       end
 
       def get_type
-        get_breadcrumbs[1]
+        if at_css('.pytorch-breadcrumbs')
+          get_breadcrumbs[1]
+        else
+          get_breadcrumbs.size > 2 ? get_breadcrumbs[2] : get_breadcrumbs[1]
+        end
+      end
+
+      def include_default_entry?
+        !get_breadcrumbs.nil? && get_breadcrumbs.size >= 2
       end
 
       def additional_entries

+ 10 - 0
lib/docs/scrapers/pytorch.rb

@@ -19,6 +19,16 @@ module Docs
     PyTorch has a BSD-style license, as found in the <a href="https://github.com/pytorch/pytorch/blob/main/LICENSE">LICENSE</a> file.
     HTML
 
+    version '2.9' do
+      self.release = '2.9'
+      self.base_url = "https://docs.pytorch.org/docs/#{release}/"
+    end
+
+    version '2.8' do
+      self.release = '2.8'
+      self.base_url = "https://docs.pytorch.org/docs/#{release}/"
+    end
+
     version '2.7' do
       self.release = '2.7'
       self.base_url = "https://docs.pytorch.org/docs/#{release}/"