ソースを参照

Update pytorch scraper, include various 2.x versions

1. Various 2.x versions are included separately. PyTorch versions are
not backward compatible, and each has different compatibility with CUDA
etc., so people may stay on a specific version for an extended period of
time.

2. Removed the type replacement table for `get_type`. Instead, get the
type from the breadcrumbs directly. IMO this produces better results that
match the index on the original website (the left side menu in
docs.pytorch.org). Also, the `TYPE_REPLACEMENTS` table was opinionated and
hard to maintain across versions.

3. Always include the default entry (removed the `include_default_entry?`
function). I don't see a downside to this. Previously some pages were
missing because of this (e.g. torchrun https://docs.pytorch.org/docs/1.13/elastic/run.html).
Yikai Zhao 6 ヶ月 前
コミット
f4e2ebd23b
2 ファイル変更43 行追加83 行削除
  1. 2 78
      lib/docs/filters/pytorch/entries.rb
  2. 41 5
      lib/docs/scrapers/pytorch.rb

+ 2 - 78
lib/docs/filters/pytorch/entries.rb

@@ -1,73 +1,6 @@
 module Docs
   class Pytorch
     class EntriesFilter < Docs::EntriesFilter
-      TYPE_REPLACEMENTS = {
-        "torch.Tensor" => "Tensor",
-        "torch.nn" => "Neuro Network",
-        "Probability distributions - torch.distributions" => "Probability Distributions",
-        "torch" => "Torch",
-        "Quantization" => "Quantization",
-        "torch.optim" => "Optimization",
-        "torch.Storage" => "Storage",
-        "torch.nn.functional" => "NN Functions",
-        "torch.cuda" => "CUDA",
-        "Torch Distributed Elastic" => "Distributed Elastic",
-        "torch.fx" => "FX",
-        "TorchScript" => "Torch Script",
-        "torch.onnx" => "ONNX",
-        "Distributed communication package - torch.distributed" => "Distributed Communication",
-        "Automatic differentiation package - torch.autograd" => "Automatic Differentiation",
-        "torch.linalg" => "Linear Algebra",
-        "Distributed Checkpoint - torch.distributed.checkpoint" => "Distributed Checkpoint",
-        "Distributed RPC Framework" => "Distributed RPC",
-        "torch.special" => "SciPy-like Special",
-        "torch.package" => "Package",
-        "torch.backends" => "Backends",
-        "FullyShardedDataParallel" => "Fully Sharded Data Parallel",
-        "torch.sparse" => "Sparse Tensors",
-        "torch.export" => "Traced Graph Export",
-        "torch.fft" => "Discrete Fourier Transforms",
-        "torch.utils.data" => "Datasets and Data Loaders",
-        "torch.monitor" => "Monitor",
-        "Automatic Mixed Precision package - torch.amp" => "Automatic Mixed Precision",
-        "torch.utils.tensorboard" => "Tensorboard",
-        "torch.profiler" => "Profiler",
-        "torch.mps" => "MPS",
-        "DDP Communication Hooks" => "DDP Communication Hooks",
-        "Benchmark Utils - torch.utils.benchmark" => "Benchmark Utils",
-        "torch.nn.init" => "Parameter Initializations",
-        "Tensor Parallelism - torch.distributed.tensor.parallel" => "Tensor Parallelism",
-        "torch.func" => "JAX-like Function Transforms",
-        "Distributed Optimizers" => "Distributed Optimizers",
-        "torch.signal" => "SciPy-like Signal",
-        "torch.futures" => "Miscellaneous",
-        "torch.utils.cpp_extension" => "Miscellaneous",
-        "torch.overrides" => "Miscellaneous",
-        "Generic Join Context Manager" => "Miscellaneous",
-        "torch.hub" => "Miscellaneous",
-        "torch.cpu" => "Miscellaneous",
-        "torch.random" => "Miscellaneous",
-        "torch.compiler" => "Miscellaneous",
-        "Pipeline Parallelism" => "Miscellaneous",
-        "Named Tensors" => "Miscellaneous",
-        "Multiprocessing package - torch.multiprocessing" => "Miscellaneous",
-        "torch.utils" => "Miscellaneous",
-        "torch.library" => "Miscellaneous",
-        "Tensor Attributes" => "Miscellaneous",
-        "torch.testing" => "Miscellaneous",
-        "torch.nested" => "Miscellaneous",
-        "Understanding CUDA Memory Usage" => "Miscellaneous",
-        "torch.utils.dlpack" => "Miscellaneous",
-        "torch.utils.checkpoint" => "Miscellaneous",
-        "torch.__config__" => "Miscellaneous",
-        "Type Info" => "Miscellaneous",
-        "torch.utils.model_zoo" => "Miscellaneous",
-        "torch.utils.mobile_optimizer" => "Miscellaneous",
-        "torch._logging" => "Miscellaneous",
-        "torch.masked" => "Miscellaneous",
-        "torch.utils.bottleneck" => "Miscellaneous"
-      }
-
       def get_breadcrumbs
         css('.pytorch-breadcrumbs > li').map {
           |node| node.content.delete_suffix(' >').strip
@@ -75,18 +8,11 @@ module Docs
       end
 
       def get_name
-        b = get_breadcrumbs
-        b[(b[1] == 'torch' ? 2 : 1)..].join('.')
+        get_breadcrumbs[-1]
       end
 
       def get_type
-        t = get_breadcrumbs[1]
-        TYPE_REPLACEMENTS.fetch(t, t)
-      end
-
-      def include_default_entry?
-        # Only include API entries to simplify and unify the list
-        return name.start_with?('torch.')
+        get_breadcrumbs[1]
       end
 
       def additional_entries
@@ -108,8 +34,6 @@ module Docs
             entries << [id + '()', id]
           when 'py class', 'py attribute', 'py property'
             entries << [id, id]
-          when 'footnote brackets', 'field-list simple'
-            next
           end
         end
 

+ 41 - 5
lib/docs/scrapers/pytorch.rb

@@ -12,20 +12,56 @@ module Docs
 
     options[:skip] = ['cpp_index.html', 'deploy.html', 'packages.html', 'py-modindex.html', 'genindex.html']
     options[:skip_patterns] = [/\Acommunity/, /\A_modules/, /\Anotes/, /\Aorg\/pytorch\//]
+    options[:max_image_size] = 1_000_000
 
     options[:attribution] = <<-HTML
-    &copy; 2024, PyTorch Contributors<br>
+    &copy; 2025, PyTorch Contributors<br>
     PyTorch has a BSD-style license, as found in the <a href="https://github.com/pytorch/pytorch/blob/main/LICENSE">LICENSE</a> file.
     HTML
 
-    version '2' do
+    version '2.7' do
+      self.release = '2.7'
+      self.base_url = "https://docs.pytorch.org/docs/#{release}/"
+    end
+
+    version '2.6' do
+      self.release = '2.6'
+      self.base_url = "https://docs.pytorch.org/docs/#{release}/"
+    end
+
+    version '2.5' do
+      self.release = '2.5'
+      self.base_url = "https://docs.pytorch.org/docs/#{release}/"
+    end
+
+    version '2.4' do
+      self.release = '2.4'
+      self.base_url = "https://docs.pytorch.org/docs/#{release}/"
+    end
+
+    version '2.3' do
+      self.release = '2.3'
+      self.base_url = "https://docs.pytorch.org/docs/#{release}/"
+    end
+
+    version '2.2' do
+      self.release = '2.2'
+      self.base_url = "https://docs.pytorch.org/docs/#{release}/"
+    end
+
+    version '2.1' do
       self.release = '2.1'
-      self.base_url = "https://pytorch.org/docs/#{release}/"
+      self.base_url = "https://docs.pytorch.org/docs/#{release}/"
+    end
+
+    version '2.0' do
+      self.release = '2.0'
+      self.base_url = "https://docs.pytorch.org/docs/#{release}/"
     end
 
-    version '1' do
+    version '1.13' do
       self.release = '1.13'
-      self.base_url = "https://pytorch.org/docs/#{release}/"
+      self.base_url = "https://docs.pytorch.org/docs/#{release}/"
     end
 
     def get_latest_version(opts)