瀏覽代碼

Merge pull request #2512 from spamguy/github-scraper

Fix GitHub scraper
Simon Legner 6 月之前
父節點
當前提交
4676f84b6f

+ 5 - 0
lib/docs/filters/github/clean_html.rb

@@ -2,6 +2,11 @@ module Docs
   class Github
     class CleanHtmlFilter < Filter
       def call
+        # Remove h1 wrapper to render it correctly.
+        css('.markdown-heading h1').each do |node|
+          node.parent.replace(node)
+        end
+
         css('.anchor').each do |node|
           node.parent['id'] = node['href'].remove('#')
           node.remove

+ 2 - 2
lib/docs/filters/nginx_lua_module/entries.rb

@@ -4,11 +4,11 @@ module Docs
       def additional_entries
         entries = []
 
-        css('#directives + ul > li > a').each do |node|
+        css('h2:contains("Directives") + ul > li > a').each do |node|
           entries << [node.content, node['href'].remove('#'), 'Directives']
         end
 
-        css('#nginx-api-for-lua + ul > li > a').each do |node|
+        css('h2:contains("Nginx API for Lua") + ul > li > a').each do |node|
           next if node.content == 'Introduction'
           entries << [node.content, node['href'].remove('#'), 'Nginx API for Lua']
         end

+ 3 - 1
lib/docs/filters/sanctuary_def/entries.rb

@@ -39,8 +39,10 @@ module Docs
           when "h3"
             type = node.text
           when "h4"
+            # Parent <div>'s ID set in github/clean_html.
+            id = node.parent.attributes["id"].value
             name = node.text.split(' :: ')[0]
-            id = node.attributes["id"].value
+
             entries << [name, id, type]
           end
         end

+ 0 - 5
lib/docs/filters/sanctuary_type_classes/clean_html.rb

@@ -8,11 +8,6 @@ module Docs
           node.name = 'h3'
         }
 
-        # correct and unify link ids
-        css('h3').each { |node|
-          node.attributes["id"].value = node.text.split(' :: ')[0]
-        }
-
         doc
       end
     end

+ 2 - 2
lib/docs/filters/sanctuary_type_classes/entries.rb

@@ -38,9 +38,9 @@ module Docs
           case node.name
           when "h2"
             type = node.text
-            if node.attributes["id"].value == "type-class-hierarchy"
+            if node.parent.attributes["id"]&.value == "type-class-hierarchy"
               name = node.text
-              id = node.attributes["id"].value
+              id = node.parent.attributes["id"].value
               entries << [name, id, type]
             end
           when "h4"

+ 7 - 1
lib/docs/scrapers/github.rb

@@ -16,7 +16,13 @@ module Docs
     end
 
     def parse(response)
-      parsed = JSON.parse(response.response_body)
+      embedded_json = response
+        .response_body
+        .match(/react-app\.embeddedData">(.+?)<\/script>/)
+        &.captures
+        &.first
+      parsed = JSON.parse(embedded_json)
+
       [parsed['payload']['blob']['richText'], parsed['title']]
     end
   end

+ 32 - 9
lib/docs/scrapers/koa.rb

@@ -2,10 +2,6 @@
 
 module Docs
   class Koa < Github
-    self.base_url = 'https://github.com/koajs/koa/tree/master/docs'
-    self.release = '2.15.0'
-
-    self.root_path = 'api/index.md'
     self.initial_paths = %w[
       error-handling
       faq
@@ -26,20 +22,47 @@ module Docs
 
     html_filters.push 'koa/clean_html', 'koa/entries'
 
-    options[:skip] = %w[middleware.gif]
+    options[:skip_patterns] = [/\.gif/]
     options[:trailing_slash] = false
     options[:container] = '.markdown-body'
 
-    options[:fix_urls] = ->(url) do
-      url.sub! 'https://koajs.com/#error-handling', Koa.base_url + '/error-handling.md'
-      url
-    end
+
 
     options[:attribution] = <<-HTML
       &copy; 2020 Koa contributors<br>
       Licensed under the MIT License.
     HTML
 
+    version do
+      self.base_url = 'https://github.com/koajs/koa/blob/v3.0.0/docs'
+      self.root_path = 'api/index.md'
+      self.release = '3.0.0'
+      options[:fix_urls] = ->(url) do
+        url.sub! 'https://koajs.com/#error-handling', self.base_url + '/error-handling.md'
+        url
+      end
+    end
+    
+    version '2' do
+      self.base_url = 'https://github.com/koajs/koa/blob/v2.16.1/docs'
+      self.root_path = 'api/index.md'
+      self.release = '2.16.1'
+      options[:fix_urls] = ->(url) do
+        url.sub! 'https://koajs.com/#error-handling', self.base_url + '/error-handling.md'
+        url
+      end
+    end
+    
+    version '1' do
+      self.base_url = 'https://github.com/koajs/koa/blob/1.7.1/docs'
+      self.root_path = 'api/index.md'
+      self.release = '1.7.1'
+      options[:fix_urls] = ->(url) do
+        url.sub! 'https://koajs.com/#error-handling', self.base_url + '/error-handling.md'
+        url
+      end
+    end
+    
     def get_latest_version(opts)
       get_npm_version('koa', opts)
     end

+ 7 - 5
lib/docs/scrapers/nginx_lua_module.rb

@@ -2,8 +2,9 @@ module Docs
   class NginxLuaModule < Github
     self.name = 'nginx / Lua Module'
     self.slug = 'nginx_lua_module'
-    self.release = '0.10.13'
-    self.base_url = "https://github.com/openresty/lua-nginx-module/tree/v#{self.release}/"
+    self.release = '0.10.28'
+    self.base_url = "https://github.com/openresty/lua-nginx-module/blob/v#{self.release}/"
+    self.root_path = 'README.markdown'
     self.links = {
       code: 'https://github.com/openresty/lua-nginx-module'
     }
@@ -11,13 +12,14 @@ module Docs
     html_filters.push 'nginx_lua_module/clean_html', 'nginx_lua_module/entries', 'title'
 
     options[:root_title] = 'ngx_http_lua_module'
-    options[:container] = '#readme > article'
-
+    options[:container] = '.markdown-body'
+    options[:max_image_size] = 256_000
     options[:attribution] = <<-HTML
       &copy; 2009&ndash;2017 Xiaozhe Wang (chaoslawful)<br>
-      &copy; 2009&ndash;2018 Yichun "agentzh" Zhang (章亦春), OpenResty Inc.<br>
+      &copy; 2009&ndash;2019 Yichun "agentzh" Zhang (章亦春), OpenResty Inc.<br>
       Licensed under the BSD License.
     HTML
+    options[:skip_patterns] = [/\.png/]
 
     def get_latest_version(opts)
       tags = get_github_tags('openresty', 'lua-nginx-module', opts)

+ 2 - 2
lib/docs/scrapers/q.rb

@@ -1,5 +1,5 @@
 module Docs
-  class Q < Github
+  class Q < UrlScraper
     self.name = 'Q'
     self.release = '1.5.1'
     self.base_url = 'https://github.com/kriskowal/q/wiki/'
@@ -16,7 +16,7 @@ module Docs
     options[:skip_links] = true
 
     options[:attribution] = <<-HTML
-      &copy; 2009&ndash;2017 Kristopher Michael Kowal<br>
+      &copy; 2009&ndash;2018 Kristopher Michael Kowal<br>
       Licensed under the MIT License.
     HTML