4 سال پیش · e17bc84ea4
--- a/lib/docs/filters/cpp/entries.rb
+++ b/lib/docs/filters/cpp/entries.rb
@@ -1,6 +1,8 @@
 
				 module Docs
			
 
				   class Cpp
			
 
				     class EntriesFilter < Docs::EntriesFilter
			
 
				+      @@duplicate_names = []
			
 
				+
			
 
				       REPLACE_NAMES = {
			
 
				         'Error directive' => '#error directive',
			
 
				         'Filename and line information' => '#line directive',
			
@@ -11,7 +13,8 @@ module Docs
 
				       def get_name
			
 
				         name = at_css('#firstHeading').content.strip
			
 
				         name = format_name(name)
			
 
				-        name.split(',').first
			
 
				+        name = name.split(',').first
			
 
				+        name
			
 
				       end
			
 
				 
			
 
				       def get_type
			
@@ -61,6 +64,21 @@ module Docs
 
				 
			
 
				         REPLACE_NAMES[name] || name
			
 
				       end
			
 
				+
			
 
				+      # Builds the index entries for the current page, skipping duplicate pages.
				+      #
				+      # Several functions with similar behavior but different names are
				+      # documented on one shared page, so the same page name can be seen more
				+      # than once. Only the first page seen for a given name yields entries.
				+      #
				+      # Returns an Array in every case: the built entries for the first
				+      # occurrence of a name, or an empty Array for a duplicate (this used to
				+      # fall off the end of the `if` and return nil).
				+      def entries
				+        # @@duplicate_names is a class variable, so the record of seen names is
				+        # shared by every instance of this filter.
				+        return [] if @@duplicate_names.include?(name)
				+
				+        @@duplicate_names.push(name)
				+
				+        # A distinct local name avoids shadowing this method.
				+        page_entries = []
				+        page_entries << default_entry if root_page? || include_default_entry?
				+        page_entries.concat(additional_entries)
				+        build_entries(page_entries)
				+      end
			
 
				+
			
 
				     end
			
 
				   end
			
 
				 end
			
--- a/lib/docs/filters/cpp/fix_urls.rb
+++ b/lib/docs/filters/cpp/fix_urls.rb
@@ -1,11 +0,0 @@
 
				-module Docs
			
 
				-  class Cpp
			
 
				-    class FixUrlsFilter < Filter
			
 
				-      def call
			
 
				-        html.gsub! File.join(Cpp.base_url, Cpp.root_path), Cpp.base_url[0..-2]
			
 
				-        html.gsub! %r{#{Cpp.base_url}([^"']+?)\.html}, "#{Cpp.base_url}\\1"
			
 
				-        html
			
 
				-      end
			
 
				-    end
			
 
				-  end
			
 
				-end
			
--- a/lib/docs/filters/cpp20/clean_html.rb
+++ b/lib/docs/filters/cpp20/clean_html.rb
@@ -1,9 +0,0 @@
 
				-module Docs
			
 
				-  class Cpp20
			
 
				-    class CleanHtmlFilter < Filter
			
 
				-      def call
			
 
				-        doc
			
 
				-      end
			
 
				-    end
			
 
				-  end
			
 
				-end
			
--- a/lib/docs/filters/cpp20/entries.rb
+++ b/lib/docs/filters/cpp20/entries.rb
@@ -1,82 +0,0 @@
 
				-module Docs
			
 
				-  class Cpp20
			
 
				-    class EntriesFilter < Docs::EntriesFilter
			
 
				-      @@duplicate_names = []
			
 
				-
			
 
				-      REPLACE_NAMES = {
			
 
				-        'Error directive' => '#error directive',
			
 
				-        'Filename and line information' => '#line directive',
			
 
				-        'Implementation defined behavior control' => '#pragma directive',
			
 
				-        'Replacing text macros' => '#define directive',
			
 
				-        'Source file inclusion' => '#include directive' }
			
 
				-
			
 
				-      def get_name
			
 
				-        name = at_css('#firstHeading').content.strip
			
 
				-        name = format_name(name)
			
 
				-        name = name.split(',').first
			
 
				-        name
			
 
				-      end
			
 
				-
			
 
				-      def get_type
			
 
				-        if at_css('#firstHeading').content.include?('C++ keyword')
			
 
				-          'Keywords'
			
 
				-        elsif subpath.start_with?('experimental')
			
 
				-          'Experimental libraries'
			
 
				-        elsif subpath.start_with?('language/')
			
 
				-          'Language'
			
 
				-        elsif subpath.start_with?('freestanding')
			
 
				-          'Utilities'
			
 
				-        elsif type = at_css('.t-navbar > div:nth-child(4) > :first-child').try(:content)
			
 
				-          type.strip!
			
 
				-          type.remove! ' library'
			
 
				-          type.remove! ' utilities'
			
 
				-          type.remove! 'C++ '
			
 
				-          type.capitalize!
			
 
				-          type
			
 
				-        end
			
 
				-      end
			
 
				-
			
 
				-      def additional_entries
			
 
				-        return [] if root_page? || self.name.start_with?('operators')
			
 
				-        names = at_css('#firstHeading').content.remove(%r{\(.+?\)}).split(', ')[1..-1]
			
 
				-        names.each(&:strip!).reject! do |name|
			
 
				-          name.size <= 2 || name == '...' || name =~ /\A[<>]/ || name.start_with?('operator')
			
 
				-        end
			
 
				-        names.map { |name| [format_name(name)] }
			
 
				-      end
			
 
				-
			
 
				-      def format_name(name)
			
 
				-        name.remove! 'C++ concepts: '
			
 
				-        name.remove! 'C++ keywords: '
			
 
				-        name.remove! 'C++ ' unless name == 'C++ language'
			
 
				-        name.remove! %r{\s\(.+\)}
			
 
				-
			
 
				-        name.sub! %r{\AStandard library header <(.+)>\z}, '\1'
			
 
				-        name.sub! %r{(<[^>]+>)}, ''
			
 
				-
			
 
				-        if name.include?('operator') && name.include?(',')
			
 
				-          name.sub!(%r{operator.+([\( ])}, 'operators (') || name.sub!(%r{operator.+}, 'operators')
			
 
				-          name.sub! '  ', ' '
			
 
				-          name << ')' unless name.last == ')' || name.exclude?('(')
			
 
				-          name.sub! '()', ''
			
 
				-          name.sub! %r{\(.+\)}, '' if !name.start_with?('operator') && name.length > 50
			
 
				-        end
			
 
				-
			
 
				-        REPLACE_NAMES[name] || name
			
 
				-      end
			
 
				-
			
 
				-      def entries
			
 
				-        entries = []
			
 
				-
			
 
				-        # avoid duplicate pages
			
 
				-        if !(@@duplicate_names.include?(name))
			
 
				-          @@duplicate_names.push(name)
			
 
				-          entries << default_entry if root_page? || include_default_entry?
			
 
				-          entries.concat(additional_entries)
			
 
				-          build_entries(entries)
			
 
				-        end
			
 
				-      end
			
 
				-
			
 
				-    end
			
 
				-  end
			
 
				-end
			
--- a/lib/docs/filters/cppref/clean_html.rb
+++ b/lib/docs/filters/cppref/clean_html.rb
@@ -0,0 +1,116 @@
 
				+module Docs
				+  class Cppref
				+    # HTML filter that removes page chrome and normalizes the markup of a
				+    # scraped page: it drops navigation/footer elements, unwraps wrapper
				+    # elements, merges adjacent lists, converts inline and block code markup
				+    # to <code>/<pre>, and rewrites a few attributes.
				+    class CleanHtmlFilter < Filter
				+      # Mutates the parsed document in place and returns it.
				+      def call
				+        css('h1').remove if root_page?
				+
				+        # Rowspans larger than 3 are replaced by the number of rows found in
				+        # the enclosing tbody.
				+        css('.t-dcl-rev-aux td[rowspan]').each do |node|
				+          rowspan = node['rowspan'].to_i
				+          node['rowspan'] = node.ancestors('tbody').css('tr').length if rowspan > 3
				+        end
				+
				+        # Elements removed outright, together with their content.
				+        css('#siteSub', '#contentSub', '.printfooter', '.t-navbar', '.editsection', '#toc',
				+            '.t-dsc-sep', '.t-dcl-sep', '#catlinks', '.ambox-notice', '.mw-cite-backlink',
				+            '.t-sdsc-sep:first-child:last-child', '.t-example-live-link',
				+            '.t-dcl-rev-num > .t-dcl-rev-aux ~ tr:not(.t-dcl-rev-aux) > td:nth-child(2)').remove
				+
				+        # Wrapper elements are unwrapped: each is replaced by its children.
				+        css('#bodyContent', '.mw-content-ltr', 'span[style]', 'div[class^="t-ref"]', '.t-image',
				+            'th > div', 'td > div', '.t-dsc-see', '.mainpagediv', 'code > b', 'tbody').each do |node|
				+          node.before(node.children).remove
				+        end
				+
				+        # A div that directly contains a list is unwrapped as well.
				+        css('div > ul').each do |node|
				+          node.parent.before(node.parent.children).remove
				+        end
				+
				+        # A dl whose only dd holds only a ul: move that ul into the preceding
				+        # ul when there is one, otherwise put it in place of the dl.
				+        css('dl > dd:first-child:last-child > ul:first-child:last-child').each do |node|
				+          dl = node.parent.parent
				+          if dl.previous_element && dl.previous_element.name == 'ul'
				+            dl.previous_element << node
				+            dl.remove
				+          else
				+            dl.before(node).remove
				+          end
				+        end
				+
				+        # Any remaining dl with a single dd is replaced by that dd's children.
				+        css('dl > dd:first-child:last-child').each do |node|
				+          node.parent.before(node.children).remove
				+        end
				+
				+        # Merge runs of adjacent ul elements into the first one.
				+        css('ul').each do |node|
				+          while node.next_element && node.next_element.name == 'ul'
				+            node << node.next_element.children
				+            node.next_element.remove
				+          end
				+        end
				+
				+        # Move the anchor id from the inner span onto the heading itself.
				+        css('h2 > span[id]', 'h3 > span[id]', 'h4 > span[id]', 'h5 > span[id]', 'h6 > span[id]').each do |node|
				+          node.parent['id'] = node['id']
				+          node.before(node.children).remove
				+        end
				+
				+        css('table[style]', 'th[style]', 'td[style]').remove_attr('style')
				+        css('table[cellpadding]').remove_attr('cellpadding')
				+
				+        # Header cells become th; empty ones get a single space as content.
				+        css('.t-dsc-hitem > td', '.t-dsc-header > td').each do |node|
				+          node.name = 'th'
				+          node.content = ' ' if node.content.empty?
				+        end
				+
				+        # Inline code markup becomes <code>; it is flattened to plain text
				+        # unless it contains a link.
				+        css('tt', 'span > span.source-cpp', 'span.t-c', 'span.t-lc', 'span.t-dsc-see-tt').each do |node|
				+          node.name = 'code'
				+          node.remove_attribute('class')
				+          node.content = node.content unless node.at_css('a')
				+        end
				+
				+        # Block code spans become <pre> with <br> turned into newlines and
				+        # all remaining markup flattened to text.
				+        css('div > span.source-cpp').each do |node|
				+          node.name = 'pre'
				+          node.inner_html = node.inner_html.gsub('<br>', "\n")
				+          node.content = node.content
				+        end
				+
				+        css('div > a > img[alt="About this image"]').each do |node|
				+          node.parent.parent.remove
				+        end
				+
				+        css('area[href]').each do |node|
				+          node['href'] = node['href'].remove('.html')
				+        end
				+
				+        # Pull trailing text, <a> and <code> siblings into the paragraph, trim
				+        # it, append a period when it ends with a letter, digit or ')', and
				+        # drop it when it is blank and holds no image.
				+        css('p').each do |node|
				+          while node.next && (node.next.text? || node.next.name == 'a' || node.next.name == 'code')
				+            node << node.next
				+          end
				+          node.inner_html = node.inner_html.strip
				+          node << '.' if node.content =~ /[a-zA-Z0-9\)]\z/
				+          node.remove if node.content.blank? && !node.at_css('img')
				+        end
				+
				+        # Tag every <pre> with its language ('cpp' when its own or its
				+        # parent's class mentions cpp, otherwise 'c'), drop the class and
				+        # expand tabs to 8 spaces.
				+        css('pre').each do |node|
				+          node['data-language'] = if node['class'].try(:include?, 'cpp') || node.parent['class'].try(:include?, 'cpp')
				+            'cpp'
				+          else
				+            'c'
				+          end
				+          node.remove_attribute('class')
				+          node.content = node.content.gsub("\t", ' ' * 8)
				+        end
				+
				+        css('code code', '.mw-geshi').each do |node|
				+          node.before(node.children).remove
				+        end
				+
				+        css('h1 ~ .fmbox').each do |node|
				+          node.name = 'div'
				+          node.content = node.content
				+        end
				+
				+        # Rewrite SVG image URLs from the /common/ path to the upload host.
				+        # Non-bang `sub` is required here: `sub!` returns nil when nothing
				+        # matches, which would overwrite the src of every image that does not
				+        # match the pattern. Images without a src attribute are skipped.
				+        css('img').each do |node|
				+          next unless node['src']
				+          node['src'] = node['src'].sub(%r{http://en.cppreference.com/common/([^"']+?)\.svg}, 'http://upload.cppreference.com/mwiki/\1.svg')
				+        end
				+
				+        doc
				+      end
				+    end
				+  end
				+end
			
--- a/lib/docs/filters/cppref/fix_code.rb
+++ b/lib/docs/filters/cppref/fix_code.rb
@@ -0,0 +1,21 @@
 
				+module Docs
				+  class Cppref
				+    # HTML filter that turns source-code spans into plain-text <pre> blocks
				+    # and replaces non-breaking spaces inside every <pre> with regular spaces.
				+    class FixCodeFilter < Filter
				+      # Mutates the parsed document in place and returns it.
				+      def call
				+        css('div > span.source-c', 'div > span.source-cpp').each do |span|
				+          # <br> becomes a newline; a newline right before </p> is dropped.
				+          span.inner_html = span.inner_html.gsub(/<br>\n?/, "\n").gsub("\n</p>\n", "</p>\n")
				+
				+          # The enclosing div becomes the <pre>, inherits the span's class and
				+          # keeps only the span's text.
				+          wrapper = span.parent
				+          wrapper.name = 'pre'
				+          wrapper['class'] = span['class']
				+          wrapper.content = span.content
				+        end
				+
				+        non_breaking_space = Nokogiri::HTML('&nbsp;').text
				+        css('pre').each { |pre| pre.content = pre.content.gsub(non_breaking_space, ' ') }
				+
				+        doc
				+      end
				+    end
				+  end
				+end
			
--- a/lib/docs/scrapers/cpp.rb
+++ b/lib/docs/scrapers/cpp.rb
@@ -1,52 +0,0 @@
 
				-module Docs
			
 
				-  class Cpp < FileScraper
			
 
				-    self.name = 'C++'
			
 
				-    self.slug = 'cpp'
			
 
				-    self.type = 'c'
			
 
				-    self.base_url = 'http://en.cppreference.com/w/cpp/'
			
 
				-    self.root_path = 'header.html'
			
 
				-
			
 
				-    html_filters.insert_before 'clean_html', 'c/fix_code'
			
 
				-    html_filters.push 'cpp/entries', 'c/clean_html', 'title'
			
 
				-    text_filters.push 'cpp/fix_urls'
			
 
				-
			
 
				-    options[:decode_and_clean_paths] = true
			
 
				-    options[:container] = '#content'
			
 
				-    options[:title] = false
			
 
				-    options[:root_title] = 'C++ Programming Language'
			
 
				-    options[:skip] = %w(
			
 
				-      language/extending_std.html
			
 
				-      language/history.html
			
 
				-      regex/ecmascript.html
			
 
				-      regex/regex_token_iterator/operator_cmp.html
			
 
				-    )
			
 
				-    options[:skip_patterns] = [/experimental/]
			
 
				-    options[:only_patterns] = [/\.html\z/]
			
 
				-
			
 
				-    options[:fix_urls] = ->(url) do
			
 
				-      url = CGI.unescape(url)
			
 
				-      url.sub! %r{\A.+/http%3A/}, 'http://'
			
 
				-      url.sub! 'http://en.cppreference.com/upload.cppreference.com', 'http://upload.cppreference.com'
			
 
				-      url
			
 
				-    end
			
 
				-
			
 
				-    options[:attribution] = <<-HTML
			
 
				-      &copy; cppreference.com<br>
			
 
				-      Licensed under the Creative Commons Attribution-ShareAlike Unported License v3.0.
			
 
				-    HTML
			
 
				-
			
 
				-    # Same as get_latest_version in lib/docs/scrapers/c.rb
			
 
				-    def get_latest_version(opts)
			
 
				-      doc = fetch_doc('https://en.cppreference.com/w/Cppreference:Archives', opts)
			
 
				-      link = doc.at_css('a[title^="File:"]')
			
 
				-      date = link.content.scan(/(\d+)\./)[0][0]
			
 
				-      DateTime.strptime(date, '%Y%m%d').to_time.to_i
			
 
				-    end
			
 
				-
			
 
				-    private
			
 
				-
			
 
				-    def file_path_for(*)
			
 
				-      URI.unescape(super)
			
 
				-    end
			
 
				-  end
			
 
				-end
			
--- a/lib/docs/scrapers/cppref/cpp.rb
+++ b/lib/docs/scrapers/cppref/cpp.rb
@@ -1,17 +1,12 @@
 
				 module Docs
			
 
				-  class Cpp20 < UrlScraper
			
 
				-    self.name = 'C++20'
			
 
				-    self.slug = 'cpp20'
			
 
				+  class Cpp < Cppref
			
 
				+    self.name = 'C++'
			
 
				+    self.slug = 'cpp'
			
 
				     self.type = 'c'
			
 
				     self.base_url = 'https://en.cppreference.com/w/cpp/'
			
 
				-    self.root_path = 'header'
			
 
				 
			
 
				-    html_filters.insert_before 'clean_html', 'c/fix_code'
			
 
				-    html_filters.push 'cpp20/entries', 'c/clean_html', 'title'
			
 
				+    html_filters.insert_before 'cppref/clean_html', 'cpp/entries'
			
 
				 
			
 
				-    options[:decode_and_clean_paths] = true
			
 
				-    options[:container] = '#content'
			
 
				-    options[:title] = false
			
 
				     options[:root_title] = 'C++ Programming Language'
			
 
				 
			
 
				     options[:skip] = %w(
			
@@ -21,13 +16,6 @@ module Docs
 
				       regex/regex_token_iterator/operator_cmp.html
			
 
				     )
			
 
				 
			
 
				-    options[:skip_patterns] = [/experimental/]
			
 
				-
			
 
				-    options[:attribution] = <<-HTML
			
 
				-      &copy; cppreference.com<br>
			
 
				-      Licensed under the Creative Commons Attribution-ShareAlike Unported License v3.0.
			
 
				-    HTML
			
 
				-
			
 
				     # Same as get_latest_version in lib/docs/scrapers/c.rb
			
 
				     def get_latest_version(opts)
			
 
				       doc = fetch_doc('https://en.cppreference.com/w/Cppreference:Archives', opts)
			
--- a/lib/docs/scrapers/cppref/cppref.rb
+++ b/lib/docs/scrapers/cppref/cppref.rb
@@ -0,0 +1,29 @@
 
				+module Docs
				+  # Scraper marked abstract that carries the filter pipeline and options
				+  # for cppreference.com documentation; Docs::Cpp subclasses it.
				+  class Cppref < UrlScraper
				+    self.abstract = true
				+    self.type = 'cppref'
				+    self.root_path = 'header'
				+
				+    # fix_code is inserted ahead of the existing clean_html filter; the
				+    # cppref-specific clean_html and title filters are appended at the end.
				+    html_filters.insert_before('clean_html', 'cppref/fix_code')
				+    html_filters.push('cppref/clean_html', 'title')
				+
				+    options[:decode_and_clean_paths] = true
				+    options[:container] = '#content'
				+    options[:title] = false
				+    options[:skip] = ['language/history.html']
				+    options[:skip_patterns] = [/experimental/]
				+
				+    options[:attribution] = <<-HTML
				+      &copy; cppreference.com<br>
				+      Licensed under the Creative Commons Attribution-ShareAlike Unported License v3.0.
				+    HTML
				+  end
				+end