瀏覽代碼

Re-implement Angular.js scraper

Fixes #23. Thanks @TheRusskiy and @afram.
Thibaut 12 年之前
父節點
當前提交
faf6ecf549

+ 7 - 2
Gemfile

@@ -31,11 +31,16 @@ group :docs do
   gem 'typhoeus'
   gem 'nokogiri', '~> 1.6.0'
   gem 'html-pipeline'
-  gem 'progress_bar'
-  gem 'unix_utils'
+  gem 'progress_bar', require: false
+  gem 'unix_utils', require: false
 end
 
 group :test do
   gem 'minitest'
   gem 'rr', require: false
 end
+
+# Optional browser-automation gems, declared only when SELENIUM=1 is set in
+# the environment. lib/docs/scrapers/angular.rb requires capybara lazily and
+# drives it with the :selenium driver to fetch the Angular.js root page.
+if ENV['SELENIUM'] == '1'
+  gem 'capybara'
+  gem 'selenium-webdriver'
+end

+ 1 - 1
assets/javascripts/templates/pages/about_tmpl.coffee

@@ -75,7 +75,7 @@ app.templates.aboutPage = -> """
 
 credits = [
   [ 'Angular.js',
-    '2010-2013 Google, Inc.',
+    '2010-2014 Google, Inc.',
     'CC BY',
     'http://creativecommons.org/licenses/by/3.0/'
   ], [

+ 4 - 1
assets/javascripts/templates/pages/news_tmpl.coffee

@@ -24,7 +24,10 @@ newsItem = (date, news) ->
   result
 
 app.news = [
-  [ 1390089600001, # January 19, 2013
+  [ 1390694400000, # January 26, 2013
+    """ Updated <a href="/angular/">Angular.js</a> documentation """,
+  ], [
+    390089600001, # January 19, 2013
     """ New <a href="/d3/">D3.js</a> and <a href="/knockout/">Knockout.js</a> documentations """,
   ], [
     1390003200000, # January 18, 2013

+ 2 - 1
assets/javascripts/views/pages/angular.coffee

@@ -2,5 +2,6 @@
 
 class app.views.AngularPage extends app.views.BasePage
   afterRender: ->
-    @highlightCode @findAllByClass('prettyprint'), 'javascript'
+    for el in @findAllByTag('pre')
+      @highlightCode el, if el.textContent[0] is '<' then 'markup' else 'javascript'
     return

+ 40 - 3
assets/stylesheets/pages/_angular.scss

@@ -1,8 +1,39 @@
 ._angular {
-  > h2 { font-size: 1.125rem; }
+  h2 { @extend %block-heading; }
+
+  //
+  // Index
+  //
+
+  .nav-header.section {
+    margin: 1.5em 0 1em -2em;
+    list-style: none;
+    font-weight: bold;
+    text-transform: capitalize;
+  }
+
+  //
+  // Other
+  //
+
   h3, h4 { font-size: 1rem; }
 
-  .methods {
+  .alert { @extend %note; }
+  .alert-success { @extend %note-green; }
+  .alert-error { @extend %note-red; }
+
+  p > code, li > code, td > code { @extend %label; }
+
+  .view-source, .improve-docs {
+    position: relative;
+    float: right;
+    line-height: 1.7rem;
+    padding-left: 1em;
+    font-size: .875rem;
+    background: white;
+  }
+
+  .defs {
     padding-left: 1rem;
     list-style: none;
 
@@ -12,6 +43,12 @@
     }
 
     > li + li { margin-top: 2em; }
-    > li > ul { list-style-type: disc; }
+
+    h4 {
+      margin: 1em 0 .5em;
+      font-size: 1em;
+    }
+
+    ul { list-style-type: disc; }
   }
 }

+ 86 - 0
lib/docs/filters/angular/clean_html.rb

@@ -0,0 +1,86 @@
+module Docs
+  class Angular
+    # HTML filter for the Angular.js scraper. Rewrites colon-containing hrefs
+    # on every page, then applies either the root-page or the regular-page
+    # clean-up depending on `root_page?`.
+    class CleanHtmlFilter < Filter
+      # Filter entry point. Returns `doc`.
+      def call
+        # Fix internal links (remove colons)
+        css('a[href]').each do |node|
+          node['href'] = node['href'].gsub %r{(directive|filter):}, '\1-'
+        end
+
+        root_page? ? root : other
+        doc
+      end
+
+      # Clean-up applied to the root (index) page.
+      def root
+        css('.pull-right', '.ng-hide').remove
+
+        # Turn "module [...]" <li> into <h2>
+        css('.nav-header.module').each do |node|
+          node.name = 'h2'
+          # Re-insert the renamed node as the previous sibling of its parent.
+          node.parent.before(node)
+        end
+
+        # Remove links to "Directive", "Filter", etc.
+        css('a.guide').each do |node|
+          node.replace(node.content)
+        end
+      end
+
+      # Clean-up applied to every page other than the root page.
+      # NOTE(review): assumes the page always contains an <h1>; `at_css('h1')`
+      # is dereferenced without a nil check below -- confirm.
+      def other
+        css('#example', '.example', '#description_source', '#description_demo', '[id$="example"]').remove
+
+        if at_css('h1').content.strip.empty?
+          # Ensure proper <h1> (e.g. ngResource, AUTO, etc.)
+          # Uses the first <h2>'s text when there is one, otherwise the slug;
+          # `try` tolerates a missing <h2>.
+          at_css('h2').tap do |node|
+            at_css('h1').content = node.try(:content) || slug
+            node.try(:remove)
+          end
+        else
+          # Clean up .hint in <h1>
+          css('h1 > div > .hint').each do |node|
+            # Nokogiri's `before` returns its receiver, so the chained
+            # `remove` drops the hint's parent <div> after the <small> is added.
+            node.parent.before("<small>(#{node.content.strip})</small>").remove
+          end
+        end
+
+        # Move the "view source" / "improve docs" links into the <h1>.
+        at_css('h1').add_child(css('.view-source', '.improve-docs'))
+
+        # Remove root-level <div>
+        # Repeats until no <div> directly follows the <h1>: each pass hoists
+        # the div's children in front of it and then deletes the empty div.
+        while div = at_css('h1 + div')
+          div.before(div.children)
+          div.remove
+        end
+
+        # Remove dead links (e.g. ngRepeat)
+        css('a.type-hint').each do |node|
+          node.name = 'code'
+          node.remove_attribute 'href'
+        end
+
+        # Remove some <code> elements
+        css('h1 > code', 'pre > code', 'h6 > code').each do |node|
+          # Keep the text, drop the <code> wrapper (`before` returns `node`).
+          node.before(node.content).remove
+        end
+
+        # Fix code indentation
+        css('code', 'pre').each do |node|
+          node.inner_html = node.inner_html.strip_heredoc.strip
+        end
+
+        # Make <pre> elements
+        css('.in-javascript', '.in-html-template-binding').each do |node|
+          node.name = 'pre'
+          # Re-assigning the text content; presumably this flattens any nested
+          # markup into plain text -- TODO confirm.
+          node.content = node.content
+        end
+
+        # Tag the method/property/event lists with a shared "defs" class
+        # (the class EntriesFilter#additional_entries selects on).
+        css('ul.methods', 'ul.properties', 'ul.events').add_class('defs')
+
+        # Remove ng-* attributes
+        css('*').each do |node|
+          node.attributes.each_key do |attribute|
+            node.remove_attribute(attribute) if attribute.start_with? 'ng-'
+          end
+        end
+      end
+    end
+  end
+end

+ 11 - 0
lib/docs/filters/angular/clean_urls.rb

@@ -0,0 +1,11 @@
+module Docs
+  class Angular
+    # Text filter that rewrites scraped "partials" URLs into the URLs of the
+    # corresponding API pages.
+    class CleanUrlsFilter < Filter
+      # Rewrites `html` in place and returns it.
+      def call
+        # Replace the partials path with the API path.
+        html.gsub! 'angularjs.org/partials/api/', 'angularjs.org/api/'
+        # Drop the ".html" extension. The host's dot is escaped so that only
+        # a literal "angularjs.org" matches, not any character in its place.
+        html.gsub! %r{angularjs\.org/api/(.+?)\.html}, 'angularjs.org/api/\1'
+        # Return `html` explicitly: gsub! returns nil when nothing changed.
+        html
+      end
+    end
+  end
+end

+ 40 - 0
lib/docs/filters/angular/entries.rb

@@ -0,0 +1,40 @@
+module Docs
+  class Angular
+    # Derives the index entries (name, type and per-page anchors) for
+    # Angular.js documentation pages.
+    class EntriesFilter < Docs::EntriesFilter
+      # Entry name: the last ':'-separated part of the slug without a leading
+      # "ng.", followed by "(directive)" or "(filter)" for those subtypes.
+      def get_name
+        base = slug.split(':').last
+        base = base.sub(%r{\Ang\.}, '')
+        return base unless %w(directive filter).include?(subtype)
+        "#{base} (#{subtype})"
+      end
+
+      # Entry type: the slug's first '.'-separated part; for the "ng" module
+      # the pluralized subtype is appended when one is known.
+      def get_type
+        mod = slug.split('.').first
+        return mod unless mod == 'ng' && subtype
+        "#{mod} #{subtype}s"
+      end
+
+      # Text captured from "(<subtype> in module" inside the <h1>, or nil when
+      # there is no <h1> or no match. Memoized, including a nil result.
+      def subtype
+        unless defined? @subtype
+          heading = at_css('h1')
+          found = heading && heading.content.match(%r{\((.+) in module})
+          @subtype = found && found[1]
+        end
+        @subtype
+      end
+
+      # One [name, id] pair per leading <h3> of each item in a "ul.defs" list.
+      # The name is prefixed with the first word of this page's entry name and
+      # any argument list is collapsed to "()".
+      def additional_entries
+        css('ul.defs').flat_map do |list|
+          list.css('> li > h3:first-child').map do |heading|
+            label = heading.content.strip.sub(%r{\(.+\)}, '()')
+            ["#{self.name.split.first}.#{label}", heading['id']]
+          end
+        end
+      end
+    end
+  end
+end

+ 53 - 12
lib/docs/scrapers/angular.rb

@@ -1,19 +1,60 @@
 module Docs
   class Angular < UrlScraper
-    # This scraper is currently broken; the problem being that Angular's
-    # documentation isn't available as static pages. I will try to restore it
-    # once Angular 1.2.0 is released.
-    #
-    # In the past it used static-ng-doc by Sal Lara (github.com/natchiketa/static-ng-doc)
-    # to scrape the doc's HTML partials (e.g. docs.angularjs.org/partials/api/ng.html).
-    #
-    # If you want to help this is what I need: a static page with links to each
-    # HTML partial. Or better yet, a static version of Angular's documentation.
-
     self.name = 'Angular.js'
     self.slug = 'angular'
     self.type = 'angular'
-    self.version = '1.0.7'
-    self.base_url = ''
+    self.version = '1.2.10'
+    self.base_url = 'http://docs.angularjs.org/partials/api/'
+
+    html_filters.insert_before 'normalize_paths', 'angular/clean_html'
+    html_filters.push 'angular/entries', 'title'
+    text_filters.push 'angular/clean_urls'
+
+    options[:title] = false
+    options[:root_title] = 'Angular.js'
+
+    # Normalizes a URL found while scraping so that it points at the matching
+    # HTML partial. Mutates `url` in place and returns it.
+    options[:fix_urls] = ->(url) do
+      # Collapse a doubled "api/" path segment.
+      url.sub! '/partials/api/api/', '/partials/api/'
+      # Guide links are rewritten to the top-level /guide/ path.
+      url.sub! '/partials/api/guide/', '/guide/'
+      # Append ".html" (keeping any "#fragment") when the path lacks it.
+      url.sub! %r{/partials/api/(.+?)(?<!\.html)(?:\z|(#.*))}, '/partials/api/\1.html\2'
+      # NOTE(review): a `gsub!` that followed here passed the String
+      # '/partials/api/(.+?)\:' as its pattern. String patterns are matched
+      # literally, so it never rewrote anything and has been dropped.
+      # Converting it to a Regexp would start percent-encoding ':' as "%3A",
+      # while the angular clean_html and entries filters match a literal ':'
+      # -- confirm that is wanted before reintroducing it.
+      url
+    end
+
+    options[:skip] = %w(ng.html)
+
+    options[:attribution] = <<-HTML
+      &copy; 2010&ndash;2014 Google, Inc.<br>
+      Licensed under the Creative Commons Attribution License 3.0.
+    HTML
+
+    private
+
+    # Requests a single URL, first stubbing the root page when the requested
+    # URL is the root URL.
+    def request_one(url)
+      if url == root_url.to_s
+        stub_root_page
+      end
+      super
+    end
+
+    # Requests a batch of URLs; the root page is always stubbed beforehand.
+    def request_all(urls, &block)
+      stub_root_page
+      super(urls, &block)
+    end
+
+    # Registers a Typhoeus stub for the root URL that answers with a canned
+    # 200 text/html response whose body comes from #get_root_page_body.
+    def stub_root_page
+      attributes = {
+        effective_url: root_url.to_s,
+        code: 200,
+        headers: { 'Content-Type' => 'text/html' },
+        body: get_root_page_body
+      }
+      canned = Typhoeus::Response.new(attributes)
+
+      Typhoeus.stub(root_url.to_s).and_return(canned)
+    end
+
+    # Loads the Angular.js API index through Capybara's :selenium driver and
+    # returns the innerHTML of its ".side-navigation" element.
+    def get_root_page_body
+      # Lazy require: the Gemfile only declares capybara when SELENIUM=1.
+      require 'capybara'
+      Capybara.current_driver = :selenium
+      Capybara.visit('http://docs.angularjs.org/api/')
+      navigation = Capybara.find('.side-navigation')
+      navigation['innerHTML']
+    end
   end
 end