瀏覽代碼

Update Haskell scraper

Thibaut 11 年之前
父節點
當前提交
1819d71ff7

二進制
assets/images/icons.png


二進制
assets/images/icons@2x.png


+ 5 - 0
assets/javascripts/templates/pages/about_tmpl.coffee

@@ -130,6 +130,11 @@ credits = [
     '2014 Grunt Team',
     'MIT',
     'https://raw.githubusercontent.com/gruntjs/gruntjs.com/master/LICENSE'
+  ], [
+    'Haskell',
+    'The University of Glasgow',
+    'BSD',
+    'http://www.haskell.org/ghc/license'
   ], [
     'HTTP',
     '1999 The Internet Society',

+ 4 - 1
assets/javascripts/templates/pages/news_tmpl.coffee

@@ -24,7 +24,10 @@ newsItem = (date, news) ->
   result
 
 app.news = [
-  [ 1400976000000, # May 25, 2014
+  [ 1402704000000, # June 14, 2014
+    """ New <a href="/haskell/">Haskell</a> documentation """,
+  ], [
+    1400976000000, # May 25, 2014
     """ New <a href="/laravel/">Laravel</a> documentation """,
   ], [
     1399161600000, # May 4, 2014

+ 1 - 0
assets/stylesheets/application.css.scss

@@ -35,6 +35,7 @@
         'pages/ember',
         'pages/express',
         'pages/go',
+        'pages/haskell',
         'pages/jquery',
         'pages/knockout',
         'pages/git',

+ 1 - 0
assets/stylesheets/global/_icons.scss

@@ -57,3 +57,4 @@
 ._icon-grunt:before         { background-position: -3rem -8rem; }
 ._icon-maxcdn:before        { background-position: -4rem -8rem; }
 ._icon-laravel:before       { background-position: 0 -9rem; }
+._icon-haskell:before       { background-position: -1rem -9rem; }

+ 18 - 114
assets/stylesheets/pages/_haskell.scss

@@ -1,121 +1,25 @@
-._icon-haskell:before {
-  background-image: image-url('/icons/docs/haskell/16.png');
-  background-size: cover;
-  background-repeat: no-repeat;
-}
-
-.empty-table .empty {
-  display: none;
-}
-
-.arguments td.src {
-    background: #faf9e2;
-    width: 30%;
-}
-
-th.src,
-td.src {
-  font-family: $monoFont;
-  font-weight: normal;
-  font-style: normal;
-  background: #f8f8f8;
-}
-
-caption {
-  font-weight: bold;
-  text-align: left;
-  font-style: italic;
-  font-size: 1.1em;
-}
-
-// remove margin in descript listing
-dd > pre {
-  @extend %pre;
-  margin: 0;
-  background: #faf9e2;
-  border-color: #dddaaa #dddaaa #d7d7a9;
-}
-
-// warnings are red
-.warning {
-    @extend %note;
-    @extend %note-red;
-}
-
-
-// complexity classes are blue boxes
-.with-complexity {
-  display: flex;
-  display: -webkit-flex;
-
-  justify-content: space-between;
-  -webkit-justify-content: space-between;
+._haskell {
+  > h2 { @extend %block-heading; }
+  > h3 { @extend %block-label; }
+  h4 { font-size: 1em; }
 
-  align-items: flex-start;
-  -webkit-align-items: flex-start;
+  .module + .package, p.src > .link { float: right; }
 
-  align-content: stretch;
-  -webkit-align-content: stretch;
-}
-
-.complexity {
-    @extend %note;
-    @extend %note-blue;
-    margin: 0;
-    margin-left: 1em;
-    margin-bottom: 0.75em;
-    font-style: italic;
-    white-space: nowrap;
-
-    flex-shrink: 0;
-    -webkit-flex-shrink: 0;
-
-    order: 2;
-    -webkit-order: 2;
-}
-
-.complexity + span {
-  order: 1;
-    -webkit-order: 1;
-}
-
-// add box type to "since: ..."
-.added {
-    @extend %note;
-    @extend %note-gold;
-}
-
-.added-cell {
-    @extend %note-gold;
-}
-
-.fields h3 {
-  display: none;
-}
+  .src {
+    white-space: normal;
+    @extend %code;
+  }
+  p.src { @extend %block-label, %label-blue; }
+  dt.src { white-space: normal; }
 
-// separate types more
-.src {
-    margin-top: 2.5em;
-}
+  .top > .subs { margin-left: 2em; }
+  .subs p.src { margin-top: 1em; }
 
-h1 + .top .src,
-h2 + .top .src,
-h3 + .top .src,
-.caption + .top .src {
-  margin-top: 0;
-}
+  dt > code, .complexity, .version { @extend %label; }
+  .complexity, .version { @extend %label-green; }
 
-// but not for first type
-h1 + .top,
-h2 + .top,
-h3 + .top,
-h4 + .top {
-  margin-top: 0;
-}
+  table { margin: 1em 0; }
+  td > pre { margin: 0; }
 
-// change color of example code
-.example {
-    border: 1px solid;
-    background: #faf9e2;
-    border-color: #dddaaa #dddaaa #d7d7a9;
+  .warning { @extend %note; }
 }

+ 31 - 130
lib/docs/filters/haskell/clean_html.rb

@@ -2,146 +2,54 @@ module Docs
   class Haskell
     class CleanHtmlFilter < Filter
       def call
+        root_page? ? root : other
+        doc
+      end
 
-        # remove unwanted elements
-        css('#footer', '#package-header', '#module-header', '#synopsis', '.link', '#table-of-contents', '.package').remove
-
-        # cpations in tables are h3
-        css('table .caption').each do |node|
-          node.name = 'h3'
-        end
-
-        # turn captions into real headers
-        css('.caption').each do |node|
-          node.name = 'h1'
-        end
-
-        # section
-        css('.top > .caption').each do |node|
-          node.name = 'h2'
-        end
-
-        # subsections
-        css('.top > .subs > .caption', '.fields > .caption').each do |node|
-          node.name = 'h3'
-        end
-
-        # subsubsections
-        css('.top > .subs > .subs > .caption').each do |node|
-          node.name = 'h4'
-        end
-
-        # ...
-        css('.top > .subs > .subs > .subs > .caption').each do |node|
-          node.name = 'h5'
-        end
-
-        # ......
-        css('.top > .subs > .subs > .subs > .subs > .caption').each do |node|
-          node.name = 'h6'
-        end
-
-        # all pre's are examples
-        css('pre').each do |node|
-          node.add_css_class('example')
+      def root
+        css('#description', '#module-list').each do |node|
+          node.before(node.children).remove
         end
+      end
 
-        # turn source listing in to pre
-        css('.src').each do |node|
-          if node.name != "td"
-            node.name = 'pre'
-          end
+      def other
+        css('h1').each do |node|
+          node.remove if node.content == 'Documentation'
         end
 
-        # check if second column of table is totally empty.
-        # and remove it if it is
-        css('table').each do |table|
-          empty = true
-          table.css('td + td').each do |snd|
-            empty = empty && snd['class'] =~ /empty/
-          end
-          if empty
-            # remove empty column
-            table.css('td + td').remove
-          end
+        css('h1, h2, h3, h4').each do |node|
+          node.name = node.name.sub(/\d/) { |i| i.to_i + 1 }
         end
 
-        # move table captions into the tables
-        css(".caption + table").each do |table|
-          caption = table.previous
-          caption.name = "caption"
-          caption.parent = table
+        at_css('#module-header').tap do |node|
+          heading = at_css('.caption')
+          heading.name = 'h1'
+          node.before(heading)
+          node.before(node.children).remove
         end
 
-        css(".caption + .show table").each do |table|
-          caption = table.parent.parent.css('.caption')[0]
-          caption.name = 'caption'
-          caption.parent = table
-        end
+        css('#synopsis').remove
 
-        # better arguments display:
-        css('.src + .arguments table').each do |table|
-          src = table.parent.previous # the function name
-          row = doc.document.create_element('tr')
-          table.css('tr')[0].before(row)
-          src.parent = row
-          src.name = "th"
-          src['colspan'] = 2
+        css('#interface', 'h2 code').each do |node|
+          node.before(node.children).remove
         end
 
-        # remove root page title
-        if root_page?
-          at_css('h1').remove
+        css('a[name]').each do |node|
+          node['id'] = node['name']
+          node.remove_attribute('name')
         end
 
-        # add id to links (based on name)
-        css('a').each do |node|
-          if node['name']
-            node['id'] = node['name']
-          end
-        end
-
-        # make code in description into proper pre
-        css('dd > code').each do |node|
-          node.name = 'pre'
+        css('p.caption').each do |node|
+          node.name = 'h4'
         end
 
-        # add some informational boxes
         css('em').each do |node|
-          if node.content == 'Deprecated.'
-            # Make deprecated messages red.
-            node.parent.add_css_class('warning')
-          elsif node.content =~ /O\(.*\)/
-            # this is big_O notation, but only apply the class if this is not
-            # inside running text (it must be at the start of a paragraph)
-            # from:
-            # <p><em>O(n)</em>. Koel ok</p>
-            # to:
-            # <p class="with-complexity">
-            #   <span class="complexity">O(n)</span>
-            #   <span>Koel ok</span>
-            # </p>
-            if node.previous == nil
-              node.add_css_class('complexity')                        # add css class
-              node.name="span"                                        # just make it div
-              node.next.content = node.next.content.gsub(/^. /, "")   # remove . if directly after em
-              node.content = node.content.gsub(/\.$/, "")             # remove trailing . if it's inside em
-
-              # reparent the nodes
-              cont = doc.document.create_element "p", :class => "with-complexity"
-              node.parent.previous = cont
-              par = node.parent
-              node.parent = cont
-              par.parent = cont
-              par.name = "span"
-            end
-          elsif node.content =~ /Since: .*/
-            # add box to 'Since:' annotations
-            if node.parent.parent.name == "td"
-              node.parent.parent.add_css_class('added-cell')
-            else
-              node.add_css_class('added')
-            end
+          if node.content.start_with?('O(')
+            node.name = 'span'
+            node['class'] = 'complexity'
+          elsif node.content.start_with?('Since')
+            node.name = 'span'
+            node['class'] = 'version'
           end
         end
 
@@ -150,10 +58,3 @@ module Docs
     end
   end
 end
-
-class Nokogiri::XML::Node
-  def add_css_class( *classes )
-    existing = (self['class'] || "").split(/\s+/)
-    self['class'] = existing.concat(classes).uniq.join(" ")
-  end
-end

+ 40 - 41
lib/docs/filters/haskell/entries.rb

@@ -1,55 +1,54 @@
 module Docs
   class Haskell
     class EntriesFilter < Docs::EntriesFilter
+      IGNORE_ENTRIES_PATHS = %w(
+        bytestring-0.10.4.0/Data-ByteString-Lazy.html
+        bytestring-0.10.4.0/Data-ByteString-Char8.html
+        bytestring-0.10.4.0/Data-ByteString-Lazy-Char8.html
+        array-0.5.0.0/Data-Array-IArray.html
+        containers-0.5.5.1/Data-IntMap-Lazy.html
+        containers-0.5.5.1/Data-Map-Lazy.html
+        unix-2.7.0.1/System-Posix-Files-ByteString.html
+        filepath-1.3.0.2/System-FilePath-Windows.html
+        transformers-0.3.0.0/Control-Monad-Trans-RWS-Lazy.html
+        transformers-0.3.0.0/Control-Monad-Trans-Writer-Lazy.html
+        base-4.7.0.0/GHC-Conc-Sync.html
+        base-4.7.0.0/GHC-IO-Encoding-UTF32.html
+        unix-2.7.0.1/System-Posix-Terminal-ByteString.html)
 
-      # gets name and type in one fell swoop
-      # 
-      # eg.
-      #  Control.Monad > [Monad, Control]
-      #  Control.Concurrent.Mvar > [Concurrent.MVar, Control]
-      #  Array > [Array, nil]
-      def get_name_and_type
-        if at_css('h1') && at_css('h1').content == 'Haskell Hierarchical Libraries'
-          puts 'ok'
-          name = 'Haskell'
-          type = nil
-        else
-          # find full module identifier
-          caption = at_css('#module-header .caption')
+      def get_name
+        at_css('#module-header .caption').content.strip
+      end
 
-          if caption
-            # split the module path
-            parts   = caption.content.split('.')
+      def get_type
+        %w(System.Posix System.Win32 Control.Monad).each do |type|
+          return type if name.start_with?(type)
+        end
 
-            if parts.length > 1
-              # if more than one part then the 
-              # first is the type and the rest is the name
-              type = parts[0]
-              name = parts.drop(1).join('.')
-            else
-              # if only one part, this is the name
-              name = parts[0]
-              type = nil
-            end
-          else
-            # no caption found -> no type / no name
-            name = 'no-name'
-            type = 'no-type'
-          end
+        if name.start_with?('Data')
+          name.split('.')[0..1].join('.')
+        else
+          name.split('.').first
         end
-        [name, type]
       end
 
-      # get the name
-      def get_name
-        n, t = get_name_and_type()
-        n
+      def additional_entries
+        return [] if IGNORE_ENTRIES_PATHS.include?(subpath)
+
+        css('#synopsis > ul > li').each_with_object [] do |node, entries|
+          link = node.at_css('a')
+          next unless link['href'].start_with?('#')
+          name = node.content.strip
+          name.remove! %r{\A(?:module|data|newtype|class|type family m|type)\s+}
+          name.sub! %r{\A\((.+?)\)}, '\1'
+          name.sub!(/ (?:\:\: (\w+))?.+\z/) { |_| $1 ? " (#{$1})" : '' }
+          next if name == self.name
+          entries << [name, link['href'].remove('#')]
+        end
       end
 
-      # get the type
-      def get_type
-        n, t = get_name_and_type()
-        t
+      def include_default_entry?
+        at_css('#synopsis > ul > li')
       end
     end
   end

+ 19 - 11
lib/docs/scrapers/haskell.rb

@@ -1,24 +1,32 @@
 module Docs
   class Haskell < UrlScraper
     self.name = 'Haskell'
-    self.slug = 'haskell'
     self.type = 'haskell'
     self.version = '7.8.2'
     self.base_url = 'http://www.haskell.org/ghc/docs/7.8.2/html/libraries/'
-    self.initial_paths = ['/index.html']
+    self.root_path = 'index.html'
 
-    html_filters.push 'haskell/entries'
-    html_filters.push 'haskell/clean_html'
-    html_filters.push 'title'
+    html_filters.push 'haskell/entries', 'haskell/clean_html'
 
+    options[:container] = '#content'
 
-    options[:container]     = '#content'
-    options[:skip_patterns] = [/src/, /index/, /haskell2010/, /ghc-/, /Cabal-/]   # skip source listings and index files
+    options[:skip] = %w(
+      hoopl-3.10.0.1/Compiler-Hoopl-Internals.html
+      base-4.7.0.0/Control-Exception-Base.html
+      binary-0.7.1.0/Data-Binary-Get-Internal.html
+      template-haskell-2.9.0.0/Language-Haskell-TH-Lib.html
+      haskell98-2.0.0.3/Prelude.html
+      pretty-1.1.1.1/Text-PrettyPrint.html
+      base-4.7.0.0/Data-OldTypeable-Internal.html
+      base-4.7.0.0/Data-Typeable-Internal.html
+      base-4.7.0.0/GHC-IO-Encoding-Types.html
+      unix-2.7.0.1/System-Posix-Process-Internals.html)
+
+    options[:skip_patterns] = [/src\//, /doc-index/, /haskell2010/, /ghc-/, /Cabal-/]
 
     options[:attribution] = <<-HTML
-      &copy; The University Court of the University of Glasgow.<br>
-      All rights reserved. <a href="http://www.haskell.org/ghc/license">See here for more info</a>
+      &copy; The University of Glasgow and others<br>
+      Licensed under a BSD-style license (see top of the page).
     HTML
-
-    end
+  end
 end

二進制
public/icons/docs/haskell/16.png


二進制
public/icons/docs/haskell/16@2x.png


+ 1 - 0
public/icons/docs/haskell/SOURCE

@@ -0,0 +1 @@
+http://www.haskell.org/haskellwiki/Thompson-Wheeler_logo