Browse Source

Improve PHP scraper

Fixes #20
Thibaut 12 years ago
parent
commit
cad60c6170
3 changed files with 148 additions and 202 deletions
  1. + 92 - 116
      lib/docs/filters/php/entries.rb
  2. + 19 - 0
      lib/docs/filters/php/internal_urls.rb
  3. + 37 - 86
      lib/docs/scrapers/php.rb

+ 92 - 116
lib/docs/filters/php/entries.rb

@@ -1,136 +1,112 @@
 module Docs
   class Php
     class EntriesFilter < Docs::EntriesFilter
-      TYPES = {
-      # [name-begin-with]   => [type]
-        'AMQP'              => 'AMQP',
-        'APCIterator'       => 'APC',
-        'CURL'              => 'cURL',
-        'Date'              => 'Date and Time',
-        'DirectoryIterator' => 'Standard PHP Library',
-        'Directory'         => 'Directories',
-        'DOM'               => 'DOM',
-        'Gearman'           => 'Gearman',
-        'Gmagick'           => 'Gmagick',
-        'Http'              => 'HTTP',
-        'Imagick'           => 'Imagick',
-        'Collator'          => 'Internationalization',
-        'NumberFormatter'   => 'Internationalization',
-        'Locale'            => 'Internationalization',
-        'MessageFormatter'  => 'Internationalization',
-        'Normalizer'        => 'Internationalization',
-        'Intl'              => 'Internationalization',
-        'intl'              => 'Internationalization',
-        'ResourceBundle'    => 'Internationalization',
-        'Spoofchecker'      => 'Internationalization',
-        'Transliterator'    => 'Internationalization',
-        'UConverter'        => 'Internationalization',
-        'grapheme'          => 'Internationalization',
-        'idn'               => 'Internationalization',
-        'Json'              => 'JSON',
-        'mysqli'            => 'mysqli',
-        'OAuth'             => 'OAuth',
-        'PDO'               => 'PDO',
-        'Thread'            => 'pthreads',
-        'Worker'            => 'pthreads',
-        'Stackable'         => 'pthreads',
-        'Mutex'             => 'pthreads',
-        'Cond'              => 'pthreads',
-        'Exception'         => 'Predefined Exceptions',
-        'ErrorException'    => 'Predefined Exceptions',
-        'QuickHash'         => 'QuickHash',
-        'Reflection'        => 'Reflection',
-        'Reflector'         => 'Reflection',
-        'Session'           => 'Sessions',
-        'SimpleXML'         => 'SimpleXML',
-        'Soap'              => 'SOAP',
-        'Solr'              => 'Solr',
-        'Sphinx'            => 'Sphinx',
-        'Spl'               => 'Standard PHP Library',
-        'ArrayObject'       => 'Standard PHP Library',
-        'Countable'         => 'Standard PHP Library',
-        'SQLite3'           => 'SQLite3',
-        'streamWrapper'     => 'Streams',
-        'php_user_filter'   => 'Streams',
-        'tidy'              => 'Tidy',
-        'V8Js'              => 'V8js',
-        'Varnish'           => 'Varnish',
-        'Weakref'           => 'Weak References',
-        'WeakRef'           => 'Weak References',
-        'WeakMap'           => 'Weak References',
-        'XSLTProcessor'     => 'XSLT',
-        'XsltProcessor'     => 'XSLT',
-        'Yaf'               => 'Yaf',
-        'ZipArchive'        => 'Zip' }
+      TYPE_BY_NAME_STARTS_WITH = {
+        'ArrayObject'     => 'SPL',
+        'Cond'            => 'pthreads',
+        'CURL'            => 'cURL',
+        'Date'            => 'Date/Time',
+        'ErrorException'  => 'Predefined Exceptions',
+        'Exception'       => 'Predefined Exceptions',
+        'Json'            => 'JSON',
+        'Http'            => 'HTTP',
+        'Mutex'           => 'pthreads',
+        'php_user_filter' => 'Stream',
+        'Reflector'       => 'Reflection',
+        'Soap'            => 'SOAP',
+        'SplFile'         => 'SPL/File',
+        'SplTempFile'     => 'SPL/File',
+        'Spl'             => 'SPL',
+        'Stackable'       => 'pthreads',
+        'streamWrapper'   => 'Stream',
+        'Thread'          => 'pthreads',
+        'tidy'            => 'Tidy',
+        'Worker'          => 'pthreads',
+        'XsltProcessor'   => 'XSLT',
+        'ZipArchive'      => 'Zip' }
+
+      %w(APC AMQP Directory DOM Gearman Gmagick Imagick mysqli OAuth PDO
+         Reflection Session SimpleXML Solr Sphinx SQLite3 Varnish XSLT Yaf).each do |str|
+        TYPE_BY_NAME_STARTS_WITH[str] = str
+      end
+
+      %w(ArrayAccess Closure Generator Iterator IteratorAggregate Serializable Traversable).each do |str|
+        TYPE_BY_NAME_STARTS_WITH[str] = 'Predefined Interfaces and Classes'
+      end
+
+      %w(Collator grapheme idn Intl intl Locale MessageFormatter Normalizer
+         NumberFormatter ResourceBundle Spoofchecker Transliterator UConverter).each do |str|
+        TYPE_BY_NAME_STARTS_WITH[str] = 'Internationalization'
+      end
+
+      %w(Countable OuterIterator RecursiveIterator SeekableIterator ).each do |str|
+        TYPE_BY_NAME_STARTS_WITH[str] = 'SPL/Interfaces'
+      end
 
       REPLACE_TYPES = {
-      # [original-type]     => [new-type]
-        'Array'             => 'Arrays',
-        'Bzip2'             => 'bzip2',
-        'Classes/Object'    => 'Classes and Objects',
-        'Date/Time'         => 'Date and Time',
-        'Directory'         => 'Directories',
-        'Exceptions'        => 'Standard PHP Library',
-        'Function handling' => 'Function Handling',
-        'GD and Image'      => 'GD',
-        'Gettext'           => 'gettext',
-        'Inotify'           => 'inotify',
-        'Interfaces'        => 'Standard PHP Library',
-        'Iterators'         => 'Standard PHP Library',
-        'Libevent'          => 'libevent',
-        'Mailparse'         => 'Mail',
-        'Misc.'             => 'Miscellaneous',
-        'Multibyte String'  => 'Multibyte Strings',
-        'PCRE'              => 'Regular Expressions',
-        'PHP Options/Info'  => 'Options and Info',
-        'POSIX Regex'       => 'Regular Expressions',
-        'Program execution' => 'Program Execution',
+        'Exceptions'        => 'SPL/Exceptions',
+        'GD and Image'      => 'Image',
+        'Gmagick'           => 'Image/GraphicsMagick',
+        'Imagick'           => 'Image/ImageMagick',
+        'Interfaces'        => 'SPL/Interfaces',
+        'Iterators'         => 'SPL/Iterators',
+        'mysqli'            => 'Database/MySQL',
+        'PostgreSQL'        => 'Database/PostgreSQL',
         'Session'           => 'Sessions',
-        'Session PgSQL'     => 'PostgreSQL',
-        'SPL'               => 'Standard PHP Library',
-        'Statistic'         => 'Statistics',
+        'Session PgSQL'     => 'Database/PostgreSQL',
+        'SQLite3'           => 'Database/SQLite',
+        'SQLSRV'            => 'Database/SQL Server',
         'Stream'            => 'Streams',
-        'String'            => 'Strings',
-        'Variable handling' => 'Variable Handling',
-        'XMLReader'         => 'XML Reader',
-        'XMLWriter'         => 'XML Writer',
-        'Yaml'              => 'YAML',
-        'Zlib'              => 'zlib' }
-
-      IGNORE_SLUGS = %w(reserved.exceptions reserved.interfaces
-        reserved.variables)
+        'Yaml'              => 'YAML' }
 
-      def include_default_entry?
-        !(slug.start_with?('book') || IGNORE_SLUGS.include?(slug))
-      end
+      TYPE_GROUPS = {
+        'Classes and Functions' => ['Classes/Object', 'Function handling', 'Predefined Interfaces and Classes', 'runkit'],
+        'Encoding'              => ['Gettext', 'iconv', 'Multibyte String'],
+        'Compression'           => ['Bzip2', 'Zip', 'Zlib'],
+        'Cryptography'          => ['Hash', 'Mcrypt', 'OpenSSL', 'Password Hashing'],
+        'Database'              => ['DBA', 'ODBC', 'PDO'],
+        'Date and Time'         => ['Calendar', 'Date/Time'],
+        'Errors'                => ['Error Handling', 'Predefined Exceptions'],
+        'File System'           => ['Directory', 'Fileinfo', 'Filesystem', 'Inotify'],
+        'HTML'                  => ['DOM', 'Tidy'],
+        'Language'              => ['Control Structures', 'Misc.', 'PHP Options/Info', 'Predefined Variables'],
+        'Mail'                  => ['Mail', 'Mailparse'],
+        'Mathematics'           => ['BC Math', 'Math', 'Statistic'],
+        'Networking'            => ['GeoIP', 'Network', 'Output Control', 'SSH2', 'Socket', 'URL'],
+        'Process Control'       => ['Eio', 'Libevent', 'POSIX', 'Program execution', 'pthreads'],
+        'String'                => ['Ctype', 'PCRE', 'POSIX Regex', 'Taint'],
+        'Variables'             => ['Filter', 'Variable handling'],
+        'XML'                   => ['libxml', 'SimpleXML', 'XML Parser', 'XML-RPC', 'XMLReader', 'XMLWriter', 'XSLT'] }
 
       def get_name
+        return 'IntlException' if slug == 'class.intlexception'
         name = css('> .sect1 > .title', 'h1', 'h2').first.content
-
-        if name == 'Exception class for intl errors'
-          'IntlException'
-        else
-          name.sub! 'The ', ''
-          name.sub! ' class', ' (class)'
-          name.sub! ' interface', ' (interface)'
-          name
-        end
+        name.sub! 'The ', ''
+        name.sub! ' class', ' (class)'
+        name.sub! ' interface', ' (interface)'
+        name
       end
 
       def get_type
-        if key = TYPES.keys.detect { |t| name.start_with?(t) }
-          TYPES[key]
-        else
-          type = at_css('.up').content.strip
-          type.sub! ' Functions', ''
-          type.sub! ' Obsolete Aliases and', ''
+        type = at_css('.up').content.strip
+        type = 'SPL/Iterators' if type.end_with? 'Iterator'
+        type.sub! ' Functions', ''
+
+        TYPE_BY_NAME_STARTS_WITH.each_pair do |key, value|
+          break type = value if name.start_with?(key)
+        end
 
-          if type.end_with? 'Iterator'
-            'Standard PHP Library'
-          else
-            REPLACE_TYPES[type] || type
+        TYPE_GROUPS.each_pair do |replacement, types|
+          types.each do |t|
+            return replacement if type == t
           end
         end
+
+        REPLACE_TYPES[type] || type
+      end
+
+      def include_default_entry?
+        Php::INDEX_PATHS.exclude?(subpath) && doc.at_css('.reference', '.refentry', '.sect1')
       end
     end
   end

+ 19 - 0
lib/docs/filters/php/internal_urls.rb

@@ -0,0 +1,19 @@
+module Docs
+  class Php
+    class InternalUrlsFilter < Filter
+      def call
+        if subpath.start_with?('book.') || subpath.start_with?('class.')
+          result[:internal_urls] = internal_urls
+        end
+        doc
+      end
+
+      def internal_urls
+        css('.book a', '.chunklist a').inject [] do |urls, link|
+          urls << link['href'] if link.next.try(:text?) && link['href'].exclude?('ref.pdo-')
+          urls
+        end
+      end
+    end
+  end
+end

+ 37 - 86
lib/docs/scrapers/php.rb

@@ -1,111 +1,62 @@
 module Docs
   class Php < FileScraper
-    # WARNING: if you are the kind of developer who likes to automate things,
-    # this scraper will hurt your feelings.
-
     self.name = 'PHP'
     self.type = 'php'
     self.version = 'up to 5.5.6'
     self.base_url = 'http://www.php.net/manual/en/'
-    self.root_path = 'extensions.alphabetical.html'
+    self.root_path = 'index.html'
 
     # Downloaded from php.net/download-docs.php
     self.dir = '/Users/Thibaut/DevDocs/Docs/PHP'
 
-    html_filters.push 'php/entries', 'php/clean_html', 'title'
+    html_filters.push 'php/internal_urls', 'php/entries', 'php/clean_html', 'title'
     text_filters.push 'php/fix_urls'
 
     options[:title] = false
     options[:root_title] = 'PHP: Hypertext Preprocessor'
 
-    options[:only] = [] # using a whitelist
-
-    options[:only_patterns] = [/\Afunction\.\w+\.html\z/,
-      /\Areserved\.exceptions/, /\Areserved\.interfaces/,
-      /\Areserved\.variables/, /\Acontrol\-structures/]
+    INDEX_PATHS = %w(
+      index.html
+      funcref.html
+      refs.database.html
+      set.mysqlinfo.html
+      language.control-structures.html
+      reserved.exceptions.html
+      reserved.interfaces.html
+      reserved.variables.html)
 
-    # TODO: MongoDB, Phar
-    BOOKS = %w(amqp apache apc array bc bzip2 calendar classkit classobj com
-      ctype curl datetime dba dir dom eio errorfunc exec fileinfo filesystem
-      filter ftp funchand gearman geoip gettext gmagick hash http iconv iisfunc
-      image imagick imap info inotify intl json ldap libevent libxml mail
-      mailparse math mbstring mcrypt memcached misc mysqli network oauth
-      openssl outcontrol password pcre pdo pgsql posix pthreads quickhash
-      readline regex runkit reflection session session-pgsql simplexml soap
-      sockets solr sphinx spl spl-types sqlite3 sqlsrv ssh2 stats stream
-      strings taint tidy url v8js var varnish weakref xml xmlreader xmlrpc
-      xmlwriter xsl yaf yaml zip zlib uodbc)
-    options[:only].concat BOOKS.map { |s| "book.#{s}.html" }
-    options[:only_patterns].concat BOOKS.map { |s| /\Afunction\.#{s}(?:\.|\-)/ }
+    options[:skip_links] = ->(filter) do
+      INDEX_PATHS.exclude?(filter.subpath)
+    end
 
-    CLASSES = %w(apciterator curlfile dateinterval dateperiod collator
-      numberformatter locale normalizer messageformatter resourcebundle
-      spoofchecker transliterator uconverter memcached thread worker stackable
-      mutex cond runkit reflector sessionhandler sessionhandlerinterface
-      sphinxclient countable arrayobject streamwrapper xmlreader xsltprocessor
-      ziparchive exception errorexception)
-    options[:only].concat CLASSES.map { |s| "class.#{s}.html" }
-    options[:only_patterns].concat CLASSES.map { |s| /\A#{s}\./ }
+    options[:only] = INDEX_PATHS.dup
 
-    FUNCTION_PREFIXES = %w(assert base base64 cal call chunk class cli
-      connection convert count create date debug define disk dns easter ereg
-      eregi error event file finfo forward func gc gd get grapheme halt header
-      headers highlight html http idn iis in inet ini is iterator magic mb md5
-      mdecrypt memory mime move mt nl ob output parse pg php preg print proc
-      quoted realpath register restore set sha1 shell show stream socket spl
-      str sys tidy time timezone unregister use utf8 variant xml)
-    options[:only_patterns].concat FUNCTION_PREFIXES.map { |s| /\Afunction\.#{s}\-/ }
+    options[:only_patterns] = [
+      /\Aclass\./,
+      /\Afunction\./,
+      /\Acontrol-structures/,
+      /\Areserved\.exceptions/,
+      /\Areserved\.interfaces/,
+      /\Areserved\.variables/]
 
-    FUNCTIONS = %w(trigger-error user-error require-once include-once)
-    options[:only].concat FUNCTIONS.map { |s| "function.#{s}.html" }
+    BOOKS = %w(amqp apache apc array bc bzip2 calendar classobj ctype curl
+      datetime dba dir dom eio errorfunc exec fileinfo filesystem filter ftp
+      funchand gearman geoip gettext gmagick hash http iconv iisfunc image
+      imagick imap info inotify intl json ldap libevent libxml mail mailparse
+      math mbstring mcrypt memcached misc mysqli network oauth openssl
+      outcontrol password pcre pdo pgsql posix pthreads regex runkit reflection
+      session session-pgsql simplexml soap sockets solr sphinx spl spl-types
+      sqlite3 sqlsrv ssh2 stats stream strings taint tidy uodbc url var varnish
+      xml xmlreader xmlrpc xmlwriter xsl yaf yaml zip zlib)
 
-    options[:only_patterns].concat [
-      /function\.\w+\-exists\.html\z/,
-      /\A\w+iterator\./,
-      /\Afunction\.bz\w+\.html\z/,
-      /\Aclass\.\w+iterator\.html\z/,
-      /\Aclass\.\w+exception\.html\z/,
-      /\Aclass\.amqp/, /\Aamqp/,
-      /\Aclass\.datetime/, /\Adatetime/,
-      /\Aclass\.dom/, /\Adom/,
-      /\Aclass\.gearman/, /\Agearman/,
-      /\Aclass\.gmagick/, /\Agmagick/,
-      /\Aclass\.http/, /\Ahttp/,
-      /\Aclass\.imagick/, /\Aimagick/,
-      /\Aclass\.intl/, /\Aintl/,
-      /\Aclass\.json/, /\Ajson/,
-      /\Aclass\.mysqli/, /\Amysqli/,
-      /\Aclass\.oauth/, /\Aoauth/,
-      /\Aclass\.pdo/, /\Apdo/,
-      /\Aclass\.quickhash/, /\Aquickhash/,
-      /\Aclass\.reflection/, /\Areflection/,
-      /\Aclass\.simplexml/, /\Asimplexml/,
-      /\Aclass\.soap/, /\Asoap/,
-      /\Aclass\.solr/, /\Asolr/,
-      /\Aclass\.spl/, /\Aspl/,
-      /\Aclass\.sqlite3/, /\Asqlite3/,
-      /\Aclass\.tidy/, /\Atidy/,
-      /\Aclass\.v8js/, /\Av8js/,
-      /\Aclass\.varnish/, /\Avarnish/,
-      /\Aclass\.weak/, /\Aweak/,
-      /\Aclass\.yaf\-/, /\Ayaf\-/]
+    options[:only].concat BOOKS.map { |s| "book.#{s}.html" }
 
-    options[:skip_patterns] = [/example/, /quickstart/, /\.setup\.html\z/,
-      /\.overview\.html\z/, /\.requirements\.html\z/, /\.installation\.html\z/,
-      /\.install\.html\z/, /\.configuration\.html\z/, /\.resources\.html\z/,
-      /\.constants\.html\z/, /\Amysqlinfo/, /\Adatetime\.formats/]
+    options[:skip] = %w(
+      control-structures.intro.html
+      control-structures.alternative-syntax.html
+      function.mssql-select-db.html)
 
-    options[:skip] = %w(control-structures.intro.html
-      control-structures.alternative-syntax.html memcached.expiration.html
-      memcached.callbacks.html memcached.callbacks.result.html
-      memcached.callbacks.read-through.html memcached.sessions.html
-      mysqli.persistconns.html mysqli.notes.html mysqli.summary.html
-      pdo.connections.html pdo.transactions.html pdo.prepared-statements.html
-      pdo.error-handling.html pdo.lobs.htm pdo.drivers.html
-      reflection.extending.html http.request.options.html
-      class.lapackexception.html class.snmpexception.html function.mhash.html
-      spl.datastructures.html spl.iterators.html spl.interfaces.html
-      spl.exceptions.html spl.files.html spl.misc.html)
+    options[:skip_patterns] = [/mysqlnd/]
 
     options[:attribution] = <<-HTML
       &copy; 1997&ndash;2013 The PHP Documentation Group<br>