Browse Source

Add image scraping and optimization filter

Rel: #633
Thibaut Courouble 8 years ago
parent
commit
a6855329e8

+ 24 - 0
.image_optim.yml

@@ -0,0 +1,24 @@
+verbose: false
+skip_missing_workers: true # presumably: skip workers whose binaries are unavailable instead of failing; confirm in image_optim docs
+allow_lossy: true # presumably: permits lossy workers/optimizations (e.g. pngquant below); confirm in image_optim docs
+advpng: false # NOTE(review): a worker set to `false` appears to be disabled entirely; confirm
+gifsicle:
+  interlace: false
+  level: 3
+  careful: true
+jhead: false
+jpegoptim:
+  strip: all
+  max_quality: 100
+jpegrecompress: false
+jpegtran: false
+optipng:
+  level: 3
+  interlace: false
+  strip: true
+pngcrush: false
+pngout: false
+pngquant:
+  quality: !ruby/range 80..99 # Ruby-specific YAML tag: loads as a Range, so the loader must permit Ruby object tags
+  speed: 3
+svgo: false

+ 2 - 0
Gemfile

@@ -32,6 +32,8 @@ group :docs do
   gem 'typhoeus'
   gem 'nokogiri'
   gem 'html-pipeline'
+  gem 'image_optim'
+  gem 'image_optim_pack', platforms: :ruby
   gem 'progress_bar', require: false
   gem 'unix_utils', require: false
   gem 'tty-pager', require: false

+ 16 - 0
Gemfile.lock

@@ -25,12 +25,25 @@ GEM
       ffi (>= 1.3.0)
     eventmachine (1.2.3)
     execjs (2.7.0)
+    exifr (1.3.1)
     ffi (1.9.18)
+    fspath (3.1.0)
     highline (1.7.8)
     html-pipeline (2.6.0)
       activesupport (>= 2)
       nokogiri (>= 1.4)
     i18n (0.8.4)
+    image_optim (0.25.0)
+      exifr (~> 1.2, >= 1.2.2)
+      fspath (~> 3.0)
+      image_size (~> 1.5)
+      in_threads (~> 1.3)
+      progress (~> 3.0, >= 3.0.1)
+    image_optim_pack (0.5.0.20170712)
+      fspath (>= 2.1, < 4)
+      image_optim (~> 0.19)
+    image_size (1.5.0)
+    in_threads (1.4.0)
     method_source (0.8.2)
     mini_portile2 (2.2.0)
     minitest (5.10.2)
@@ -39,6 +52,7 @@ GEM
     nokogiri (1.8.0)
       mini_portile2 (~> 2.2.0)
     options (2.3.2)
+    progress (3.3.1)
     progress_bar (1.1.0)
       highline (~> 1.6)
       options (~> 2.3.0)
@@ -109,6 +123,8 @@ DEPENDENCIES
   coffee-script
   erubi
   html-pipeline
+  image_optim
+  image_optim_pack
   minitest
   nokogiri
   progress_bar

+ 1 - 0
lib/docs/core/requester.rb

@@ -20,6 +20,7 @@ module Docs
     # Builds the requester from an options hash.
     #
     # :request_options is removed from +options+ and kept (as a dup, or {} when
     # absent) in @request_options; the remaining options are passed on to the
     # superclass initializer.
     def initialize(options = {})
       @request_options = options.extract!(:request_options)[:request_options].try(:dup) || {}
       options[:max_concurrency] ||= 20
+      options[:pipelining] = 0 # always overrides any caller-supplied value; presumably disables HTTP pipelining in the superclass (TODO confirm)
       super
     end
 

+ 72 - 0
lib/docs/filters/core/images.rb

@@ -0,0 +1,72 @@
+# frozen_string_literal: true
+
+module Docs
+  class ImagesFilter < Filter
+    include Instrumentable
+
+    def self.optimize_image_data(data)
+      @image_optim ||= ImageOptim.new
+      @image_optim.optimize_image_data(data)
+    end
+
+    def call
+      @@cache ||= {}
+
+      doc.css('img[src]').each do |node|
+        src = node['src']
+
+        if @@cache.key?(src)
+          node['src'] = @@cache[src] unless @@cache[src] == false
+          next
+        end
+
+        @@cache[src] = false
+
+        url = Docs::URL.parse(src)
+        url.scheme = 'https' if url.scheme.nil?
+        next unless url.scheme == 'http' || url.scheme == 'https'
+
+        begin
+          Request.run(url) do |response|
+            unless response.success?
+              instrument 'broken.image', url: url, status: response.code
+              next
+            end
+
+            unless response.mime_type.start_with?('image/')
+              instrument 'invalid.image', url: url, content_type: response.mime_type
+              next
+            end
+
+            image = response.body
+
+            unless context[:optimize_images] == false
+              image = self.class.optimize_image_data(image) || image
+            end
+
+            size = image.bytesize
+
+            if size > max_size
+              instrument 'too_big.image', url: url, size: size
+              next
+            end
+
+            image = Base64.strict_encode64(image)
+            image.prepend "data:#{response.mime_type};base64,"
+            node['src'] = @@cache[src] = image
+          end
+        rescue => exception
+          instrument 'error.image', url: url, exception: exception
+        end
+      end
+
+      doc
+    end
+
+    private
+
+    def max_size
+      @max_size ||= context[:max_image_size] || 100.kilobytes
+    end
+  end
+end

+ 27 - 0
lib/docs/subscribers/image_subscriber.rb

@@ -0,0 +1,27 @@
+# frozen_string_literal: true
+
+module Docs
+  # Reports image events under the 'image' namespace. Each handler reads the
+  # payload keys that the images filter passes to `instrument`.
+  class ImageSubscriber < Subscriber
+    self.namespace = 'image'
+
+    # Logs an image whose request was unsuccessful. The filter publishes the
+    # HTTP response code under the :status key, so that is the key read here.
+    def broken(event)
+      log "Skipped broken image (#{event.payload[:status]}): #{event.payload[:url]}"
+    end
+
+    # Logs a response whose content type is not an image.
+    def invalid(event)
+      log "Skipped invalid image (#{event.payload[:content_type]}): #{event.payload[:url]}"
+    end
+
+    # Logs an image that exceeded the size limit; the size is shown in KB,
+    # rounded to the nearest integer.
+    def too_big(event)
+      log "Skipped large image (#{(event.payload[:size] / 1.kilobyte.to_f).round} KB): #{event.payload[:url]}"
+    end
+
+    # Logs an exception raised while fetching or processing an image, then
+    # prints its class, its message (continuation lines indented) and only the
+    # backtrace lines that start with Docs.root_path.
+    def error(event)
+      exception = event.payload[:exception]
+      log "ERROR: #{event.payload[:url]}"
+      puts "  #{exception.class}: #{exception.message.gsub("\n", "\n    ")}"
+      puts exception.backtrace.select { |line| line.start_with?(Docs.root_path) }.join("\n  ").prepend("\n  ")
+      puts "\n"
+    end
+  end
+end

+ 2 - 1
lib/tasks/docs.thor

@@ -35,6 +35,7 @@ class DocsCLI < Thor
       return puts 'ERROR: [path] must be an absolute path.'
     end
 
+    Docs.install_report :image
     Docs.install_report :store if options[:verbose]
     if options[:debug]
       GC.disable
@@ -61,7 +62,7 @@ class DocsCLI < Thor
     Docs.rescue_errors = true
     Docs.install_report :store if options[:verbose]
     Docs.install_report :scraper if options[:debug]
-    Docs.install_report :progress_bar, :doc if $stdout.tty?
+    Docs.install_report :progress_bar, :doc, :image if $stdout.tty?
 
     require 'unix_utils' if options[:package]