require 'test_helper'
require 'docs'
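
# Specs for Docs::Scraper. The stub/mock/dont_allow/mock.proxy calls below
# follow the RR test-double API, presumably loaded via test_helper.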
class DocsScraperTest < MiniTest::Spec
  class Scraper < Docs::Scraper
    self.type = 'scraper'
    self.base_url = 'http://example.com/'
    self.root_path = '/root'
    self.initial_paths = ['/initial']
    self.html_filters = Docs::FilterStack.new
    self.text_filters = Docs::FilterStack.new
  end
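
  # The scraper under test is extended with FakeInstrumentation, assumed to be
  # a test_helper double that records the most recent instrumentation call as
  # #last_instrumentation. A minimal sketch of that assumed helper:
  #
  #   module FakeInstrumentation
  #     def last_instrumentation
  #       @last_instrumentation
  #     end
  #
  #     def instrument(event, payload = {})
  #       @last_instrumentation = { event: event, payload: payload }
  #       yield if block_given?
  #     end
  #   end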
  let :scraper do
    Scraper.new.tap do |scraper|
      scraper.extend FakeInstrumentation
    end
  end

  let :response do
    Struct.new(:body, :url).new
  end
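
  # Subclassing must copy the class-level configuration, duplicating mutable
  # values (initial_paths, options, filter stacks) so a subclass can't mutate
  # its parent's state.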
- describe ".inherited" do
- let :subclass do
- Class.new Scraper
- end
- it "sets .type" do
- assert_equal Scraper.type, subclass.type
- end
- it "sets .root_path" do
- assert_equal Scraper.root_path, subclass.root_path
- end
- it "duplicates .initial_paths" do
- stub(Scraper).initial_paths { ['path'] }
- assert_equal Scraper.initial_paths, subclass.initial_paths
- refute_same Scraper.initial_paths, subclass.initial_paths
- end
- it "duplicates .options" do
- stub(Scraper).options { { test: [] } }
- assert_equal Scraper.options, subclass.options
- refute_same Scraper.options, subclass.options
- refute_same Scraper.options[:test], subclass.options[:test]
- end
- it "duplicates .html_filters" do
- assert_equal Scraper.html_filters, subclass.html_filters
- refute_same Scraper.html_filters, subclass.html_filters
- end
- it "duplicates .text_filters" do
- assert_equal Scraper.text_filters, subclass.text_filters
- refute_same Scraper.text_filters, subclass.text_filters
- end
- end
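
  # .filters concatenates the two filter stacks, HTML filters first.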
- describe ".filters" do
- it "returns the union of .html_filters and .text_filters" do
- stub(Scraper.html_filters).to_a { [1] }
- stub(Scraper.text_filters).to_a { [2] }
- assert_equal [1, 2], Scraper.filters
- end
- end
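
  # #root_path? is true only when .root_path names an actual sub-path
  # (neither blank nor '/').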
- describe "#root_path?" do
- it "returns false when .root_path is blank" do
- stub(Scraper).root_path { '' }
- refute scraper.root_path?
- end
- it "returns false when .root_path is '/'" do
- stub(Scraper).root_path { '/' }
- refute scraper.root_path?
- end
- it "returns true when .root_path is '/path'" do
- stub(Scraper).root_path { '/path' }
- assert scraper.root_path?
- end
- end
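
  # #root_url is a memoized Docs::URL: the normalized .base_url, with
  # .root_path appended when one is set.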
- describe "#root_url" do
- let :root_url do
- scraper.root_url
- end
- context "when #root_path? is false" do
- before do
- stub(scraper).root_path? { false }
- end
- it "returns a memoized Docs::URL" do
- assert_instance_of Docs::URL, root_url
- assert_same root_url, scraper.root_url
- end
- it "returns the normalized .base_url" do
- stub(Scraper).base_url { 'http://example.com' }
- assert_equal 'http://example.com/', root_url.to_s
- end
- end
- context "when #root_path? is true" do
- before do
- stub(scraper).root_path? { true }
- end
- it "returns a memoized Docs::URL" do
- assert_instance_of Docs::URL, root_url
- assert_same root_url, scraper.root_url
- end
- it "returns .base_url + .root_path" do
- stub(Scraper).base_url { 'http://example.com/path/' }
- stub(Scraper).root_path { '/root' }
- assert_equal 'http://example.com/path/root', root_url.to_s
- end
- end
- end
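
  # #initial_urls is the frozen, memoized crawl seed list: #root_url plus each
  # .initial_paths entry expanded against .base_url.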
- describe "#initial_urls" do
- let :initial_urls do
- scraper.initial_urls
- end
- it "returns a frozen, memoized Array" do
- assert_instance_of Array, initial_urls
- assert initial_urls.frozen?
- assert_same initial_urls, scraper.initial_urls
- end
- it "includes the #root_url" do
- assert_includes initial_urls, scraper.root_url.to_s
- end
- it "includes the .initial_paths converted to urls" do
- stub(Scraper).base_url { 'http://example.com/' }
- stub(Scraper).initial_paths { ['one', '/two'] }
- assert_includes initial_urls, 'http://example.com/one'
- assert_includes initial_urls, 'http://example.com/two'
- end
- end
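
  # #build_page fetches a single path via #request_one and returns (and
  # yields) the #handle_response result; '' and '/' resolve to the root url.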
- describe "#build_page" do
- before do
- stub(scraper).handle_response
- end
- it "requires a path" do
- assert_raises ArgumentError do
- scraper.build_page
- end
- end
- context "with a blank path" do
- it "requests the root url" do
- mock(scraper).request_one(scraper.root_url.to_s)
- scraper.build_page ''
- end
- end
- context "with '/'" do
- it "requests the root url" do
- mock(scraper).request_one(scraper.root_url.to_s)
- scraper.build_page '/'
- end
- end
- context "with '/file'" do
- it "requests 'example.com/file' when the base url is 'example.com" do
- stub(Scraper).base_url { 'http://example.com' }
- mock(scraper).request_one 'http://example.com/file'
- scraper.build_page '/file'
- end
- it "requests 'example.com/file' when the base url is 'example.com/" do
- stub(Scraper).base_url { 'http://example.com/' }
- mock(scraper).request_one 'http://example.com/file'
- scraper.build_page '/file'
- end
- end
- it "returns the processed response" do
- stub(scraper).request_one { response }
- mock(scraper).handle_response(response) { 'test' }
- assert_equal 'test', scraper.build_page('')
- end
- it "yields the processed response" do
- stub(scraper).request_one { response }
- stub(scraper).handle_response(response) { 'test' }
- scraper.build_page('') { |arg| @arg = arg }
- assert @arg
- assert_equal 'test', @arg
- end
- end
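
  # #build_pages drives the full crawl: it requests #initial_urls, yields each
  # processed response, then queues that response's :internal_urls, skipping
  # urls already seen (irrespective of case) and instrumenting 'running' and
  # 'queued' events along the way.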
- describe "#build_pages" do
- let :block do
- Proc.new {}
- end
- let :processed_response do
- Hash.new
- end
- it "requests the #initial_urls" do
- mock(scraper).request_all(scraper.initial_urls)
- scraper.build_pages(&block)
- end
- it "instruments 'running'" do
- stub(scraper).request_all
- scraper.build_pages(&block)
- assert scraper.last_instrumentation
- assert_equal 'running.scraper', scraper.last_instrumentation[:event]
- assert_equal scraper.initial_urls, scraper.last_instrumentation[:payload][:urls]
- end
- context "when the response is processable" do
- before do
- stub(scraper).request_all do |urls, block|
- urls.each { |url| @next_urls ||= block.call(response) }
- end
- stub(scraper).handle_response(response) { processed_response }
- end
- it "yields the processed response" do
- scraper.build_pages { |arg| @arg = arg }
- assert_same processed_response, @arg
- end
- context "when :internal_urls is empty" do
- before do
- processed_response[:internal_urls] = []
- end
- it "requests nothing more" do
- scraper.build_pages(&block)
- assert_nil @next_urls
- end
- it "doesn't instrument 'queued'" do
- scraper.build_pages(&block)
- refute_equal 'queued.scraper', scraper.last_instrumentation.try(:[], :event)
- end
- end
- context "when :internal_urls isn't empty" do
- let :internal_urls do
- ['Url']
- end
- before do
- processed_response[:internal_urls] = internal_urls
- end
- it "requests the urls" do
- scraper.build_pages(&block)
- assert_equal internal_urls, @next_urls
- end
- it "doesn't request the same url twice irrespective of case" do
- processed_response[:internal_urls] = scraper.initial_urls.map(&:swapcase)
- scraper.build_pages(&block)
- assert_empty @next_urls
- end
- it "instruments 'queued'" do
- scraper.build_pages(&block)
- assert scraper.last_instrumentation
- assert_equal 'queued.scraper', scraper.last_instrumentation[:event]
- assert_equal internal_urls, scraper.last_instrumentation[:payload][:urls]
- end
- end
- end
- context "when the response isn't processable" do
- it "doesn't yield" do
- stub(scraper).request_all.yields(response)
- stub(scraper).handle_response(response) { nil }
- scraper.build_pages { @yield = true }
- refute @yield
- end
- end
- end
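
  # #options assembles the frozen options hash handed to filters: the
  # class-level .options merged with url and path settings, with :only and
  # :skip adjusted according to #root_path? and never mutated in place.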
- describe "#options" do
- let :options do
- Hash.new
- end
- let :result do
- scraper.options
- end
- before do
- stub(Scraper).options { options }
- end
- it "returns a frozen, memoized Hash" do
- assert_instance_of Hash, result
- assert result.frozen?
- assert_same result, scraper.options
- end
- it "includes .options" do
- options[:test] = true
- assert result[:test]
- end
- it "includes #base_url" do
- assert_equal scraper.base_url, result[:base_url]
- end
- it "includes #root_url" do
- assert_equal scraper.root_url, result[:root_url]
- end
- it "includes #root_path" do
- assert_equal '/root', result[:root_path]
- end
- it "includes #initial_paths" do
- assert_equal ['/initial'], result[:initial_paths]
- end
- it "adds #initial_paths to :only when it is an array" do
- options[:only] = ['/path']
- assert_includes result[:only], options[:only].first
- assert_includes result[:only], '/initial'
- end
- it "adds #initial_paths to :only when :only_patterns is an array" do
- options[:only_patterns] = []
- assert_includes result[:only], '/initial'
- end
- it "doesn't modify :only in place" do
- options[:only] = []
- result
- assert_empty options[:only]
- end
- context "when #root_path? is false" do
- before do
- stub(scraper).root_path? { false }
- end
- it "doesn't modify :skip" do
- options[:skip] = []
- assert_equal options[:skip], result[:skip]
- end
- it "adds '' and '/' to :only when it is an array" do
- options[:only] = ['/path']
- assert_includes result[:only], options[:only].first
- assert_includes result[:only], ''
- assert_includes result[:only], '/'
- end
- it "adds '' and '/' to :only when :only_patterns is an array" do
- options[:only_patterns] = []
- assert_includes result[:only], ''
- assert_includes result[:only], '/'
- end
- it "doesn't modify :only in place" do
- options[:only] = []
- result
- assert_empty options[:only]
- end
- end
- context "when #root_path? is true" do
- before do
- stub(scraper).root_path? { true }
- end
- it "adds '' and '/' to :skip when it is nil" do
- assert_includes result[:skip], ''
- assert_includes result[:skip], '/'
- end
- it "adds '' and '/' to :skip when it is an array" do
- options[:skip] = ['/path']
- assert_includes result[:skip], options[:skip].first
- assert_includes result[:skip], ''
- assert_includes result[:skip], '/'
- end
- it "doesn't modify :skip in place" do
- options[:skip] = []
- result
- assert_empty options[:skip]
- end
- it "adds #root_path to :only when it is an array" do
- options[:only] = ['/path']
- assert_includes result[:only], options[:only].first
- assert_includes result[:only], '/root'
- end
- it "adds #root_path to :only when :only_patterns is an array" do
- options[:only_patterns] = []
- assert_includes result[:only], '/root'
- end
- end
- end
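
  # #handle_response runs processable responses through the pipeline (the
  # parsed body as the document, #options plus the response url as the
  # context) and returns nil otherwise, instrumenting 'process_response' or
  # 'ignore_response' accordingly.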
- describe "#handle_response" do
- let :result do
- scraper.send :handle_response, response
- end
- context "when the response is processable" do
- before do
- stub(scraper).process_response?(response) { true }
- end
- it "runs the pipeline" do
- mock(scraper.pipeline).call.with_any_args
- result
- end
- it "returns the result" do
- stub(scraper.pipeline).call { |_, _, result| result[:test] = true }
- assert result[:test]
- end
- it "instruments 'process_response'" do
- result
- assert scraper.last_instrumentation
- assert_equal 'process_response.scraper', scraper.last_instrumentation[:event]
- assert_equal response, scraper.last_instrumentation[:payload][:response]
- end
- context "the pipeline document" do
- it "is the parsed response body" do
- response.body = 'body'
- stub(scraper.pipeline).call { |arg| @arg = arg }
- mock.proxy(Docs::Parser).new('body') { |parser| stub(parser).html { 'html' } }
- result
- assert_equal 'html', @arg
- end
- end
- context "the pipeline context" do
- let :context do
- stub(scraper.pipeline).call { |_, arg| @arg = arg }
- result
- @arg
- end
- it "includes #options" do
- stub(scraper).options { { test: true } }
- assert context[:test]
- end
- it "includes the response url" do
- response.url = 'url'
- assert_equal 'url', context[:url]
- end
- end
- end
- context "when the response isn't processable" do
- before do
- stub(scraper).process_response?(response) { false }
- end
- it "doesn't run the pipeline" do
- dont_allow(scraper.pipeline).call
- result
- end
- it "returns nil" do
- assert_nil result
- end
- it "instruments 'ignore_response'" do
- result
- assert scraper.last_instrumentation
- assert_equal 'ignore_response.scraper', scraper.last_instrumentation[:event]
- assert_equal response, scraper.last_instrumentation[:payload][:response]
- end
- end
- end
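
  # #pipeline memoizes an HTML::Pipeline built from .filters, with Docs as
  # its instrumentation service.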
- describe "#pipeline" do
- let :pipeline do
- scraper.pipeline
- end
- it "returns a memoized HTML::Pipeline" do
- assert_instance_of ::HTML::Pipeline, pipeline
- assert_same pipeline, scraper.pipeline
- end
- it "returns a pipeline with the filters stored in .filters" do
- stub(Scraper).filters { [1] }
- assert_equal Scraper.filters, pipeline.filters
- end
- it "returns a pipeline with Docs as instrumentation service" do
- assert_equal Docs, pipeline.instrumentation_service
- end
- end
- end