require 'test_helper' require 'docs' class DocsScraperTest < MiniTest::Spec class Scraper < Docs::Scraper self.type = 'scraper' self.base_url = 'http://example.com/' self.root_path = '/root' self.initial_paths = ['/initial'] self.html_filters = Docs::FilterStack.new self.text_filters = Docs::FilterStack.new end let :scraper do Scraper.new.tap do |scraper| scraper.extend FakeInstrumentation end end let :response do Struct.new(:body, :url).new end describe ".inherited" do let :subclass do Class.new Scraper end it "sets .type" do assert_equal Scraper.type, subclass.type end it "sets .root_path" do assert_equal Scraper.root_path, subclass.root_path end it "duplicates .initial_paths" do stub(Scraper).initial_paths { ['path'] } assert_equal Scraper.initial_paths, subclass.initial_paths refute_same Scraper.initial_paths, subclass.initial_paths end it "duplicates .options" do stub(Scraper).options { { test: [] } } assert_equal Scraper.options, subclass.options refute_same Scraper.options, subclass.options refute_same Scraper.options[:test], subclass.options[:test] end it "duplicates .html_filters" do assert_equal Scraper.html_filters, subclass.html_filters refute_same Scraper.html_filters, subclass.html_filters end it "duplicates .text_filters" do assert_equal Scraper.text_filters, subclass.text_filters refute_same Scraper.text_filters, subclass.text_filters end end describe ".filters" do it "returns the union of .html_filters and .text_filters" do stub(Scraper.html_filters).to_a { [1] } stub(Scraper.text_filters).to_a { [2] } assert_equal [1, 2], Scraper.filters end end describe "#root_path?" do it "returns false when .root_path is blank" do stub(Scraper).root_path { '' } refute scraper.root_path? end it "returns false when .root_path is '/'" do stub(Scraper).root_path { '/' } refute scraper.root_path? end it "returns true when .root_path is '/path'" do stub(Scraper).root_path { '/path' } assert scraper.root_path? end end describe "#root_url" do let :root_url do scraper.root_url end context "when #root_path? is false" do before do stub(scraper).root_path? { false } end it "returns a memoized Docs::URL" do assert_instance_of Docs::URL, root_url assert_same root_url, scraper.root_url end it "returns the normalized .base_url" do stub(Scraper).base_url { 'http://example.com' } assert_equal 'http://example.com/', root_url.to_s end end context "when #root_path? is true" do before do stub(scraper).root_path? { true } end it "returns a memoized Docs::URL" do assert_instance_of Docs::URL, root_url assert_same root_url, scraper.root_url end it "returns .base_url + .root_path" do stub(Scraper).base_url { 'http://example.com/path/' } stub(Scraper).root_path { '/root' } assert_equal 'http://example.com/path/root', root_url.to_s end end end describe "#initial_urls" do let :initial_urls do scraper.initial_urls end it "returns a frozen, memoized Array" do assert_instance_of Array, initial_urls assert initial_urls.frozen? assert_same initial_urls, scraper.initial_urls end it "includes the #root_url" do assert_includes initial_urls, scraper.root_url.to_s end it "includes the .initial_paths converted to urls" do stub(Scraper).base_url { 'http://example.com/' } stub(Scraper).initial_paths { ['one', '/two'] } assert_includes initial_urls, 'http://example.com/one' assert_includes initial_urls, 'http://example.com/two' end end describe "#build_page" do before do stub(scraper).handle_response end it "requires a path" do assert_raises ArgumentError do scraper.build_page end end context "with a blank path" do it "requests the root url" do mock(scraper).request_one(scraper.root_url.to_s) scraper.build_page '' end end context "with '/'" do it "requests the root url" do mock(scraper).request_one(scraper.root_url.to_s) scraper.build_page '/' end end context "with '/file'" do it "requests 'example.com/file' when the base url is 'example.com" do stub(Scraper).base_url { 'http://example.com' } mock(scraper).request_one 'http://example.com/file' scraper.build_page '/file' end it "requests 'example.com/file' when the base url is 'example.com/" do stub(Scraper).base_url { 'http://example.com/' } mock(scraper).request_one 'http://example.com/file' scraper.build_page '/file' end end it "returns the processed response" do stub(scraper).request_one { response } mock(scraper).handle_response(response) { 'test' } assert_equal 'test', scraper.build_page('') end it "yields the processed response" do stub(scraper).request_one { response } stub(scraper).handle_response(response) { 'test' } scraper.build_page('') { |arg| @arg = arg } assert @arg assert_equal 'test', @arg end end describe "#build_pages" do let :block do Proc.new {} end let :processed_response do Hash.new end it "requests the #initial_urls" do mock(scraper).request_all(scraper.initial_urls) scraper.build_pages(&block) end it "instruments 'running'" do stub(scraper).request_all scraper.build_pages(&block) assert scraper.last_instrumentation assert_equal 'running.scraper', scraper.last_instrumentation[:event] assert_equal scraper.initial_urls, scraper.last_instrumentation[:payload][:urls] end context "when the response is processable" do before do stub(scraper).request_all do |urls, block| urls.each { |url| @next_urls ||= block.call(response) } end stub(scraper).handle_response(response) { processed_response } end it "yields the processed response" do scraper.build_pages { |arg| @arg = arg } assert_same processed_response, @arg end context "when :internal_urls is empty" do before do processed_response[:internal_urls] = [] end it "requests nothing more" do scraper.build_pages(&block) assert_nil @next_urls end it "doesn't instrument 'queued'" do scraper.build_pages(&block) refute_equal 'queued.scraper', scraper.last_instrumentation.try(:[], :event) end end context "when :internal_urls isn't empty" do let :internal_urls do ['Url'] end before do processed_response[:internal_urls] = internal_urls end it "requests the urls" do scraper.build_pages(&block) assert_equal internal_urls, @next_urls end it "doesn't request the same url twice irrespective of case" do processed_response[:internal_urls] = scraper.initial_urls.map(&:swapcase) scraper.build_pages(&block) assert_empty @next_urls end it "instruments 'queued'" do scraper.build_pages(&block) assert scraper.last_instrumentation assert_equal 'queued.scraper', scraper.last_instrumentation[:event] assert_equal internal_urls, scraper.last_instrumentation[:payload][:urls] end end end context "when the response isn't processable" do it "doesn't yield" do stub(scraper).request_all.yields(response) stub(scraper).handle_response(response) { nil } scraper.build_pages { @yield = true } refute @yield end end end describe "#options" do let :options do Hash.new end let :result do scraper.options end before do stub(Scraper).options { options } end it "returns a frozen, memoized Hash" do assert_instance_of Hash, result assert result.frozen? assert_same result, scraper.options end it "includes .options" do options[:test] = true assert result[:test] end it "includes #base_url" do assert_equal scraper.base_url, result[:base_url] end it "includes #root_url" do assert_equal scraper.root_url, result[:root_url] end it "includes #root_path" do assert_equal '/root', result[:root_path] end it "includes #initial_paths" do assert_equal ['/initial'], result[:initial_paths] end it "adds #initial_paths to :only when it is an array" do options[:only] = ['/path'] assert_includes result[:only], options[:only].first assert_includes result[:only], '/initial' end it "adds #initial_paths to :only when :only_patterns is an array" do options[:only_patterns] = [] assert_includes result[:only], '/initial' end it "doesn't modify :only in place" do options[:only] = [] result assert_empty options[:only] end context "when #root_path? is false" do before do stub(scraper).root_path? { false } end it "doesn't modify :skip" do options[:skip] = [] assert_equal options[:skip], result[:skip] end it "adds '' and '/' to :only when it is an array" do options[:only] = ['/path'] assert_includes result[:only], options[:only].first assert_includes result[:only], '' assert_includes result[:only], '/' end it "adds '' and '/' to :only when :only_patterns is an array" do options[:only_patterns] = [] assert_includes result[:only], '' assert_includes result[:only], '/' end it "doesn't modify :only in place" do options[:only] = [] result assert_empty options[:only] end end context "when #root_path? is true" do before do stub(scraper).root_path? { true } end it "adds '' and '/' to :skip when it is nil" do assert_includes result[:skip], '' assert_includes result[:skip], '/' end it "adds '' and '/' to :skip when it is an array" do options[:skip] = ['/path'] assert_includes result[:skip], options[:skip].first assert_includes result[:skip], '' assert_includes result[:skip], '/' end it "doesn't modify :skip in place" do options[:skip] = [] result assert_empty options[:skip] end it "adds #root_path to :only when it is an array" do options[:only] = ['/path'] assert_includes result[:only], options[:only].first assert_includes result[:only], '/root' end it "adds #root_path to :only when :only_patterns is an array" do options[:only_patterns] = [] assert_includes result[:only], '/root' end end end describe "#handle_response" do let :result do scraper.send :handle_response, response end context "when the response is processable" do before do stub(scraper).process_response?(response) { true } end it "runs the pipeline" do mock(scraper.pipeline).call.with_any_args result end it "returns the result" do stub(scraper.pipeline).call { |_, _, result| result[:test] = true } assert result[:test] end it "instruments 'process_response'" do result assert scraper.last_instrumentation assert_equal 'process_response.scraper', scraper.last_instrumentation[:event] assert_equal response, scraper.last_instrumentation[:payload][:response] end context "the pipeline document" do it "is the parsed response body" do response.body = 'body' stub(scraper.pipeline).call { |arg| @arg = arg } mock.proxy(Docs::Parser).new('body') { |parser| stub(parser).html { 'html' } } result assert_equal 'html', @arg end end context "the pipeline context" do let :context do stub(scraper.pipeline).call { |_, arg| @arg = arg } result @arg end it "includes #options" do stub(scraper).options { { test: true } } assert context[:test] end it "includes the response url" do response.url = 'url' assert_equal 'url', context[:url] end end end context "when the response isn't processable" do before do stub(scraper).process_response?(response) { false } end it "doesn't run the pipeline" do dont_allow(scraper.pipeline).call result end it "returns nil" do assert_nil result end it "instruments 'ignore_response'" do result assert scraper.last_instrumentation assert_equal 'ignore_response.scraper', scraper.last_instrumentation[:event] assert_equal response, scraper.last_instrumentation[:payload][:response] end end end describe "#pipeline" do let :pipeline do scraper.pipeline end it "returns a memoized HTML::Pipeline" do assert_instance_of ::HTML::Pipeline, pipeline assert_same pipeline, scraper.pipeline end it "returns a pipeline with the filters stored in .filters" do stub(Scraper).filters { [1] } assert_equal Scraper.filters, pipeline.filters end it "returns a pipeline with Docs as instrumentation service" do assert_equal Docs, pipeline.instrumentation_service end end end