# scraper_test.rb
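#
# Specs for Docs::Scraper, the base class that crawls documentation sites:
# it derives its URLs from .base_url, .root_path and .initial_paths, fetches
# pages, and feeds processable responses through an HTML::Pipeline of filters.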

require 'test_helper'
require 'docs'

class DocsScraperTest < MiniTest::Spec
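  # Minimal concrete subclass serving as the fixture under test.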
  class Scraper < Docs::Scraper
    self.type = 'scraper'
    self.base_url = 'http://example.com/'
    self.root_path = '/root'
    self.initial_paths = ['/initial']
    self.html_filters = Docs::FilterStack.new
    self.text_filters = Docs::FilterStack.new
  end
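
  # The subject is extended with FakeInstrumentation so specs can inspect the
  # most recent event via scraper.last_instrumentation.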
  let :scraper do
    Scraper.new.tap do |scraper|
      scraper.extend FakeInstrumentation
    end
  end

  let :response do
    Struct.new(:body, :url).new
  end
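
  # Subclassing copies the class-level configuration, duplicating mutable
  # values so a subclass never shares state with its parent.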
  20. describe ".inherited" do
  21. let :subclass do
  22. Class.new Scraper
  23. end
  24. it "sets .type" do
  25. assert_equal Scraper.type, subclass.type
  26. end
  27. it "sets .root_path" do
  28. assert_equal Scraper.root_path, subclass.root_path
  29. end
  30. it "duplicates .initial_paths" do
  31. stub(Scraper).initial_paths { ['path'] }
  32. assert_equal Scraper.initial_paths, subclass.initial_paths
  33. refute_same Scraper.initial_paths, subclass.initial_paths
  34. end
  35. it "duplicates .options" do
  36. stub(Scraper).options { { test: [] } }
  37. assert_equal Scraper.options, subclass.options
  38. refute_same Scraper.options, subclass.options
  39. refute_same Scraper.options[:test], subclass.options[:test]
  40. end
  41. it "duplicates .html_filters" do
  42. assert_equal Scraper.html_filters, subclass.html_filters
  43. refute_same Scraper.html_filters, subclass.html_filters
  44. end
  45. it "duplicates .text_filters" do
  46. assert_equal Scraper.text_filters, subclass.text_filters
  47. refute_same Scraper.text_filters, subclass.text_filters
  48. end
  49. end
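
  # .filters concatenates the HTML and text filter stacks, in that order.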
  50. describe ".filters" do
  51. it "returns the union of .html_filters and .text_filters" do
  52. stub(Scraper.html_filters).to_a { [1] }
  53. stub(Scraper.text_filters).to_a { [2] }
  54. assert_equal [1, 2], Scraper.filters
  55. end
  56. end
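
  # #root_path? is true only for a meaningful root path; '' and '/' don't count.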
  57. describe "#root_path?" do
  58. it "returns false when .root_path is blank" do
  59. stub(Scraper).root_path { '' }
  60. refute scraper.root_path?
  61. end
  62. it "returns false when .root_path is '/'" do
  63. stub(Scraper).root_path { '/' }
  64. refute scraper.root_path?
  65. end
  66. it "returns true when .root_path is '/path'" do
  67. stub(Scraper).root_path { '/path' }
  68. assert scraper.root_path?
  69. end
  70. end
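
  # #root_url is the normalized .base_url, with .root_path appended when set.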
  71. describe "#root_url" do
  72. let :root_url do
  73. scraper.root_url
  74. end
  75. context "when #root_path? is false" do
  76. before do
  77. stub(scraper).root_path? { false }
  78. end
  79. it "returns a memoized Docs::URL" do
  80. assert_instance_of Docs::URL, root_url
  81. assert_same root_url, scraper.root_url
  82. end
  83. it "returns the normalized .base_url" do
  84. stub(Scraper).base_url { 'http://example.com' }
  85. assert_equal 'http://example.com/', root_url.to_s
  86. end
  87. end
  88. context "when #root_path? is true" do
  89. before do
  90. stub(scraper).root_path? { true }
  91. end
  92. it "returns a memoized Docs::URL" do
  93. assert_instance_of Docs::URL, root_url
  94. assert_same root_url, scraper.root_url
  95. end
  96. it "returns .base_url + .root_path" do
  97. stub(Scraper).base_url { 'http://example.com/path/' }
  98. stub(Scraper).root_path { '/root' }
  99. assert_equal 'http://example.com/path/root', root_url.to_s
  100. end
  101. end
  102. end
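
  # #initial_urls seeds the crawl: the root URL plus every .initial_paths
  # entry expanded against .base_url.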
  103. describe "#initial_urls" do
  104. let :initial_urls do
  105. scraper.initial_urls
  106. end
  107. it "returns a frozen, memoized Array" do
  108. assert_instance_of Array, initial_urls
  109. assert initial_urls.frozen?
  110. assert_same initial_urls, scraper.initial_urls
  111. end
  112. it "includes the #root_url" do
  113. assert_includes initial_urls, scraper.root_url.to_s
  114. end
  115. it "includes the .initial_paths converted to urls" do
  116. stub(Scraper).base_url { 'http://example.com/' }
  117. stub(Scraper).initial_paths { ['one', '/two'] }
  118. assert_includes initial_urls, 'http://example.com/one'
  119. assert_includes initial_urls, 'http://example.com/two'
  120. end
  121. end
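
  # #build_page fetches a single path and returns (and yields) the processed
  # response.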
  122. describe "#build_page" do
  123. before do
  124. stub(scraper).handle_response
  125. end
  126. it "requires a path" do
  127. assert_raises ArgumentError do
  128. scraper.build_page
  129. end
  130. end
  131. context "with a blank path" do
  132. it "requests the root url" do
  133. mock(scraper).request_one(scraper.root_url.to_s)
  134. scraper.build_page ''
  135. end
  136. end
  137. context "with '/'" do
  138. it "requests the root url" do
  139. mock(scraper).request_one(scraper.root_url.to_s)
  140. scraper.build_page '/'
  141. end
  142. end
  143. context "with '/file'" do
  144. it "requests 'example.com/file' when the base url is 'example.com" do
  145. stub(Scraper).base_url { 'http://example.com' }
  146. mock(scraper).request_one 'http://example.com/file'
  147. scraper.build_page '/file'
  148. end
  149. it "requests 'example.com/file' when the base url is 'example.com/" do
  150. stub(Scraper).base_url { 'http://example.com/' }
  151. mock(scraper).request_one 'http://example.com/file'
  152. scraper.build_page '/file'
  153. end
  154. end
  155. it "returns the processed response" do
  156. stub(scraper).request_one { response }
  157. mock(scraper).handle_response(response) { 'test' }
  158. assert_equal 'test', scraper.build_page('')
  159. end
  160. it "yields the processed response" do
  161. stub(scraper).request_one { response }
  162. stub(scraper).handle_response(response) { 'test' }
  163. scraper.build_page('') { |arg| @arg = arg }
  164. assert @arg
  165. assert_equal 'test', @arg
  166. end
  167. end
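
  # #build_pages crawls from #initial_urls, queueing each response's
  # :internal_urls (deduplicated case-insensitively) until none remain.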
  168. describe "#build_pages" do
  169. let :block do
  170. Proc.new {}
  171. end
  172. let :processed_response do
  173. Hash.new
  174. end
  175. it "requests the #initial_urls" do
  176. mock(scraper).request_all(scraper.initial_urls)
  177. scraper.build_pages(&block)
  178. end
  179. it "instruments 'running'" do
  180. stub(scraper).request_all
  181. scraper.build_pages(&block)
  182. assert scraper.last_instrumentation
  183. assert_equal 'running.scraper', scraper.last_instrumentation[:event]
  184. assert_equal scraper.initial_urls, scraper.last_instrumentation[:payload][:urls]
  185. end
  186. context "when the response is processable" do
  187. before do
  188. stub(scraper).request_all do |urls, block|
  189. urls.each { |url| @next_urls ||= block.call(response) }
  190. end
  191. stub(scraper).handle_response(response) { processed_response }
  192. end
  193. it "yields the processed response" do
  194. scraper.build_pages { |arg| @arg = arg }
  195. assert_same processed_response, @arg
  196. end
  197. context "when :internal_urls is empty" do
  198. before do
  199. processed_response[:internal_urls] = []
  200. end
  201. it "requests nothing more" do
  202. scraper.build_pages(&block)
  203. assert_nil @next_urls
  204. end
  205. it "doesn't instrument 'queued'" do
  206. scraper.build_pages(&block)
  207. refute_equal 'queued.scraper', scraper.last_instrumentation.try(:[], :event)
  208. end
  209. end
  210. context "when :internal_urls isn't empty" do
  211. let :internal_urls do
  212. ['Url']
  213. end
  214. before do
  215. processed_response[:internal_urls] = internal_urls
  216. end
  217. it "requests the urls" do
  218. scraper.build_pages(&block)
  219. assert_equal internal_urls, @next_urls
  220. end
  221. it "doesn't request the same url twice irrespective of case" do
  222. processed_response[:internal_urls] = scraper.initial_urls.map(&:swapcase)
  223. scraper.build_pages(&block)
  224. assert_empty @next_urls
  225. end
  226. it "instruments 'queued'" do
  227. scraper.build_pages(&block)
  228. assert scraper.last_instrumentation
  229. assert_equal 'queued.scraper', scraper.last_instrumentation[:event]
  230. assert_equal internal_urls, scraper.last_instrumentation[:payload][:urls]
  231. end
  232. end
  233. end
  234. context "when the response isn't processable" do
  235. it "doesn't yield" do
  236. stub(scraper).request_all.yields(response)
  237. stub(scraper).handle_response(response) { nil }
  238. scraper.build_pages { @yield = true }
  239. refute @yield
  240. end
  241. end
  242. end
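
  # #options merges the class-level options with URL-related defaults; the
  # result becomes the context passed to the filter pipeline.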
  243. describe "#options" do
  244. let :options do
  245. Hash.new
  246. end
  247. let :result do
  248. scraper.options
  249. end
  250. before do
  251. stub(Scraper).options { options }
  252. end
  253. it "returns a frozen, memoized Hash" do
  254. assert_instance_of Hash, result
  255. assert result.frozen?
  256. assert_same result, scraper.options
  257. end
  258. it "includes .options" do
  259. options[:test] = true
  260. assert result[:test]
  261. end
  262. it "includes #base_url" do
  263. assert_equal scraper.base_url, result[:base_url]
  264. end
  265. it "includes #root_url" do
  266. assert_equal scraper.root_url, result[:root_url]
  267. end
  268. it "includes #root_path" do
  269. assert_equal '/root', result[:root_path]
  270. end
  271. it "includes #initial_paths" do
  272. assert_equal ['/initial'], result[:initial_paths]
  273. end
  274. it "adds #initial_paths to :only when it is an array" do
  275. options[:only] = ['/path']
  276. assert_includes result[:only], options[:only].first
  277. assert_includes result[:only], '/initial'
  278. end
  279. it "adds #initial_paths to :only when :only_patterns is an array" do
  280. options[:only_patterns] = []
  281. assert_includes result[:only], '/initial'
  282. end
  283. it "doesn't modify :only in place" do
  284. options[:only] = []
  285. result
  286. assert_empty options[:only]
  287. end
  288. context "when #root_path? is false" do
  289. before do
  290. stub(scraper).root_path? { false }
  291. end
  292. it "doesn't modify :skip" do
  293. options[:skip] = []
  294. assert_equal options[:skip], result[:skip]
  295. end
  296. it "adds '' and '/' to :only when it is an array" do
  297. options[:only] = ['/path']
  298. assert_includes result[:only], options[:only].first
  299. assert_includes result[:only], ''
  300. assert_includes result[:only], '/'
  301. end
  302. it "adds '' and '/' to :only when :only_patterns is an array" do
  303. options[:only_patterns] = []
  304. assert_includes result[:only], ''
  305. assert_includes result[:only], '/'
  306. end
  307. it "doesn't modify :only in place" do
  308. options[:only] = []
  309. result
  310. assert_empty options[:only]
  311. end
  312. end
  313. context "when #root_path? is true" do
  314. before do
  315. stub(scraper).root_path? { true }
  316. end
  317. it "adds '' and '/' to :skip when it is nil" do
  318. assert_includes result[:skip], ''
  319. assert_includes result[:skip], '/'
  320. end
  321. it "adds '' and '/' to :skip when it is an array" do
  322. options[:skip] = ['/path']
  323. assert_includes result[:skip], options[:skip].first
  324. assert_includes result[:skip], ''
  325. assert_includes result[:skip], '/'
  326. end
  327. it "doesn't modify :skip in place" do
  328. options[:skip] = []
  329. result
  330. assert_empty options[:skip]
  331. end
  332. it "adds #root_path to :only when it is an array" do
  333. options[:only] = ['/path']
  334. assert_includes result[:only], options[:only].first
  335. assert_includes result[:only], '/root'
  336. end
  337. it "adds #root_path to :only when :only_patterns is an array" do
  338. options[:only_patterns] = []
  339. assert_includes result[:only], '/root'
  340. end
  341. end
  342. end
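
  # #handle_response parses a processable response's body with Docs::Parser
  # and runs the resulting HTML through the pipeline; other responses are
  # ignored and return nil.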
  343. describe "#handle_response" do
  344. let :result do
  345. scraper.send :handle_response, response
  346. end
  347. context "when the response is processable" do
  348. before do
  349. stub(scraper).process_response?(response) { true }
  350. end
  351. it "runs the pipeline" do
  352. mock(scraper.pipeline).call.with_any_args
  353. result
  354. end
  355. it "returns the result" do
  356. stub(scraper.pipeline).call { |_, _, result| result[:test] = true }
  357. assert result[:test]
  358. end
  359. it "instruments 'process_response'" do
  360. result
  361. assert scraper.last_instrumentation
  362. assert_equal 'process_response.scraper', scraper.last_instrumentation[:event]
  363. assert_equal response, scraper.last_instrumentation[:payload][:response]
  364. end
  365. context "the pipeline document" do
  366. it "is the parsed response body" do
  367. response.body = 'body'
  368. stub(scraper.pipeline).call { |arg| @arg = arg }
  369. mock.proxy(Docs::Parser).new('body') { |parser| stub(parser).html { 'html' } }
  370. result
  371. assert_equal 'html', @arg
  372. end
  373. end
  374. context "the pipeline context" do
  375. let :context do
  376. stub(scraper.pipeline).call { |_, arg| @arg = arg }
  377. result
  378. @arg
  379. end
  380. it "includes #options" do
  381. stub(scraper).options { { test: true } }
  382. assert context[:test]
  383. end
  384. it "includes the response url" do
  385. response.url = 'url'
  386. assert_equal 'url', context[:url]
  387. end
  388. end
  389. end
  390. context "when the response isn't processable" do
  391. before do
  392. stub(scraper).process_response?(response) { false }
  393. end
  394. it "doesn't run the pipeline" do
  395. dont_allow(scraper.pipeline).call
  396. result
  397. end
  398. it "returns nil" do
  399. assert_nil result
  400. end
  401. it "instruments 'ignore_response'" do
  402. result
  403. assert scraper.last_instrumentation
  404. assert_equal 'ignore_response.scraper', scraper.last_instrumentation[:event]
  405. assert_equal response, scraper.last_instrumentation[:payload][:response]
  406. end
  407. end
  408. end
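
  # #pipeline wraps .filters in an HTML::Pipeline instrumented through Docs.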
  409. describe "#pipeline" do
  410. let :pipeline do
  411. scraper.pipeline
  412. end
  413. it "returns a memoized HTML::Pipeline" do
  414. assert_instance_of ::HTML::Pipeline, pipeline
  415. assert_same pipeline, scraper.pipeline
  416. end
  417. it "returns a pipeline with the filters stored in .filters" do
  418. stub(Scraper).filters { [1] }
  419. assert_equal Scraper.filters, pipeline.filters
  420. end
  421. it "returns a pipeline with Docs as instrumentation service" do
  422. assert_equal Docs, pipeline.instrumentation_service
  423. end
  424. end
  425. end