# scraper_test.rb
# Spec for Docs::Scraper: class-level inheritance/filter behavior, URL
# construction, page building/crawling, option normalization, response
# handling, and the HTML pipeline.
# Uses MiniTest::Spec with RR-style doubles (stub/mock/mock.proxy/dont_allow).
require 'test_helper'
require 'docs'

class DocsScraperTest < MiniTest::Spec
  # Minimal concrete Scraper subclass used as the fixture throughout.
  class Scraper < Docs::Scraper
    self.type = 'scraper'
    self.base_url = 'http://example.com/'
    self.root_path = '/root'
    self.initial_paths = ['/initial']
    self.html_filters = Docs::FilterStack.new
    self.text_filters = Docs::FilterStack.new
  end

  # Subject under test, extended so instrumentation events can be asserted
  # via #last_instrumentation.
  # NOTE(review): FakeInstrumentation is presumably defined in test_helper —
  # confirm it records {event:, payload:} for the last instrumented call.
  let :scraper do
    Scraper.new.tap do |scraper|
      scraper.extend FakeInstrumentation
    end
  end

  # Stand-in HTTP response with settable #body and #url.
  let :response do
    Struct.new(:body, :url).new
  end

  # Hash returned by the stubbed #handle_response in the crawling specs;
  # individual contexts populate :internal_urls on it.
  let :processed_response do
    Hash.new
  end

  describe ".inherited" do
    let :subclass do
      Class.new Scraper
    end

    it "sets .type" do
      assert_equal Scraper.type, subclass.type
    end

    it "sets .root_path" do
      assert_equal Scraper.root_path, subclass.root_path
    end

    # Equal but not the same object: subclassing must copy, not share.
    it "duplicates .initial_paths" do
      stub(Scraper).initial_paths { ['path'] }
      assert_equal Scraper.initial_paths, subclass.initial_paths
      refute_same Scraper.initial_paths, subclass.initial_paths
    end

    # Deep copy: nested values must not be shared with the superclass either.
    it "duplicates .options" do
      stub(Scraper).options { { test: [] } }
      assert_equal Scraper.options, subclass.options
      refute_same Scraper.options, subclass.options
      refute_same Scraper.options[:test], subclass.options[:test]
    end

    it "duplicates .html_filters" do
      assert_equal Scraper.html_filters, subclass.html_filters
      refute_same Scraper.html_filters, subclass.html_filters
    end

    it "duplicates .text_filters" do
      assert_equal Scraper.text_filters, subclass.text_filters
      refute_same Scraper.text_filters, subclass.text_filters
    end
  end

  describe ".filters" do
    it "returns the union of .html_filters and .text_filters" do
      stub(Scraper.html_filters).to_a { [1] }
      stub(Scraper.text_filters).to_a { [2] }
      assert_equal [1, 2], Scraper.filters
    end
  end

  describe "#root_path?" do
    it "returns false when .root_path is blank" do
      stub(Scraper).root_path { '' }
      refute scraper.root_path?
    end

    it "returns false when .root_path is '/'" do
      stub(Scraper).root_path { '/' }
      refute scraper.root_path?
    end

    it "returns true when .root_path is '/path'" do
      stub(Scraper).root_path { '/path' }
      assert scraper.root_path?
    end
  end

  describe "#root_url" do
    context "when #root_path? is false" do
      before do
        stub(scraper).root_path? { false }
      end

      it "returns a Docs::URL" do
        assert_instance_of Docs::URL, scraper.root_url
      end

      it "returns the normalized base url" do
        stub(Scraper).base_url { 'http://example.com' }
        assert_equal 'http://example.com/', scraper.root_url.to_s
      end
    end

    context "when .root_path isn't blank" do
      before do
        stub(scraper).root_path? { true }
      end

      it "returns a Docs::URL" do
        assert_instance_of Docs::URL, scraper.root_url
      end

      it "returns base url + root path" do
        stub(Scraper).base_url { 'http://example.com/path/' }
        assert_equal 'http://example.com/path/root', scraper.root_url.to_s
      end
    end
  end

  describe "#initial_urls" do
    let :initial_urls do
      scraper.initial_urls
    end

    it "returns a frozen, memoized Array" do
      assert_instance_of Array, initial_urls
      assert initial_urls.frozen?
      assert_same initial_urls, scraper.initial_urls
    end

    it "includes the #root_url" do
      assert_includes initial_urls, scraper.root_url.to_s
    end

    # Paths may be given with or without a leading slash.
    it "includes the .initial_paths converted to urls" do
      stub(Scraper).base_url { 'http://example.com/' }
      stub(Scraper).initial_paths { ['one', '/two'] }
      assert_includes initial_urls, 'http://example.com/one'
      assert_includes initial_urls, 'http://example.com/two'
    end
  end

  describe "#build_page" do
    before do
      stub(scraper).handle_response
    end

    it "requires a path" do
      assert_raises ArgumentError do
        scraper.build_page
      end
    end

    context "with a blank path" do
      it "requests the root url" do
        mock(scraper).request_one(scraper.root_url.to_s)
        scraper.build_page ''
      end
    end

    context "with '/'" do
      it "requests the root url" do
        mock(scraper).request_one(scraper.root_url.to_s)
        scraper.build_page '/'
      end
    end

    # Joining must not produce a double slash regardless of a trailing
    # slash on the base url.
    context "with '/file'" do
      it "requests 'example.com/file' when the base url is 'example.com" do
        stub(Scraper).base_url { 'http://example.com' }
        mock(scraper).request_one 'http://example.com/file'
        scraper.build_page '/file'
      end

      it "requests 'example.com/file' when the base url is 'example.com/" do
        stub(Scraper).base_url { 'http://example.com/' }
        mock(scraper).request_one 'http://example.com/file'
        scraper.build_page '/file'
      end
    end

    it "returns the processed response" do
      stub(scraper).request_one { response }
      mock(scraper).handle_response(response) { 'test' }
      assert_equal 'test', scraper.build_page('')
    end

    it "yields the processed response" do
      stub(scraper).request_one { response }
      stub(scraper).handle_response(response) { 'test' }
      scraper.build_page('') { |arg| @arg = arg }
      assert @arg
      assert_equal 'test', @arg
    end
  end

  describe "#build_pages" do
    let :block do
      Proc.new {}
    end

    it "requests the #initial_urls" do
      mock(scraper).request_all(scraper.initial_urls)
      scraper.build_pages(&block)
    end

    it "instruments 'running'" do
      stub(scraper).request_all
      scraper.build_pages(&block)
      assert scraper.last_instrumentation
      assert_equal 'running.scraper', scraper.last_instrumentation[:event]
      assert_equal scraper.initial_urls, scraper.last_instrumentation[:payload][:urls]
    end

    context "when the response is processable" do
      before do
        # Simulate request_all yielding the fake response for each url and
        # capture the urls the scraper asks to crawl next (block's return
        # value). ||= keeps only the first batch.
        stub(scraper).request_all do |urls, block|
          urls.each { |url| @next_urls ||= block.call(response) }
        end
        stub(scraper).handle_response(response) { processed_response }
      end

      it "yields the processed response" do
        scraper.build_pages { |arg| @arg = arg }
        assert @arg
        assert_equal processed_response, @arg
      end

      context "when response[:internal_urls] is empty" do
        before do
          processed_response[:internal_urls] = []
        end

        it "requests nothing more" do
          scraper.build_pages(&block)
          assert_nil @next_urls
        end

        it "doesn't instrument 'queued'" do
          scraper.build_pages(&block)
          refute_equal 'queued.scraper', scraper.last_instrumentation.try(:[], :event)
        end
      end

      context "when response[:internal_urls] isn't empty" do
        let :urls do
          ['Url']
        end

        before do
          processed_response[:internal_urls] = urls
        end

        it "requests the urls" do
          scraper.build_pages(&block)
          assert_equal urls, @next_urls
        end

        # Dedup is case-insensitive: re-offering the initial urls with
        # swapped case must queue nothing new.
        it "doesn't request the same url twice irrespective of case" do
          stub(Scraper).root_path { 'PATH' }
          processed_response[:internal_urls] = scraper.initial_urls.map(&:swapcase)
          scraper.build_pages(&block)
          assert_empty @next_urls
        end

        it "instruments 'queued'" do
          scraper.build_pages(&block)
          assert scraper.last_instrumentation
          assert_equal 'queued.scraper', scraper.last_instrumentation[:event]
          assert_equal urls, scraper.last_instrumentation[:payload][:urls]
        end
      end
    end

    context "when the response isn't processable" do
      it "doesn't yield" do
        stub(scraper).request_all.yields(response)
        stub(scraper).handle_response(response) { nil }
        scraper.build_pages { @yield = true }
        refute @yield
      end
    end
  end

  describe "#options" do
    let :options do
      scraper.options
    end

    it "returns a frozen, memoized Hash" do
      assert_instance_of Hash, options
      assert options.frozen?
      assert_same options, scraper.options
    end

    it "includes .options" do
      stub(Scraper).options { { test: true } }
      assert options[:test]
    end

    it "includes #base_url" do
      assert_equal scraper.base_url, options[:base_url]
    end

    it "includes #root_url" do
      assert_equal scraper.root_url, options[:root_url]
    end

    it "includes .root_path" do
      assert_equal '/root', options[:root_path]
    end

    it "includes .initial_paths" do
      assert_equal ['/initial'], options[:initial_paths]
    end

    context "when #root_path? is false" do
      before do
        stub(scraper).root_path? { false }
      end

      it "doesn't modify :skip" do
        assert_nil options[:skip]
      end

      context "and :only is an array" do
        before do
          stub(Scraper).options { { only: ['/path'] } }
        end

        # Without a root path, '' and '/' must still be scraped.
        it "adds ['', '/']" do
          assert_includes options[:only], '/path'
          assert_includes options[:only], ''
          assert_includes options[:only], '/'
        end

        it "adds .initial_paths" do
          assert_includes options[:only], '/initial'
        end

        it "doesn't modify the array in place" do
          assert_equal ['/path'], Scraper.options[:only]
        end
      end

      context "and :only_patterns is an array" do
        before do
          stub(Scraper).options { { only_patterns: [] } }
        end

        it "adds ['', '/'] to :only" do
          assert_includes options[:only], ''
          assert_includes options[:only], '/'
        end

        it "adds .initial_paths to :only" do
          assert_includes options[:only], '/initial'
        end
      end
    end

    context "when #root_path? is true" do
      before do
        stub(scraper).root_path? { true }
      end

      # With a custom root path, the default root urls are skipped instead.
      context "and :skip is nil" do
        it "assigns it ['', '/']" do
          assert_equal ['', '/'], options[:skip]
        end
      end

      context "and :skip is an array" do
        before do
          stub(Scraper).options { { skip: ['/path'] } }
        end

        it "adds ['', '/']" do
          assert_includes options[:skip], '/path'
          assert_includes options[:skip], ''
          assert_includes options[:skip], '/'
        end

        it "doesn't modify the array in place" do
          assert_equal ['/path'], Scraper.options[:skip]
        end
      end

      context "and :only is an array" do
        it "adds .root_path" do
          stub(Scraper).options { { only: [] } }
          assert_includes options[:only], '/root'
        end
      end

      context "and :only_patterns is an array" do
        it "adds .root_path to :only" do
          stub(Scraper).options { { only_patterns: [] } }
          assert_includes options[:only], '/root'
        end
      end
    end
  end

  describe "#handle_response" do
    # #handle_response is private, hence the :send.
    let :result do
      scraper.send :handle_response, response
    end

    context "when the response is processable" do
      before do
        stub(scraper).process_response?(response) { true }
      end

      it "runs the pipeline" do
        mock(scraper.pipeline).call.with_any_args
        result
      end

      it "returns the result" do
        stub(scraper.pipeline).call { |_, _, result| result[:test] = true }
        assert result[:test]
      end

      it "instruments 'process_response'" do
        result
        assert scraper.last_instrumentation
        assert_equal 'process_response.scraper', scraper.last_instrumentation[:event]
        assert_equal response, scraper.last_instrumentation[:payload][:response]
      end

      context "the pipeline document" do
        # The response body goes through Docs::Parser; the pipeline receives
        # the parser's #html, not the raw body.
        it "is the parsed response body" do
          response.body = 'body'
          stub(scraper.pipeline).call { |arg| @arg = arg }
          mock.proxy(Docs::Parser).new('body') { |parser| stub(parser).html { 'html' } }
          result
          assert_equal 'html', @arg
        end
      end

      context "the pipeline context" do
        # Capture the second argument passed to the pipeline.
        let :context do
          stub(scraper.pipeline).call { |_, arg| @arg = arg }
          result
          @arg
        end

        it "includes #options" do
          stub(scraper).options { { test: true } }
          assert context[:test]
        end

        it "includes the response url" do
          response.url = 'url'
          assert_equal 'url', context[:url]
        end
      end
    end

    context "when the response isn't processable" do
      before do
        stub(scraper).process_response?(response) { false }
      end

      it "doesn't run the pipeline" do
        dont_allow(scraper.pipeline).call
        result
      end

      it "returns nil" do
        assert_nil result
      end

      it "instruments 'ignore_response'" do
        result
        assert scraper.last_instrumentation
        assert_equal 'ignore_response.scraper', scraper.last_instrumentation[:event]
        assert_equal response, scraper.last_instrumentation[:payload][:response]
      end
    end
  end

  describe "#pipeline" do
    it "returns an HTML::Pipeline with .filters" do
      stub(Scraper).filters { [1] }
      assert_instance_of ::HTML::Pipeline, scraper.pipeline
      assert_equal Scraper.filters, scraper.pipeline.filters
    end

    it "is memoized" do
      assert_same scraper.pipeline, scraper.pipeline
    end

    it "assigns Docs as the pipeline's instrumentation service" do
      assert_equal Docs, scraper.pipeline.instrumentation_service
    end
  end
end