# scraper_test.rb

require 'test_helper'
require 'docs'
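
# These specs use MiniTest::Spec with an RR-style mocking API
# (stub/mock/dont_allow/mock.proxy). That library, like FakeInstrumentation
# below, is assumed to be set up by test_helper.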

class DocsScraperTest < MiniTest::Spec
  class Scraper < Docs::Scraper
    self.type = 'scraper'
    self.base_url = 'http://example.com/'
    self.root_path = '/root'
    self.html_filters = Docs::FilterStack.new
    self.text_filters = Docs::FilterStack.new
  end

  let :scraper do
    Scraper.new.tap do |scraper|
      scraper.extend FakeInstrumentation
    end
  end
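
  # FakeInstrumentation isn't defined in this file. A minimal sketch of the
  # interface these tests rely on (an assumption, not the actual helper):
  #
  #   module FakeInstrumentation
  #     def instrument(event, payload = {})
  #       @last_instrumentation = { event: event, payload: payload }
  #       yield if block_given?
  #     end
  #
  #     def last_instrumentation
  #       @last_instrumentation
  #     end
  #   end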

  let :response do
    Struct.new(:body, :url).new
  end

  let :processed_response do
    Hash.new
  end
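
  # Subclassing must copy .type and .root_path and duplicate .options and the
  # filter stacks, so a subclass can mutate its own copies without touching
  # the parent's.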
  describe ".inherited" do
    let :subclass do
      Class.new Scraper
    end

    it "sets .type" do
      assert_equal Scraper.type, subclass.type
    end

    it "sets .root_path" do
      assert_equal Scraper.root_path, subclass.root_path
    end

    it "duplicates .options" do
      stub(Scraper).options { { test: [] } }
      assert_equal Scraper.options, subclass.options
      refute_same Scraper.options, subclass.options
      refute_same Scraper.options[:test], subclass.options[:test]
    end

    it "duplicates .html_filters" do
      assert_equal Scraper.html_filters, subclass.html_filters
      refute_same Scraper.html_filters, subclass.html_filters
    end

    it "duplicates .text_filters" do
      assert_equal Scraper.text_filters, subclass.text_filters
      refute_same Scraper.text_filters, subclass.text_filters
    end
  end
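
  # .filters is the two filter stacks concatenated, HTML filters first.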
  describe ".filters" do
    it "returns the union of .html_filters and .text_filters" do
      stub(Scraper.html_filters).to_a { [1] }
      stub(Scraper.text_filters).to_a { [2] }
      assert_equal [1, 2], Scraper.filters
    end
  end
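
  # Only a non-trivial path counts as a root path: blank and '/' both mean
  # "scrape from the base url".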
  describe "#root_path?" do
    it "returns false when .root_path is blank" do
      stub(Scraper).root_path { '' }
      refute scraper.root_path?
    end

    it "returns false when .root_path is '/'" do
      stub(Scraper).root_path { '/' }
      refute scraper.root_path?
    end

    it "returns true when .root_path is '/path'" do
      stub(Scraper).root_path { '/path' }
      assert scraper.root_path?
    end
  end
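
  # #root_url is the normalized base url, with .root_path appended when one
  # is set.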
  describe "#root_url" do
    context "when #root_path? is false" do
      before do
        stub(scraper).root_path? { false }
      end

      it "returns a Docs::URL" do
        assert_instance_of Docs::URL, scraper.root_url
      end

      it "returns the normalized base url" do
        stub(Scraper).base_url { 'http://example.com' }
        assert_equal 'http://example.com/', scraper.root_url.to_s
      end
    end

    context "when .root_path isn't blank" do
      before do
        stub(scraper).root_path? { true }
      end

      it "returns a Docs::URL" do
        assert_instance_of Docs::URL, scraper.root_url
      end

      it "returns base url + root path" do
        stub(Scraper).base_url { 'http://example.com/path/' }
        assert_equal 'http://example.com/path/root', scraper.root_url.to_s
      end
    end
  end
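
  # #build_page fetches a single path, runs the response through
  # #handle_response, and both returns and yields the processed result.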
  describe "#build_page" do
    before do
      stub(scraper).handle_response
    end

    it "requires a path" do
      assert_raises ArgumentError do
        scraper.build_page
      end
    end

    context "with a blank path" do
      it "requests the root url" do
        mock(scraper).request_one(scraper.root_url.to_s)
        scraper.build_page ''
      end
    end

    context "with '/'" do
      it "requests the root url" do
        mock(scraper).request_one(scraper.root_url.to_s)
        scraper.build_page '/'
      end
    end

    context "with '/file'" do
      it "requests 'example.com/file' when the base url is 'example.com'" do
        stub(Scraper).base_url { 'http://example.com' }
        mock(scraper).request_one 'http://example.com/file'
        scraper.build_page '/file'
      end

      it "requests 'example.com/file' when the base url is 'example.com/'" do
        stub(Scraper).base_url { 'http://example.com/' }
        mock(scraper).request_one 'http://example.com/file'
        scraper.build_page '/file'
      end
    end

    it "returns the processed response" do
      stub(scraper).request_one { response }
      mock(scraper).handle_response(response) { 'test' }
      assert_equal 'test', scraper.build_page('')
    end

    it "yields the processed response" do
      stub(scraper).request_one { response }
      stub(scraper).handle_response(response) { 'test' }
      scraper.build_page('') { |arg| @arg = arg }
      assert @arg
      assert_equal 'test', @arg
    end
  end
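
  # #build_pages crawls from the root url: each processable response is
  # yielded, and its :internal_urls are queued for requesting, with each url
  # requested at most once regardless of case.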
  describe "#build_pages" do
    let :block do
      Proc.new {}
    end

    it "requests the root url" do
      mock(scraper).request_all(scraper.root_url.to_s)
      scraper.build_pages(&block)
    end

    it "instruments 'running'" do
      stub(scraper).request_all
      scraper.build_pages(&block)
      assert scraper.last_instrumentation
      assert_equal 'running.scraper', scraper.last_instrumentation[:event]
      assert_includes scraper.last_instrumentation[:payload][:urls], scraper.root_url.to_s
    end

    context "when the response is processable" do
      before do
        stub(scraper).request_all.with_any_args { |*args| @returned = args.last.call(response) }
        stub(scraper).handle_response(response) { processed_response }
      end

      it "yields the processed response" do
        scraper.build_pages { |arg| @arg = arg }
        assert @arg
        assert_equal processed_response, @arg
      end

      context "when response[:internal_urls] is empty" do
        before do
          processed_response[:internal_urls] = []
        end

        it "requests nothing more" do
          scraper.build_pages(&block)
          assert_nil @returned
        end

        it "doesn't instrument 'queued'" do
          scraper.build_pages(&block)
          refute_equal 'queued.scraper', scraper.last_instrumentation.try(:[], :event)
        end
      end

      context "when response[:internal_urls] isn't empty" do
        let :urls do
          ['Url']
        end

        before do
          processed_response[:internal_urls] = urls
        end

        it "requests the urls" do
          scraper.build_pages(&block)
          assert_equal urls, @returned
        end

        it "doesn't request the same url twice irrespective of case" do
          stub(Scraper).root_path { 'PATH' }
          processed_response[:internal_urls] = [scraper.root_url.to_s.swapcase]
          scraper.build_pages(&block)
          assert_empty @returned
        end

        it "instruments 'queued'" do
          scraper.build_pages(&block)
          assert scraper.last_instrumentation
          assert_equal 'queued.scraper', scraper.last_instrumentation[:event]
          assert_equal urls, scraper.last_instrumentation[:payload][:urls]
        end
      end
    end

    context "when the response isn't processable" do
      it "doesn't yield" do
        stub(scraper).request_all.yields(response)
        stub(scraper).handle_response(response) { nil }
        scraper.build_pages { @yield = true }
        refute @yield
      end
    end
  end
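
  # #options merges the class-level .options with the instance's urls and
  # paths, then adjusts :skip/:only so the root page ('', '/' or .root_path)
  # is matched consistently.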
  describe "#options" do
    let :options do
      scraper.options
    end

    it "returns a frozen, memoized Hash" do
      assert_instance_of Hash, options
      assert options.frozen?
      assert_same options, scraper.options
    end

    it "includes .options" do
      stub(Scraper).options { { test: true } }
      assert options[:test]
    end

    it "includes #base_url" do
      assert_equal scraper.base_url, options[:base_url]
    end

    it "includes #root_url" do
      assert_equal scraper.root_url, options[:root_url]
    end

    it "includes .root_path" do
      assert_equal '/root', options[:root_path]
    end

    context "when #root_path? is false" do
      before do
        stub(scraper).root_path? { false }
      end

      it "doesn't modify :skip" do
        assert_nil options[:skip]
      end

      context "and :only is an array" do
        before do
          stub(Scraper).options { { only: ['/path'] } }
        end

        it "adds ['', '/']" do
          assert_includes options[:only], '/path'
          assert_includes options[:only], ''
          assert_includes options[:only], '/'
        end

        it "doesn't modify the array in place" do
          assert_equal ['/path'], Scraper.options[:only]
        end
      end

      context "and :only_patterns is an array" do
        it "assigns ['', '/'] to :only" do
          stub(Scraper).options { { only_patterns: [] } }
          assert_equal ['', '/'], options[:only]
        end
      end
    end

    context "when #root_path? is true" do
      before do
        stub(scraper).root_path? { true }
      end

      context "and :skip is nil" do
        it "assigns it ['', '/']" do
          assert_equal ['', '/'], options[:skip]
        end
      end

      context "and :skip is an array" do
        before do
          stub(Scraper).options { { skip: ['/path'] } }
        end

        it "adds ['', '/']" do
          assert_includes options[:skip], '/path'
          assert_includes options[:skip], ''
          assert_includes options[:skip], '/'
        end

        it "doesn't modify the array in place" do
          assert_equal ['/path'], Scraper.options[:skip]
        end
      end

      context "and :only is an array" do
        it "adds .root_path" do
          stub(Scraper).options { { only: [] } }
          assert_includes options[:only], '/root'
        end
      end

      context "and :only_patterns is an array" do
        it "assigns [.root_path] to :only" do
          stub(Scraper).options { { only_patterns: [] } }
          assert_equal ['/root'], options[:only]
        end
      end
    end
  end
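
  # #handle_response parses the body and runs it through #pipeline when the
  # response is processable; otherwise it returns nil. Both outcomes are
  # instrumented.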
  describe "#handle_response" do
    let :result do
      scraper.send :handle_response, response
    end

    context "when the response is processable" do
      before do
        stub(scraper).process_response?(response) { true }
      end

      it "runs the pipeline" do
        mock(scraper.pipeline).call.with_any_args
        result
      end

      it "returns the result" do
        stub(scraper.pipeline).call { |_, _, result| result[:test] = true }
        assert result[:test]
      end

      it "instruments 'process_response'" do
        result
        assert scraper.last_instrumentation
        assert_equal 'process_response.scraper', scraper.last_instrumentation[:event]
        assert_equal response, scraper.last_instrumentation[:payload][:response]
      end

      context "the pipeline document" do
        it "is the parsed response body" do
          response.body = 'body'
          stub(scraper.pipeline).call { |arg| @arg = arg }
          mock.proxy(Docs::Parser).new('body') { |parser| stub(parser).html { 'html' } }
          result
          assert_equal 'html', @arg
        end
      end

      context "the pipeline context" do
        let :context do
          stub(scraper.pipeline).call { |_, arg| @arg = arg }
          result
          @arg
        end

        it "includes #options" do
          stub(scraper).options { { test: true } }
          assert context[:test]
        end

        it "includes the response url" do
          response.url = 'url'
          assert_equal 'url', context[:url]
        end
      end
    end

    context "when the response isn't processable" do
      before do
        stub(scraper).process_response?(response) { false }
      end

      it "doesn't run the pipeline" do
        dont_allow(scraper.pipeline).call
        result
      end

      it "returns nil" do
        assert_nil result
      end

      it "instruments 'ignore_response'" do
        result
        assert scraper.last_instrumentation
        assert_equal 'ignore_response.scraper', scraper.last_instrumentation[:event]
        assert_equal response, scraper.last_instrumentation[:payload][:response]
      end
    end
  end
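
  # #pipeline memoizes an HTML::Pipeline built from .filters, with Docs as
  # its instrumentation service.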
  describe "#pipeline" do
    it "returns an HTML::Pipeline with .filters" do
      stub(Scraper).filters { [1] }
      assert_instance_of ::HTML::Pipeline, scraper.pipeline
      assert_equal Scraper.filters, scraper.pipeline.filters
    end

    it "is memoized" do
      assert_same scraper.pipeline, scraper.pipeline
    end

    it "assigns Docs as the pipeline's instrumentation service" do
      assert_equal Docs, scraper.pipeline.instrumentation_service
    end
  end
end