scraper.rb

require 'set'

module Docs
  class Scraper < Doc
    class << self
      attr_accessor :base_url, :root_path, :initial_paths, :options, :html_filters, :text_filters, :stubs

      def inherited(subclass)
        super
        subclass.class_eval do
          extend AutoloadHelper
          autoload_all "docs/filters/#{to_s.demodulize.underscore}", 'filter'
        end
        subclass.base_url = base_url
        subclass.root_path = root_path
        subclass.initial_paths = initial_paths.dup
        subclass.options = options.deep_dup
        subclass.html_filters = html_filters.inheritable_copy
        subclass.text_filters = text_filters.inheritable_copy
        subclass.stubs = stubs.dup
      end

      def filters
        html_filters.to_a + text_filters.to_a
      end

      def stub(path, &block)
        @stubs[path] = block
        @stubs
      end
    end

    include Instrumentable

    self.initial_paths = []
    self.options = {}
    self.stubs = {}
    self.html_filters = FilterStack.new
    self.text_filters = FilterStack.new

    html_filters.push 'apply_base_url', 'container', 'clean_html', 'normalize_urls', 'internal_urls', 'normalize_paths', 'parse_cf_email'
    text_filters.push 'images' # ensure the images filter runs after all html filters
    text_filters.push 'inner_html', 'clean_text', 'attribution'

    def initialize
      super
      initialize_stubs
    end

    def initialize_stubs
      self.class.stubs.each do |path, block|
        Typhoeus.stub(url_for(path)).and_return do
          Typhoeus::Response.new \
            effective_url: url_for(path),
            code: 200,
            headers: { 'Content-Type' => 'text/html' },
            body: self.instance_exec(&block)
        end
      end
    end

    def build_page(path)
      response = request_one url_for(path)
      result = handle_response(response)
      yield result if block_given?
      result
    end

    def build_pages
      history = Set.new initial_urls.map(&:downcase)
      instrument 'running.scraper', urls: initial_urls

      request_all initial_urls do |response|
        next unless data = handle_response(response)
        yield data
        next unless data[:internal_urls].present?
        next_urls = data[:internal_urls].select { |url| history.add?(url.downcase) }
        instrument 'queued.scraper', urls: next_urls
        next_urls
      end
    end

    def base_url
      @base_url ||= URL.parse self.class.base_url
    end

    def root_url
      @root_url ||= root_path? ? URL.parse(File.join(base_url.to_s, root_path)) : base_url.normalize
    end

    def root_path
      self.class.root_path
    end

    def root_path?
      root_path.present? && root_path != '/'
    end

    def initial_paths
      self.class.initial_paths
    end

    def initial_urls
      @initial_urls ||= [root_url.to_s].concat(initial_paths.map(&method(:url_for))).freeze
    end

    def pipeline
      @pipeline ||= ::HTML::Pipeline.new(self.class.filters).tap do |pipeline|
        pipeline.instrumentation_service = Docs
      end
    end

    def options
      @options ||= self.class.options.deep_dup.tap do |options|
        options.merge! base_url: base_url, root_url: root_url,
                       root_path: root_path, initial_paths: initial_paths,
                       version: self.class.version, release: self.class.release

        if root_path?
          (options[:skip] ||= []).concat ['', '/']
        end

        if options[:only] || options[:only_patterns]
          (options[:only] ||= []).concat initial_paths + (root_path? ? [root_path] : ['', '/'])
        end

        options.merge!(additional_options)
        options.freeze
      end
    end

    def get_latest_version(options, &block)
      raise NotImplementedError
    end

    # Returns whether or not this scraper is outdated.
    #
    # The default implementation assumes the documentation uses a semver(-like) approach when it comes to versions.
    # Patch updates are ignored because there are usually little to no documentation changes in bug-fix-only releases.
    #
    # Scrapers of documentations that do not use this versioning approach should override this method.
    #
    # Examples of the default implementation:
    #   1 -> 2 = outdated
    #   1.1 -> 1.2 = outdated
    #   1.1.1 -> 1.1.2 = not outdated
    def is_outdated(scraper_version, latest_version)
      scraper_parts = scraper_version.split(/\./).map(&:to_i)
      latest_parts = latest_version.split(/\./).map(&:to_i)

      # Only check the first two parts, the third part is for patch updates
      [0, 1].each do |i|
        break if i >= scraper_parts.length or i >= latest_parts.length
        return true if latest_parts[i] > scraper_parts[i]
        return false if latest_parts[i] < scraper_parts[i]
      end

      false
    end
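
    # Illustrative sketch only (an assumption, not something defined in this
    # file): a documentation set with date-based versions such as "2023-10-01"
    # does not fit the semver comparison above; as the comment notes, such a
    # scraper should override this method, for example along these lines:
    #
    #   def is_outdated(scraper_version, latest_version)
    #     Date.parse(latest_version) > Date.parse(scraper_version)
    #   end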

    private

    def request_one(url)
      raise NotImplementedError
    end

    def request_all(url, &block)
      raise NotImplementedError
    end

    def process_response?(response)
      raise NotImplementedError
    end

    def url_for(path)
      if path.empty? || path == '/'
        root_url.to_s
      else
        File.join(base_url.to_s, path)
      end
    end

    def handle_response(response)
      if process_response?(response)
        instrument 'process_response.scraper', response: response do
          process_response(response)
        end
      else
        instrument 'ignore_response.scraper', response: response
      end
    rescue => e
      if Docs.rescue_errors
        instrument 'error.doc', exception: e, url: response.url
        nil
      else
        raise e
      end
    end

    def process_response(response)
      data = {}
      html, title = parse(response)
      context = pipeline_context(response)
      context[:html_title] = title
      pipeline.call(html, context, data)
      data
    end

    def pipeline_context(response)
      options.merge url: response.url
    end

    def parse(response)
      parser = Parser.new(response.body)
      [parser.html, parser.title]
    end

    def with_filters(*filters)
      stack = FilterStack.new
      stack.push(*filters)
      pipeline.instance_variable_set :@filters, stack.to_a.freeze
      yield
    ensure
      @pipeline = nil
    end

    def additional_options
      {}
    end

    #
    # Utility methods for get_latest_version
    #

    def fetch(url, options, &block)
      headers = {}

      if options.key?(:github_token) and url.start_with?('https://api.github.com/')
        headers['Authorization'] = "token #{options[:github_token]}"
      end

      options[:logger].debug("Fetching #{url}")

      Request.run(url, { headers: headers }) do |response|
        if response.success?
          block.call response.body
        else
          options[:logger].error("Couldn't fetch #{url} (response code #{response.code})")
          block.call nil
        end
      end
    end

    def fetch_doc(url, options, &block)
      fetch(url, options) do |body|
        block.call Nokogiri::HTML.parse(body, nil, 'UTF-8')
      end
    end

    def fetch_json(url, options, &block)
      fetch(url, options) do |body|
        json = JSON.parse(body)
        block.call json
      end
    end

    def get_npm_version(package, options, &block)
      fetch_json("https://registry.npmjs.com/#{package}", options) do |json|
        block.call json['dist-tags']['latest']
      end
    end

    def get_latest_github_release(owner, repo, options, &block)
      fetch_json("https://api.github.com/repos/#{owner}/#{repo}/releases/latest", options, &block)
    end

    def get_github_tags(owner, repo, options, &block)
      fetch_json("https://api.github.com/repos/#{owner}/#{repo}/tags", options, &block)
    end

    def get_github_file_contents(owner, repo, path, options, &block)
      fetch_json("https://api.github.com/repos/#{owner}/#{repo}/contents/#{path}", options) do |json|
        block.call(Base64.decode64(json['content']))
      end
    end
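
    # Illustrative sketch only (the package name below is hypothetical): a
    # subclass tracking an npm package could implement get_latest_version with
    # the helpers above by forwarding its block, e.g.:
    #
    #   def get_latest_version(options, &block)
    #     get_npm_version('some_package', options, &block)
    #   end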

    module FixInternalUrlsBehavior
      def self.included(base)
        base.extend ClassMethods
      end

      def self.prepended(base)
        class << base
          prepend ClassMethods
        end
      end

      module ClassMethods
        def internal_urls
          @internal_urls
        end

        def store_pages(store)
          instrument 'info.doc', msg: 'Building internal urls...'
          with_internal_urls do
            instrument 'info.doc', msg: 'Continuing...'
            super
          end
        end

        private

        def with_internal_urls
          @internal_urls = new.fetch_internal_urls
          yield
        ensure
          @internal_urls = nil
        end
      end

      def fetch_internal_urls
        result = []
        build_pages do |page|
          result << page[:subpath] if page[:entries].present?
        end
        result
      end

      def initial_urls
        return super unless self.class.internal_urls
        @initial_urls ||= self.class.internal_urls.map(&method(:url_for)).freeze
      end

      private

      def additional_options
        if self.class.internal_urls
          super.merge! \
            only: self.class.internal_urls.to_set,
            only_patterns: nil,
            skip: nil,
            skip_patterns: nil,
            skip_links: nil,
            fixed_internal_urls: true
        else
          super
        end
      end

      def process_response(response)
        super.merge! response_url: response.url
      end
    end
  end
end