rust.rb 2.0 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465
  1. # frozen_string_literal: true
  2. module Docs
  3. class Rust < UrlScraper
  4. self.type = 'rust'
  5. self.release = '1.91.1'
  6. self.base_url = 'https://doc.rust-lang.org/'
  7. self.root_path = 'book/index.html'
  8. self.initial_paths = %w(
  9. reference/introduction.html
  10. std/index.html
  11. error_codes/error-index.html)
  12. self.links = {
  13. home: 'https://www.rust-lang.org/',
  14. code: 'https://github.com/rust-lang/rust'
  15. }
  16. html_filters.push 'rust/entries', 'rust/clean_html'
  17. options[:only_patterns] = [
  18. /\Abook\//,
  19. /\Areference\//,
  20. /\Acollections\//,
  21. /\Astd\//,
  22. /\Aerror_codes\//, ]
  23. options[:skip] = %w(book/README.html book/ffi.html)
  24. options[:skip_patterns] = [/(?<!\.html)\z/, /\/print\.html/, /\Abook\/second-edition\//]
  25. options[:fix_urls] = ->(url) do
  26. url.sub! %r{(#{Rust.base_url}.+/)\z}, '\1index.html'
  27. url.sub! "#{Rust.base_url}nightly/", Rust.base_url
  28. url.sub! '/unicode/u_str', '/unicode/str/'
  29. url.sub! '/std/std/', '/std/'
  30. url
  31. end
  32. options[:attribution] = <<-HTML
  33. &copy; 2010 The Rust Project Developers<br>
  34. Licensed under the Apache License, Version 2.0 or the MIT license, at your option.
  35. HTML
  36. def get_latest_version(opts)
  37. doc = fetch_doc('https://www.rust-lang.org/', opts)
  38. label = doc.at_css('.button-download + p > a').content
  39. label.sub(/Version /, '')
  40. end
  41. private
  42. REDIRECT_RGX = /http-equiv="refresh"/i
  43. NOT_FOUND_RGX = /<title>Not Found<\/title>/
  44. def process_response?(response)
  45. !(response.body =~ REDIRECT_RGX || response.body =~ NOT_FOUND_RGX || response.body.blank?)
  46. end
  47. def parse(response) # Hook here because Nokogori removes whitespace from headings
  48. response.body.gsub! %r{<h[1-6] class="code-header">}, '<pre class="code-header">'
  49. # And the reference uses whitespace for indentation in grammar definitions
  50. response.body.gsub! %r{<div class="grammar-container">([\W\w]+?)</div>}, '<pre class="grammar-container">\1</pre>'
  51. super
  52. end
  53. end
  54. end