# encoding: UTF-8

require 'web_site/robots_txt'
require 'web_site/interesting_headers'

# Wraps the URL of a remote website and offers helpers to probe it
# (availability, SSL errors, redirections, page hashes, ...).
#
# All HTTP requests go through Browser, and URL normalisation goes through
# add_http_protocol / add_trailing_slash; these are defined outside this file.
class WebSite
  include WebSite::RobotsTxt
  include WebSite::InterestingHeaders

  # Return codes treated as SSL failures by #ssl_error?
  # See http://www.rubydoc.info/github/typhoeus/ethon/Ethon/Easy:return_code
  SSL_ERROR_CODES = [
    :ssl_connect_error,
    :peer_failed_verification,
    :ssl_certproblem,
    :ssl_cipher,
    :ssl_cacert,
    :ssl_cacert_badfile,
    :ssl_issuer_error,
    :ssl_crl_badfile,
    :ssl_engine_setfailed,
    :ssl_engine_notfound
  ].freeze

  # HTTP status codes followed by #redirection
  REDIRECT_CODES = [301, 302].freeze

  attr_reader :uri

  # @param [ String ] site_url The url of the website
  def initialize(site_url)
    self.url = site_url
  end

  # Stores the url as a URI, after passing it through add_http_protocol and
  # add_trailing_slash.
  #
  # @param [ String ] url
  def url=(url)
    @uri = URI.parse(add_trailing_slash(add_http_protocol(url)))
  end

  # @return [ String ] The url of the website
  def url
    @uri.to_s
  end

  # Checks if the remote website has ssl errors
  #
  # @return [ Boolean ] Always false when the scheme is not https
  def ssl_error?
    return false unless @uri.scheme == 'https'

    SSL_ERROR_CODES.include?(get_root_path_return_code)
  end

  # @return [ Object ] The return_code of the response to a GET on the url
  #   (compared against symbols in #ssl_error?)
  def get_root_path_return_code
    Browser.get(@uri.to_s).return_code
  end

  # Checks if the remote website is up.
  #
  # @return [ Boolean ] true when the response code is not 0
  def online?
    Browser.get(@uri.to_s).code != 0
  end

  # @return [ Boolean ] true when the url answers with a 401
  def has_basic_auth?
    Browser.get(@uri.to_s).code == 401
  end

  # Checks if the body served at #xml_rpc_url contains the
  # "XML-RPC server accepts POST requests only" message (case insensitive).
  #
  # @return [ Integer, nil ] The index of the match (truthy), nil otherwise
  def has_xml_rpc?
    response = Browser.get_and_follow_location(xml_rpc_url)

    response.body =~ %r{XML-RPC server accepts POST requests only}i
  end

  # See http://www.hixie.ch/specs/pingback/pingback-1.0#TOC2.3
  #
  # @return [ String ] The url merged with 'xmlrpc.php' (memoized)
  def xml_rpc_url
    @xmlrpc_url ||= @uri.merge('xmlrpc.php').to_s
  end

  # See if the remote url returns a 301 or 302 redirect, and follows the
  # chain of redirections until a response is not a redirect.
  #
  # The chain also stops when a redirect has no location header, or when it
  # points to an url already visited (self redirect or longer cycle), so a
  # redirection loop can not make this method run forever.
  #
  # @param [ String ] url The url to check (default: the website url)
  #
  # @return [ String, nil ] The last redirection found, nil if there is none
  def redirection(url = nil)
    current = url || @uri.to_s
    visited = [current]
    target  = nil

    loop do
      response = Browser.get(current)
      base_uri = URI.parse(add_trailing_slash(add_http_protocol(current)))

      break unless REDIRECT_CODES.include?(response.code)

      location = response.headers_hash['location']
      break unless location # redirect status without a target

      # The location may be relative, hence the merge with the requested url
      target = base_uri.merge(location).to_s

      break if visited.include?(target) # prevents infinite loop

      visited << target
      current = target
    end

    target
  end

  # Compute the MD5 of the page
  # Comments are deleted from the page to avoid cache generation details
  #
  # @param [ String, Typhoeus::Response ] page The url or the response of the page
  #
  # @return [ String ] The MD5 hash of the page
  def self.page_hash(page)
    unless page.is_a?(Typhoeus::Response)
      page = Browser.get(page, { followlocation: true, cache_ttl: 0 })
    end

    # NOTE(review): the pattern was empty here (which made the gsub a no-op);
    # it has been restored to strip HTML comments, as documented above.
    # Confirm against upstream.
    Digest::MD5.hexdigest(page.body.gsub(/<!--.*?-->/m, ''))
  end

  # @return [ String ] The MD5 hash of the homepage (memoized)
  def homepage_hash
    @homepage_hash ||= WebSite.page_hash(@uri.to_s)
  end

  # Return the MD5 hash of a 404 page
  # The page requested has a random name (MD5 of a random number + '.html')
  #
  # @return [ String ] The hash (memoized)
  def error_404_hash
    @error_404_hash ||= begin
      missing_page = Digest::MD5.hexdigest(rand(999_999_999).to_s) + '.html'

      WebSite.page_hash(@uri.merge(missing_page).to_s)
    end
  end

  # Will try to find the rss url in the homepage
  # Only the first one found is returned
  #
  # @return [ String, nil ] The href of the rss link, nil if not found
  def rss_url
    homepage_body = Browser.get(@uri.to_s).body

    # NOTE(review): the pattern was empty here, so there was no capture
    # group 1 and nothing could ever be returned; it has been restored to
    # capture the href of an application/rss+xml link. Confirm against upstream.
    homepage_body[%r{<link .* type="application/rss\+xml" .* href="([^"]+)" />}, 1]
  end

  # Only the first bytes (range 0-700) are requested to avoid the download
  # of the whole file which can be very huge (like 2 GB)
  #
  # @param [ String ] log_url
  # @param [ RegEx ] pattern
  #
  # @return [ Boolean ] Whether the pattern is found in the retrieved body
  def self.has_log?(log_url, pattern)
    log_body = Browser.get(log_url, headers: { 'range' => 'bytes=0-700' }).body

    log_body[pattern] ? true : false
  end
end