diff --git a/lib/wpscan/web_site.rb b/lib/wpscan/web_site.rb
index 5b2b445c..cfa67389 100644
--- a/lib/wpscan/web_site.rb
+++ b/lib/wpscan/web_site.rb
@@ -4,12 +4,14 @@ require 'web_site/robots_txt'
 require 'web_site/humans_txt'
 require 'web_site/security_txt'
 require 'web_site/interesting_headers'
+require 'web_site/sitemap'
 
 class WebSite
   include WebSite::RobotsTxt
   include WebSite::HumansTxt
   include WebSite::SecurityTxt
   include WebSite::InterestingHeaders
+  include WebSite::Sitemap
 
   attr_reader :uri
 
diff --git a/lib/wpscan/web_site/sitemap.rb b/lib/wpscan/web_site/sitemap.rb
new file mode 100644
index 00000000..f6aae16e
--- /dev/null
+++ b/lib/wpscan/web_site/sitemap.rb
@@ -0,0 +1,68 @@
+# encoding: UTF-8
+
+class WebSite
+  module Sitemap
+
+    # Checks if at least one Sitemap entry is declared in robots.txt
+    # @return [ Boolean ]
+    def has_sitemap?
+      # Make the request
+      response = Browser.get(sitemap_url)
+
+      # Make sure it is HTTP 200
+      return false unless response.code == 200
+
+      # Is there a Sitemap value?
+      result = response.body.scan(/^sitemap\s*:\s*(.*)$/i)
+      return true if result[0]
+      return false
+    end
+
+    # Gets the robots.txt URL (Sitemap entries are declared there)
+    # @return [ String ]
+    def sitemap_url
+      @uri.clone.merge('robots.txt').to_s
+    end
+
+    # Parses the Sitemap entries from robots.txt
+    # @return [ Array ] Sitemap URLs found in robots.txt
+    def parse_sitemap
+      return_object = []
+
+      # Make request
+      response = Browser.get(sitemap_url)
+      body     = response.body
+
+      # Get all Sitemap entries
+      entries = body.scan(/^sitemap\s*:\s*(.*)$/i)
+
+      # Did we get something?
+      unless entries.empty?
+        # Extract elements
+        entries.flatten!
+        # Remove any leading/trailing spaces
+        entries.collect! { |x| x.strip }
+        # End Of Line issues
+        entries.collect! { |x| x.chomp }
+        # Remove nil's and sort
+        entries = entries.compact.sort
+        # Unique values only
+        entries.uniq!
+
+        # Each value now, try and make it a full URL
+        entries.each do |d|
+          begin
+            temp      = @uri.clone
+            temp.path = d
+          rescue URI::Error
+            # Full URLs (the usual case for Sitemap entries) can't be set
+            # as a path, so keep the raw value
+            temp = d
+          end
+          return_object << temp.to_s
+        end
+
+      end
+      return_object
+    end
+
+  end
+end
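
A minimal usage sketch of the new mixin, assuming a target object that includes WebSite; the WpTarget.new call and the example URL are assumptions for illustration, not part of this changeset:

  # Hypothetical caller; only has_sitemap? and parse_sitemap come from the diff above
  wp_target = WpTarget.new('http://example.com/')

  if wp_target.has_sitemap?
    # parse_sitemap returns an Array of URL strings built from the
    # Sitemap entries found in robots.txt
    wp_target.parse_sitemap.each do |url|
      puts "Sitemap entry: #{url}"
    end
  end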