parse robots.txt

This commit is contained in:
Christian Mehlmauer
2013-08-10 11:35:17 +02:00
parent 6c8e76060a
commit 6c008015e9
10 changed files with 171 additions and 33 deletions

View File

@@ -1,6 +1,9 @@
# encoding: UTF-8
require 'web_site/robots_txt'
class WebSite
include WebSite::RobotsTxt
attr_reader :uri
@@ -94,18 +97,6 @@ class WebSite
homepage_body[%r{<link .* type="application/rss\+xml" .* href="([^"]+)" />}, 1]
end
# Tells whether the target serves a robots.txt file.
#
# @return [ Boolean ] true when the GET request to #robots_url answers with HTTP 200
def has_robots?
  response = Browser.get(robots_url)
  response.code == 200
end
# Gets a robots.txt URL
#
# robots.txt is only meaningful at the root of the host, so the reference
# is merged as an absolute path ('/robots.txt'). Merging the relative
# 'robots.txt' would resolve against the target's directory and yield
# e.g. /blog/robots.txt for a site installed in /blog/.
#
# @return [ String ]
def robots_url
  @uri.merge('/robots.txt').to_s
end
# Only the first 700 bytes are checked, to avoid downloading
# the whole file, which can be huge (e.g. 2 GB)
#

View File

@@ -0,0 +1,64 @@
# encoding: UTF-8
class WebSite
  # Helpers to locate and parse the robots.txt file of a target.
  # The including object must provide @uri (the methods clone it and
  # read its path).
  module RobotsTxt

    # Checks if a robots.txt file exists
    #
    # @return [ Boolean ] true when the request to #robots_url answers with HTTP 200
    def has_robots?
      Browser.get(robots_url).code == 200
    end

    # Gets a robots.txt URL
    #
    # The path of the target URI is replaced (not merged), so the file is
    # looked up at the root of the host even when the target path points
    # to a subdirectory.
    #
    # @return [ String ]
    def robots_url
      temp = @uri.clone
      temp.path = '/robots.txt'
      temp.to_s
    end

    # Parse robots.txt
    #
    # Collects the values of all Allow / Disallow directives, drops the
    # well known WordPress directories (see RobotsTxt.known_dirs) and turns
    # the remaining entries into URLs based on the target URI.
    #
    # @return [ Array<String>, nil ] Sorted, de-duplicated URLs generated from
    #   robots.txt, or nil when no robots.txt is available
    def parse_robots_txt
      return unless has_robots?

      body = Browser.get(robots_url).body.to_s

      # [ \t]* instead of \s*: \s also matches line breaks, which made an
      # empty "Disallow:" capture the whole following line.
      entries = body.scan(/^(?:dis)?allow:[ \t]*(.*)$/i).flatten

      # Drop inline comments and surrounding whitespace; "." matches "\r",
      # so entries from CRLF files would otherwise end with a carriage return.
      entries.map! { |entry| entry.sub(/#.*/, '').strip }
      entries.reject!(&:empty?)
      entries.uniq!
      entries.sort!

      # Known directories, both at the root and below the WordPress path
      # (for installations living in a subdirectory)
      wordpress_path = @uri.path
      ignored = RobotsTxt.known_dirs.flat_map do |dir|
        [dir, "#{wordpress_path}/#{dir}".gsub(%r{/+}, '/')]
      end

      (entries - ignored).map do |entry|
        begin
          temp = @uri.clone
          temp.path = entry
          temp.to_s
        rescue URI::Error
          # Wildcard / query patterns (e.g. "/*?s=") are not valid URI paths;
          # report the raw entry instead of aborting the whole parse.
          entry
        end
      end
    end

    # Directories present on every WordPress installation, hence of no
    # interest when found in robots.txt.
    #
    # Deliberately public: it is called as RobotsTxt.known_dirs, and a
    # `protected` section marker has no effect on `def self.` methods anyway.
    #
    # @return [ Array ]
    def self.known_dirs
      %w{
        /
        /wp-admin/
        /wp-includes/
        /wp-content/
      }
    end

  end
end

View File

@@ -4,6 +4,7 @@ class WpTarget < WebSite
module InterestingHeaders
# Checks for interesting headers
# @return [ Array ] Interesting Headers
def interesting_headers
response = Browser.head(@uri.to_s)
headers = response.headers
@@ -15,7 +16,7 @@ class WpTarget < WebSite
protected
# @return Array
# @return [ Array ]
def self.known_headers
%w{
Location

View File

@@ -5,7 +5,7 @@ class WpTarget < WebSite
# Checks to see if wp-config.php has a backup
# See http://www.feross.org/cmsploit/
# return an array of backup config files url
# @return [ Array ] Backup config files
def config_backup
found = []
backups = WpConfigBackup.config_backup_files
@@ -37,7 +37,7 @@ class WpTarget < WebSite
found
end
# @return Array
# @return [ Array ]
def self.config_backup_files
%w{
wp-config.php~ #wp-config.php# wp-config.php.save wp-config.php.swp wp-config.php.swo wp-config.php_bak