# encoding: UTF-8

class WebSite
  # Helpers for locating a target's robots.txt and turning its rules into URLs.
  #
  # The instance methods read @uri from the including object and call
  # #path, #clone and #merge on it; the rescue of URI::Error below suggests
  # it is a stdlib URI (NOTE(review): confirm against the including class).
  # HTTP access goes through Browser.get, which is defined outside this file.
  module RobotsTxt
    # Checks if a robots.txt file exists
    #
    # Issues a GET on #robots_url and compares the response code with 200.
    #
    # @return [ Boolean ]
    def has_robots?
      Browser.get(robots_url).code == 200
    end

    # Gets a robots.txt URL
    #
    # Built by merging the relative reference 'robots.txt' onto @uri.
    #
    # @return [ String ]
    def robots_url
      @uri.clone.merge('robots.txt').to_s
    end

    # Parse robots.txt
    #
    # Collects the value of every Allow:/Disallow: line, trims it, drops
    # empty values and duplicates, sorts the remainder, removes the
    # well-known WordPress directories (see RobotsTxt.known_dirs), both
    # bare and prefixed with the path of @uri, and converts what is left
    # into URLs based on @uri.
    #
    # Note that #has_robots? already performs one request; the body is
    # fetched with a second one.
    #
    # @return [ Array<String>, nil ] URLs generated from robots.txt, or nil
    #   when #has_robots? is false. An entry that cannot be assigned as a
    #   URI path (URI::Error) is returned as the trimmed raw value instead.
    def parse_robots_txt
      return unless has_robots?

      body = Browser.get(robots_url).body.to_s

      # Get all allow and disallow values.
      # [ \t]* (instead of \s*) keeps the match on a single line, so an empty
      # "Disallow:" does not swallow the following line as its value.
      entries = body.scan(/^(?:dis)?allow:[ \t]*(.*)$/i).flatten

      # Trim first: with CRLF line endings the capture ends in "\r", which
      # would otherwise defeat both uniq and the known_dirs filtering.
      entries = entries.map(&:strip).reject(&:empty?).uniq.sort

      wordpress_path = @uri.path
      RobotsTxt.known_dirs.each do |dir|
        entries.delete(dir)
        # also delete when wordpress is installed in subdir
        entries.delete("#{wordpress_path}/#{dir}".gsub(%r{/+}, '/'))
      end

      entries.map do |entry|
        begin
          url = @uri.clone
          url.path = entry
          url.to_s
        rescue URI::Error
          # Not usable as a path component: hand back the raw value.
          entry
        end
      end
    end

    # Directories that are not worth reporting when found in robots.txt.
    #
    # Defined on the module itself and called as RobotsTxt.known_dirs, so a
    # visibility keyword such as `protected` would have no effect on it.
    #
    # @return [ Array ]
    def self.known_dirs
      %w[
        /
        /wp-admin/
        /wp-includes/
        /wp-content/
      ]
    end
  end
end