parse robots.txt

This commit is contained in:
Christian Mehlmauer
2013-08-10 11:35:17 +02:00
parent 6c8e76060a
commit 6c008015e9
10 changed files with 171 additions and 33 deletions

View File

@@ -1,6 +1,9 @@
# encoding: UTF-8
require 'web_site/robots_txt'
class WebSite
include WebSite::RobotsTxt
attr_reader :uri
@@ -94,18 +97,6 @@ class WebSite
homepage_body[%r{<link .* type="application/rss\+xml" .* href="([^"]+)" />}, 1]
end
# Whether the remote site exposes a robots.txt file
#
# @return [ Boolean ] true if requesting the robots.txt URL yields HTTP 200
def has_robots?
  response = Browser.get(robots_url)
  response.code == 200
end
# Builds the robots.txt URL from the site's base URI
#
# @return [ String ]
def robots_url
  robots_uri = @uri.merge('robots.txt')
  robots_uri.to_s
end
# Only the first 700 bytes are checked, to avoid downloading
# the whole file, which can be very large (e.g. 2 GB)
#