parse robots.txt

Christian Mehlmauer
2013-08-10 11:35:17 +02:00
parent 6c8e76060a
commit 6c008015e9
10 changed files with 171 additions and 33 deletions


@@ -0,0 +1,64 @@
# encoding: UTF-8

class WebSite
  module RobotsTxt

    # Checks if a robots.txt file exists
    # @return [ Boolean ]
    def has_robots?
      Browser.get(robots_url).code == 200
    end

    # Gets a robots.txt URL
    # @return [ String ]
    def robots_url
      temp = @uri.clone
      temp.path = '/robots.txt'
      temp.to_s
    end
    # Parse robots.txt
    # @return [ Array ] URLs generated from robots.txt
    def parse_robots_txt
      # return an empty Array (not nil) to honour the documented return type
      return [] unless has_robots?

      return_object = []

      response = Browser.get(robots_url.to_s)
      body = response.body

      # Capture the path of every Allow: / Disallow: directive (case-insensitive)
      entries = body.scan(/^(?:dis)?allow:\s*(.*)$/i)

      unless entries.empty?
        entries.flatten!
        # reassign: entries.compact.sort! alone would only sort a temporary copy
        entries = entries.compact.sort

        wordpress_path = @uri.path

        RobotsTxt.known_dirs.each do |d|
          entries.delete(d)
          # also delete the entry when wordpress is installed in a subdirectory
          dir_with_subdir = "#{wordpress_path}/#{d}".gsub(/\/+/, '/')
          entries.delete(dir_with_subdir)
        end

        entries.each do |d|
          temp = @uri.clone
          temp.path = d
          return_object << temp.to_s
        end
      end
      return_object
    end
    protected

    # @return [ Array ]
    def self.known_dirs
      %w{
        /
        /wp-admin/
        /wp-includes/
        /wp-content/
      }
    end

  end
end
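
For illustration only, below is a minimal standalone sketch of the same extraction and filtering steps run against an inline sample robots.txt body. The sample paths and the example.com base URI are hypothetical, and the WPScan internals (Browser, @uri, the mixin itself) are not used here.

# Standalone sketch; assumed names: sample body and example.com base are hypothetical.
require 'uri'

body = "User-agent: *\n"                \
       "Disallow: /wp-admin/\n"         \
       "Disallow: /secret-dir/\n"       \
       "Allow: /wp-admin/admin-ajax.php\n"

# Same pattern as parse_robots_txt: capture the path of every Allow/Disallow line
entries = body.scan(/^(?:dis)?allow:\s*(.*)$/i).flatten.compact.sort
# => ["/secret-dir/", "/wp-admin/", "/wp-admin/admin-ajax.php"]

# Drop the well-known WordPress directories, as known_dirs does
known_dirs  = %w{ / /wp-admin/ /wp-includes/ /wp-content/ }
interesting = entries - known_dirs
# => ["/secret-dir/", "/wp-admin/admin-ajax.php"]

# Build absolute URLs on the target, analogous to cloning @uri and setting its path
base = URI.parse('http://example.com/')
urls = interesting.map do |path|
  temp = base.clone
  temp.path = path
  temp.to_s
end

puts urls
# http://example.com/secret-dir/
# http://example.com/wp-admin/admin-ajax.php

The interesting part for a scanner is that administrators often list exactly the paths they want hidden, so everything left after removing the stock WordPress directories is a candidate for further enumeration.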