parse robots.txt

2013-08-10 11:35:17 +02:00
parent 6c8e76060a
commit 6c008015e9
10 changed files with 171 additions and 33 deletions
--- a/lib/wpscan/web_site.rb
+++ b/lib/wpscan/web_site.rb
@@ -1,6 +1,9 @@
 # encoding: UTF-8

+require 'web_site/robots_txt'
+
 class WebSite
+  include WebSite::RobotsTxt

  attr_reader :uri

@@ -94,18 +97,6 @@ class WebSite
    homepage_body[%r{<link .* type="application/rss\+xml" .* href="([^"]+)" />}, 1]
  end

-  # Checks if a robots.txt file exists
-  def has_robots?
-    Browser.get(robots_url).code == 200
-  end
-
-  # Gets a robots.txt URL
-  #
-  # @return [ String ]
-  def robots_url
-    @uri.merge('robots.txt').to_s
-  end
-
  # Only the first 700 bytes are checked to avoid the download
  # of the whole file which can be very huge (like 2 Go)
  #
--- a/lib/wpscan/web_site/robots_txt.rb
+++ b/lib/wpscan/web_site/robots_txt.rb
@@ -0,0 +1,64 @@
+# encoding: UTF-8
+
+class WebSite
+  module RobotsTxt
+
+    # Checks if a robots.txt file exists
+    # @return [ Boolean ]
+    def has_robots?
+      Browser.get(robots_url).code == 200
+    end
+
+    # Gets a robots.txt URL
+    # @return [ String ]
+    def robots_url
+      temp = @uri.clone
+      temp.path = '/robots.txt'
+      temp.to_s
+    end
+
+
+    # Parse robots.txt
+    # @return [ Array ] URLs generated from robots.txt
+    def parse_robots_txt
+      return unless has_robots?
+
+      return_object = []
+      response = Browser.get(robots_url.to_s)
+      body = response.body
+      # Get all allow and disallow urls
+      entries = body.scan(/^(?:dis)?allow:\s*(.*)$/i)
+      if entries
+        entries.flatten!
+        entries.compact.sort!
+        wordpress_path = @uri.path
+        RobotsTxt.known_dirs.each do |d|
+          entries.delete(d)
+          # also delete when wordpress is installed in subdir
+          dir_with_subdir = "#{wordpress_path}/#{d}".gsub(/\/+/, '/')
+          entries.delete(dir_with_subdir)
+        end
+
+        entries.each do |d|
+          temp = @uri.clone
+          temp.path = d
+          return_object << temp.to_s
+        end
+      end
+      return_object
+    end
+
+    protected
+
+    # @return [ Array ]
+    def self.known_dirs
+      %w{
+        /
+        /wp-admin/
+        /wp-includes/
+        /wp-content/
+      }
+    end
+
+  end
+end
--- a/lib/wpscan/wp_target/interesting_headers.rb
+++ b/lib/wpscan/wp_target/interesting_headers.rb
@@ -4,6 +4,7 @@ class WpTarget < WebSite
  module InterestingHeaders

    # Checks for interesting headers
+    # @return [ Array ] Interesting Headers
    def interesting_headers
      response = Browser.head(@uri.to_s)
      headers = response.headers
@@ -15,7 +16,7 @@ class WpTarget < WebSite

    protected

-    # @return Array
+    # @return [ Array ]
    def self.known_headers
      %w{
        Location
--- a/lib/wpscan/wp_target/wp_config_backup.rb
+++ b/lib/wpscan/wp_target/wp_config_backup.rb
@@ -5,7 +5,7 @@ class WpTarget < WebSite

    # Checks to see if wp-config.php has a backup
    # See http://www.feross.org/cmsploit/
-    # return an array of backup config files url
+    # @return [ Array ] Backup config files
    def config_backup
      found       = []
      backups     = WpConfigBackup.config_backup_files
@@ -37,7 +37,7 @@ class WpTarget < WebSite
      found
    end

-    # @return Array
+    # @return [ Array ]
    def self.config_backup_files
      %w{
        wp-config.php~ #wp-config.php# wp-config.php.save wp-config.php.swp wp-config.php.swo wp-config.php_bak
--- a/spec/lib/wpscan/web_site_spec.rb
+++ b/spec/lib/wpscan/web_site_spec.rb
@@ -6,6 +6,8 @@ describe 'WebSite' do
  let(:fixtures_dir) { SPEC_FIXTURES_WPSCAN_WEB_SITE_DIR }
  subject(:web_site) { WebSite.new('http://example.localhost/') }

+  it_behaves_like 'WebSite::RobotsTxt'
+
  before :all do
    Browser::reset
    Browser.instance(
@@ -147,24 +149,6 @@ describe 'WebSite' do
    end
  end

-  describe '#robots_url' do
-    it 'returns the correct url' do
-      web_site.robots_url.should === 'http://example.localhost/robots.txt'
-    end
-  end
-
-  describe '#has_robots?' do
-    it 'returns true' do
-      stub_request(:get, web_site.robots_url).to_return(status: 200)
-      web_site.has_robots?.should be_true
-    end
-
-    it 'returns false' do
-      stub_request(:get, web_site.robots_url).to_return(status: 404)
-      web_site.has_robots?.should be_false
-    end
-  end
-
  describe '::has_log?' do
    let(:log_url) { web_site.uri.merge('log.txt').to_s }
    let(:pattern) { %r{PHP Fatal error} }
--- a/spec/samples/wpscan/web_site/robots_txt/empty_robots.txt
+++ b/spec/samples/wpscan/web_site/robots_txt/empty_robots.txt
--- a/spec/samples/wpscan/web_site/robots_txt/invalid_robots.txt
+++ b/spec/samples/wpscan/web_site/robots_txt/invalid_robots.txt
@@ -0,0 +1,6 @@
+ölhkdfys
+opihufgcasfa
+dsds
+fsdf
+s
+dtf
--- a/spec/samples/wpscan/web_site/robots_txt/robots.txt
+++ b/spec/samples/wpscan/web_site/robots_txt/robots.txt
@@ -0,0 +1,10 @@
+User-agent: *
+Disallow: /wp-admin/
+Disallow: /wp-includes/
+Disallow: /wordpress/admin/
+Disallow: /wordpress/wp-admin/
+Disallow: /wordpress/secret/
+Disallow: /Wordpress/wp-admin/
+Allow: /asdf/
+
+Sitemap: http://10.0.0.0/sitemap.xml.gz
--- a/spec/shared_examples/web_site/robots_txt.rb
+++ b/spec/shared_examples/web_site/robots_txt.rb
@@ -0,0 +1,78 @@
+# encoding: UTF-8
+
+# Shared examples for the WebSite::RobotsTxt module.
+# The including spec has to provide `web_site` (the expected URLs below
+# assume it targets http://example.localhost/) and `fixtures_dir`.
+shared_examples 'WebSite::RobotsTxt' do
+  let(:known_dirs) { WebSite::RobotsTxt.known_dirs }
+
+  describe '#robots_url' do
+    it 'returns the correct url' do
+      web_site.robots_url.should === 'http://example.localhost/robots.txt'
+    end
+  end
+
+  describe '#has_robots?' do
+    it 'returns true' do
+      stub_request(:get, web_site.robots_url).to_return(status: 200)
+      web_site.has_robots?.should be_true
+    end
+
+    it 'returns false' do
+      stub_request(:get, web_site.robots_url).to_return(status: 404)
+      web_site.has_robots?.should be_false
+    end
+  end
+
+  describe '#parse_robots_txt' do
+
+    context 'installed in root' do
+      # Each example of this context only sets @fixture and @expected;
+      # the request stubbing and the assertion are performed in this hook.
+      # NOTE(review): stub_request_to_fixture is defined outside this file,
+      # presumably it answers the URL with the fixture content - confirm
+      after :each do
+        stub_request_to_fixture(url: web_site.robots_url, fixture: @fixture)
+        robots = web_site.parse_robots_txt
+        # On Arrays, `should =~` does not take the order into account
+        robots.should =~ @expected
+      end
+
+      it 'returns an empty Array (empty robots.txt)' do
+        @fixture = fixtures_dir + '/robots_txt/empty_robots.txt'
+        @expected = []
+      end
+
+      it 'returns an empty Array (invalid robots.txt)' do
+        @fixture = fixtures_dir + '/robots_txt/invalid_robots.txt'
+        @expected = []
+      end
+
+      # /wp-admin/ and /wp-includes/ from the fixture are known dirs and
+      # therefore not expected in the result
+      it 'returns an Array of urls (valid robots.txt)' do
+        @fixture = fixtures_dir + '/robots_txt/robots.txt'
+        @expected = %w(
+          http://example.localhost/wordpress/admin/
+          http://example.localhost/wordpress/wp-admin/
+          http://example.localhost/wordpress/secret/
+          http://example.localhost/Wordpress/wp-admin/
+          http://example.localhost/asdf/
+        )
+      end
+    end
+
+    context 'installed in sub directory' do
+      # Same fixture as above, but the known dirs prefixed by the install
+      # path are filtered out too: /wordpress/wp-admin/ is not expected,
+      # while /Wordpress/wp-admin/ (different case) still is
+      it 'returns an Array of urls (valid robots.txt, WP installed in subdir)' do
+        web_site_sub = WebSite.new('http://example.localhost/wordpress/')
+        fixture = fixtures_dir + '/robots_txt/robots.txt'
+        expected = %w(
+            http://example.localhost/wordpress/admin/
+            http://example.localhost/wordpress/secret/
+            http://example.localhost/Wordpress/wp-admin/
+            http://example.localhost/asdf/
+          )
+        stub_request_to_fixture(url: web_site_sub.robots_url, fixture: fixture)
+        robots = web_site_sub.parse_robots_txt
+        robots.should =~ expected
+      end
+    end
+  end
+
+  describe '#known_dirs' do
+    it 'does not contain duplicates' do
+      known_dirs.flatten.uniq.length.should == known_dirs.length
+    end
+  end
+
+end
--- a/wpscan.rb
+++ b/wpscan.rb
@@ -105,6 +105,10 @@ def main

    if wp_target.has_robots?
      puts green('[+]') + " robots.txt available under '#{wp_target.robots_url}'"
+
+      wp_target.parse_robots_txt.each do |dir|
+        puts "#{green('[+]')} Interesting entry from robots.txt: #{dir}"
+      end
    end

    if wp_target.has_readme?