remove scripts before calculating hashes

This commit is contained in:
Christian Mehlmauer
2016-10-25 20:44:00 +02:00
parent b08e298eba
commit 3c74ee8d97
2 changed files with 17 additions and 3 deletions

View File

@@ -91,15 +91,18 @@ class WebSite
end
# Compute the MD5 of the page
# Comments are deleted from the page to avoid cache generation details
# Comments and scripts are deleted from the page to avoid cache generation details
#
# @param [ String, Typhoeus::Response ] page The url of the response of the page
#
# @return [ String ] The MD5 hash of the page
def self.page_hash(page)
page = Browser.get(page, { followlocation: true, cache_ttl: 0 }) unless page.is_a?(Typhoeus::Response)
Digest::MD5.hexdigest(page.body.gsub(/<!--.*?-->/m, ''))
# remove comments
page = page.body.gsub(/<!--.*?-->/m, '')
# remove javascript stuff
page = page.gsub(/<script\b[^<]*(?:(?!<\/script>)<[^<]*)*<\/script>/m, '')
Digest::MD5.hexdigest(page)
end
def homepage_hash

View File

@@ -176,6 +176,17 @@ describe 'WebSite' do
@expected = "yolo\n\n\nworld!"
end
end
context 'when there are scripts' do
let(:page) {
body = "yolo\n\n<script type=\"text/javascript\">alert('Hi');</script>\nworld!"
Typhoeus::Response.new(body: body)
}
it 'removes them' do
@expected = "yolo\n\n\nworld!"
end
end
end
describe '#homepage_hash' do