Created
September 8, 2012 13:07
-
-
Save addisaden/3674778 to your computer and use it in GitHub Desktop.
Ein Skript, das ich vor einem Jahr als Prototyp entworfen habe, um Webseiten zu analysieren.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env ruby | |
# encoding: utf-8 | |
require "net/http"
require "readline"
require "uri"
# Interactive website analyzer.
#
# Each instance represents one host and stores the links that were found for
# it: @intern holds links on the same host (with fetched response, title,
# outgoing links and text content), @extern holds links pointing to other
# hosts. The class itself keeps a registry of all hosts plus the name of the
# current session and offers a small command console on top of it.
class Website
  ############---------------------------------------------------------- CLASS

  # Registry of all known hosts.
  @@hosts = {} # "www.23g.eu" => WebsiteObj
  # Name of the current session (file name without ".session"), nil if unnamed.
  @@sessionname = nil

  # Options used for every regular expression built from console input.
  USER_REGEXP_FLAGS = Regexp::MULTILINE | Regexp::IGNORECASE

  # An answer counts as "yes" only when it starts with "j" or "ja" as a whole
  # word, so "nein, jetzt nicht" is no longer mistaken for a confirmation.
  YES_ANSWER = /\A\s*ja?\b/i

  # File name suffix of stored sessions.
  SESSION_SUFFIX = ".session"

  # Path endings that are fetched; any other file extension is skipped.
  SCANNABLE_FILE = /\.(?:html|htm|txt|xml|rss|php|do)$/i

  # Help text of the main console, one entry per printed line.
  MAIN_HELP = [
    "\n__Hilfe__",
    "",
    " Befehlsübersicht:",
    "add <link> - Fügt einen Link in die aktuelle Datenbank",
    "list - Listet die aktuellen Hosts auf",
    "ls - Synonym für list",
    "cd <host> - Open <host>",
    "remove <host> - Löscht den angegebenen Host",
    "remove r <regularexpression> - Löscht die gefundenen Hosts (RegExp)",
    "rename - Umbennenung der Session",
    "save - Sichern der Session",
    "load - Laden einer Session",
    "help - Gibt diesen Text aus",
    "exit - Beendet die Console"
  ].freeze

  # Help text of the per-host console, one entry per printed line.
  HOST_HELP = [
    "\n__Hilfe__Host__",
    "",
    " Befehlsübersicht:",
    "search <regexp> - search with regex",
    "body <regexp> - search bodys with regex",
    "bodylinks - makro zum auflisten aller href-Links",
    "cd .. - Geht wieder zum Hauptmenu",
    "cd <host> - Öffne Host",
    "show - Zeigt status des Hosts",
    "ls - Synonym für show",
    "show extend - Zeigt alle Links die mit extend aktiviert werden können",
    "show source - Seite auswählen und Quelltext anzeigen lassen",
    "show links - Zeigt die Links an",
    "show words - Zählt die Wörter und bildet ein Ranking",
    "scan - Scannt interne Links",
    "scan extern - Scannt externe Links",
    "scan extern <host> - Scannt externe Links von angegebenen Host",
    "scan complete - Scannt den kompletten internen Bereich",
    "scan all - Scannt interne und externe links",
    "extend - Pusht Links von gescannten Links in die Datenbank",
    "extend <f> - (regexp) wie extend, f => filter für links die extended werden sollen",
    "extend <f> <df> - (regexp) wie extend, f => wie oben, df => filter für links die extended werden",
    "add <link> - Einen Link hinzufügen",
    "add intern <link> - Einen internen Link hinzufügen -> Bsp. http://xyz.net/new : add intern new",
    "help - Gibt diesen Text aus"
  ].freeze

  # Runs the main console until the user enters "exit" or closes the input.
  def self.console
    loop do
      line = Readline.readline(">> ", true)
      break if line.nil? # readline returns nil on EOF (Ctrl-D)
      input = line.strip
      case input
      when /^exit/i then break
      when /^add\s+\S+$/i then add(input.split[1], true)
      when /^list$/i, /^ls$/i then list
      when /^rename$/i then renameSession
      when /^save$/i then saveSession
      when /^load$/i then loadSession
      when /^help$/i then helpConsole
      when /^reset$/i then resetSession
      when /^cd\s+\S+$/i then openHost(input.split[1])
      when /^remove\s+\S+$/i then removeHost(input.split[1])
      when /^remove\s+r\s+\S+$/i then removeRegexHost(input.split[2])
      else helpConsole
      end
    end
  end

  # Prints +prompt+, reads one line from standard input and reports whether
  # it is an affirmative answer (see YES_ANSWER). EOF counts as "no".
  #
  # Reads from $stdin explicitly because Kernel#gets would read from files
  # named on the command line.
  def self.confirm?(prompt)
    print prompt
    answer = $stdin.gets
    !answer.nil? && !(answer =~ YES_ANSWER).nil?
  end

  # Removes every registered host whose name matches the regular expression
  # given as the string +host+. Problems are reported on standard output.
  def self.removeRegexHost(host)
    pattern = Regexp.new(host, USER_REGEXP_FLAGS)
    doomed = @@hosts.keys.select { |name| name =~ pattern }
    if doomed.empty?
      puts "Error :: Keine übereinstimmenden Hosts gefunden."
    else
      doomed.each { |name| @@hosts.delete(name) }
    end
  rescue => msg
    puts "Error :: #{msg}"
  end

  # Removes the host registered under exactly the name +host+.
  def self.removeHost(host)
    if @@hosts.key?(host)
      @@hosts.delete(host)
    else
      puts "Error :: #{host} konnte nicht gefunden werden. Versuchen Sie remove r <regularexpression>."
    end
  end

  # Enters the console of the host registered as +hostkey+, if there is one.
  def self.openHost(hostkey)
    site = @@hosts[hostkey]
    if site
      site.console
    else
      puts "Host #{hostkey} wurde nicht gefunden."
    end
  end

  # Clears all hosts and the session name after asking for confirmation and
  # optionally saving first. Returns true when the session was reset.
  def self.resetSession
    unless confirm?("Sind Sie sicher, dass Sie die Session komplett zurücksetzen wollen? (ja/nein) ")
      puts "Zurücksetzen der Session abgebrochen."
      return false
    end
    if confirm?("Wollen Sie die aktuelle Session vorher sichern? (ja/nein) ")
      return false unless saveSession
    end
    @@hosts = {}
    @@sessionname = nil
    puts "Session wurde erfolgreich zurückgesetzt."
    true
  end

  # Prints the command overview of the main console.
  def self.helpConsole
    MAIN_HELP.each { |text| puts text }
  end

  # Asks for a new session name until a non-empty name made only of word
  # characters is entered. Returns true when a name was set; on EOF the
  # previous name is kept and false is returned.
  def self.renameSession
    puts "Aktueller Sessionname: #{ @@sessionname || 'Noch kein Sessionname vorhanden.'}"
    print "Sessionname: "
    loop do
      line = $stdin.gets
      return false if line.nil?
      candidate = line.strip
      # An empty name would produce a file called ".session", so require
      # at least one character.
      if candidate =~ /\A\w+\z/
        @@sessionname = candidate
        return true
      end
      print "Ungültiger Name, bitte nochmal eingeben: "
    end
  end

  # Serializes all hosts into "<sessionname>.session", asking for a name
  # first if none is set and for confirmation before overwriting.
  # Returns true on success, false otherwise.
  def self.saveSession
    renameSession unless @@sessionname
    unless @@sessionname
      puts "Speicherung der Session abgebrochen."
      return false
    end
    path = "#{@@sessionname}#{SESSION_SUFFIX}"
    if File.exist?(path) && !confirm?("Session vorhanden. Soll diese überschrieben werden? (ja/nein) ")
      puts "Speicherung der Session abgebrochen."
      return false
    end
    begin
      # Marshal output is binary, so write it in binary mode and without
      # the newline that puts would append.
      File.open(path, "wb") { |file| file.write(Marshal.dump(@@hosts)) }
      puts "Speicherung der Session erfolgreich."
    rescue => msg
      puts "Etwas ist bei der Speicherung schief gelaufen.\nERR: #{msg} \n"
      return false
    end
    true
  end

  # Lists the stored sessions of the working directory, asks for one and
  # replaces the current hosts with its content. Returns true on success.
  def self.loadSession
    # Do not silently throw away a session that has content.
    unless @@hosts.empty?
      unless confirm?("Aktuelle Session ist nicht leer, soll diese überschrieben werden? (ja/nein) ")
        puts "Laden der Session abgebrochen."
        return false
      end
    end
    sessions = Dir["*#{SESSION_SUFFIX}"].collect { |file| File.basename(file, SESSION_SUFFIX) }
    puts sessions.join(", ")
    print "Sessionnamen wählen: "
    sname = $stdin.gets.to_s.strip
    path = "#{sname}#{SESSION_SUFFIX}"
    unless File.exist?(path)
      puts "Session nicht vorhanden."
      return false
    end
    begin
      # NOTE(review): Marshal.load can instantiate arbitrary objects; only
      # session files from a trusted source should be loaded.
      loaded = Marshal.load(File.open(path, "rb") { |file| file.read })
    rescue => msg
      puts "Error :: #{msg}"
      return false
    end
    @@hosts = loaded
    @@sessionname = sname
    puts "Laden der Session erfolgreich."
    true
  end

  # Prints the names of all registered hosts, sorted and comma separated.
  def self.list
    puts @@hosts.keys.sort.join(", ")
  end

  # Adds +link+ to the host it belongs to, creating the host on first use.
  # A missing scheme is completed with "http://". When +mainloop+ is true
  # the console of that host is entered afterwards.
  # Returns false when the link cannot be parsed or has no host.
  def self.add(link, mainloop=false)
    link = "http://#{ link }" unless link =~ /^https?:\/\//i
    begin
      uri = URI(link)
    rescue URI::InvalidURIError => msg
      # A mistyped link must not terminate the console.
      puts "Error :: #{msg}"
      return false
    end
    host = uri.host
    if host.nil? || host.empty?
      puts "Error :: Kein Host in \"#{link}\" gefunden."
      return false
    end
    print "Search for host \"#{host}\" :: "
    if @@hosts.key?(host)
      puts "OK"
      # add link to host
      print "Add link to \"#{host}\" :: "
      @@hosts[host] << link
      puts "OK"
    else
      puts "Not found"
      print "Save new Host \"#{host}\" :: "
      site = Website.new(host)
      puts "OK"
      puts "Added link to \"#{host}\" :: OK"
      site << link
    end
    @@hosts[host].console if mainloop
  end

  ############---------------------------------------------------------- Object

  # Creates the host +host+ without any links and registers it, replacing
  # an already registered host of the same name.
  def initialize(host)
    @host = host
    @intern = {}
    @extern = {}
    @@hosts[@host] = self
  end

  # Runs the console of this host until "cd .." is entered or the input is
  # closed. +nesting+ is the prompt path of the console that opened this one.
  def console(nesting="")
    path = nesting == "" ? "#{ @host }" : "#{nesting} > #{ @host }"
    loop do
      line = Readline.readline("(#{path}) >> ", true)
      break if line.nil? # readline returns nil on EOF (Ctrl-D)
      input = line.strip
      # The order matters: more specific patterns come before general ones.
      case input
      when /^cd\s+\.\./i then break
      when /^help$/i then helpConsole
      when /^scan\s+extern\s\S+$/i then scanExternHost(input.split[2], path)
      when /^scan\s+extern$/i then scanExtern
      when /^scan$/i then scanIntern
      when /^scan\s+complete$/i then scanCompleteIntern
      when /^scan\s+all$/i then scanAll
      when /^extend$/i then extendIntern
      when /^extend\s+\S+$/i then extendIntern(input.split[1])
      when /^extend\s+\S+\s+\S+$/i then extendIntern(input.split[1], input.split[2])
      when /^show\s+extend$/i then showExtend
      when /^show\s+links?$/i then showLinks
      when /^show\s+words?$/i then showWords
      when /^show\s+source$/i then showPageSource
      when /^show$/i, /^ls$/i then statusIntern
      when /^cd\s+\S+$/i then openHost(input.split[1], path)
      when /^search\s+\S+$/i then searchContent(input.split[1])
      when /^body\s+\S+$/i then searchBody(input.split[1])
      when /^bodylinks$/i then bodylinks
      when /^add\s+intern\s+\S+$/i then addInternLink(input.split[2])
      when /^add\s+\S+$/i then self << input.split[1]
      else helpConsole
      end
    end
  end

  # Prints the command overview of the host console.
  def helpConsole
    HOST_HELP.each { |text| puts text }
  end

  # Lets the user pick one of the scanned internal pages and prints its
  # source line by line (blank lines are left out), with a banner in front
  # of structural tags.
  def showPageSource
    pages = @intern.select { |_link, info| info[:scanned] == true }.keys
    pages.each_with_index { |link, index| puts "#{ index + 1 }\t-> #{ link.strip }" }
    print "Bitte wählen Sie den Link aus: "
    choice = $stdin.gets.to_i # nil (EOF) and non-numeric input both give 0
    # Without this check 0 would become index -1 and show the last page.
    unless (1..pages.length).include?(choice)
      puts "\nError :: Ungültige Auswahl."
      return
    end
    link = pages[choice - 1]
    begin
      puts "#{ link } :: SOURCE"
      source = @intern[link][:response].body.split(/\r?\n/)
      # Walk over every line; the printed number is the zero-based index.
      source.each_with_index do |text, index|
        next unless text =~ /\S/
        banner = source_banner(text)
        puts banner if banner
        puts "#{ index }\t|#{ text.strip }"
      end
      puts
    rescue => msg
      puts "\nError :: #{ msg }"
    end
  end

  # Scans the pending internal links, then the pending external ones.
  def scanAll
    scanIntern
    scanExtern
  end

  # Alternates extending and scanning until every internal link is both
  # scanned and extended. After every second round the user is asked
  # whether to go on, which is also the way out when a link keeps failing.
  def scanCompleteIntern
    round = 0
    extendIntern
    while intern_incomplete?
      extendIntern
      scanIntern
      if round.odd? && !self.class.confirm?("Wollen Sie noch weiter scannen? (ja/nein) ")
        puts "Komplettes scannen abgebrochen."
        break
      end
      round += 1
    end
  end

  # Lists all double-quoted href attributes found in the scanned bodies.
  def bodylinks
    searchBody "href\\s*=\\s*\"[^\"]+\""
  end

  # Adds the path +link+ (relative to the host root) as an internal link.
  def addInternLink(link)
    self << "http://#{ @host }/#{ link }"
  end

  # Searches the raw bodies of all scanned internal pages with the regular
  # expression given as the string +regex+ and prints the matches per page.
  def searchBody(regex)
    pattern = Regexp.new(regex, USER_REGEXP_FLAGS)
    hits = {}
    @intern.each do |link, info|
      next unless info[:scanned] == true && info[:response]
      # Collect the whole match through the block form of scan: the plain
      # form returns arrays of groups once the pattern contains captures.
      info[:response].body.to_s.scan(pattern) do
        (hits[link] ||= []) << Regexp.last_match(0).gsub(/\r?\n/m, " ")
      end
    end
    hits.each do |link, found|
      puts "#{ link }:\n-> #{ found.join("\n-> ") }\n\n\n"
    end
  rescue => msg
    puts "Error :: #{ msg }"
  end

  # Searches the extracted text content of all scanned internal pages with
  # the regular expression given as the string +regex+ and prints every
  # matching content piece per page. Always returns true.
  def searchContent(regex)
    begin
      pattern = Regexp.new(regex, USER_REGEXP_FLAGS)
      hits = {}
      @intern.each do |link, info|
        next unless info[:scanned] == true
        matching = info[:content].select { |piece| piece =~ pattern }
        next if matching.empty?
        hits[link] = matching.collect { |piece| piece.gsub(/\r?\n/m, " ") }
      end
      puts "\n\n\n***************************************\n\n\n"
      hits.each do |link, found|
        puts "#{ link }:\n-> #{ found.join("\n\n-> ") }\n\n\n"
      end
    rescue => msg
      puts "Error :: #{ msg }"
    end
    true
  end

  # Enters the console of another registered host, passing the prompt path
  # +nesting+ on to it.
  def openHost(hostkey, nesting="")
    site = @@hosts[hostkey]
    if site
      site.console(nesting)
    else
      puts "Host \"#{hostkey}\" unbekannt. Evtl sollten Sie den Befehl \"scan extern\" verwenden."
    end
  end

  # Prints a summary of this host: number of content pieces, scan/extend
  # progress of the internal links, scan progress of the external links and
  # the hosts the external links point to.
  def statusIntern
    pages = @intern.values
    scanned = pages.select { |info| info[:scanned] == true }
    # Seed the sum with 0 so that an empty host shows 0 instead of nothing.
    pieces = scanned.inject(0) { |sum, info| sum + info[:content].length }
    extended = pages.count { |info| info[:extend] == true }
    extern_scanned = @extern.values.count { |info| info[:scanned] == true }
    # Links without a host yield nil, which sort could not compare.
    extern_hosts = @extern.values.collect { |info| info[:uri].host }.compact.uniq.sort
    puts
    puts "Hostname | #{ @host }"
    puts "Content | #{ pieces }"
    puts "interne gescannte Links | #{ scanned.length }/#{ pages.length }"
    puts "interne extended Links | #{ extended }/#{ pages.length }"
    puts "externe gescannte Links | #{ extern_scanned }/#{ @extern.length }"
    puts "externe Hosts | #{ extern_hosts.join(", ") }"
  end

  # Fetches every internal link that has not been scanned yet.
  def scanIntern
    pending = @intern.select { |_link, info| info[:scanned] == false }.keys
    pending.each { |link| scan(link) }
  end

  # Registers the links found on scanned internal pages as new internal or
  # external links (without fetching them).
  #
  # filter     - optional regular expression (String) restricting the pages
  #              whose links are taken; pages are only marked as extended
  #              when no filter is given.
  # linkfilter - optional regular expression (String) restricting the links
  #              that are taken.
  #
  # Returns false when one of the filters is not a valid regular expression.
  def extendIntern(filter=nil, linkfilter=nil)
    begin
      # Compile both filters once up front; an invalid one aborts the run
      # instead of being reported per link and then ignored.
      page_pattern = filter && Regexp.new(filter, USER_REGEXP_FLAGS)
      link_pattern = linkfilter && Regexp.new(linkfilter, USER_REGEXP_FLAGS)
    rescue => msg
      puts "Error :: #{ msg }"
      return false
    end
    candidates = []
    @intern.each do |link, info|
      next unless info[:scanned] == true
      next if page_pattern && link !~ page_pattern
      info[:links].each do |found|
        candidates << found if link_pattern.nil? || found =~ link_pattern
      end
      info[:extend] = true if filter.nil?
    end
    candidates.each do |found|
      next if @intern.key?(found) || @extern.key?(found)
      begin
        uri = URI(found)
        if uri.host == @host
          @intern[found] = new_intern_entry(uri)
        else
          @extern[found] = { :uri => uri, :scanned => false }
        end
      rescue
        @intern.delete(found)
        @extern.delete(found)
        puts "Fehlerhafter Link \"#{ found }\" wurde gelöscht."
      end
    end
  end

  # Hands every pending external link over to the host it belongs to.
  def scanExtern
    pending = @extern.select { |_link, info| info[:scanned] == false }.keys
    push_extern(pending)
  end

  # Hands the pending external links of +host+ over to that host and, when
  # called from a console (+nesting+ not empty), opens its console.
  def scanExternHost(host, nesting="")
    pending = @extern.select { |_link, info| info[:scanned] == false && info[:uri].host == host }.keys
    push_extern(pending)
    openHost(host, nesting) if nesting != ""
  end

  # Prints the scanned pages that are not extended yet together with the
  # links found on them.
  def showExtend
    puts "___links_to_extend___"
    @intern.each do |link, info|
      next unless info[:scanned] == true && info[:extend] == false
      puts labelled_link(link, info)
      puts info[:links].join(", ")
      puts
    end
  end

  # Prints all internal links (with title where known) and external links.
  def showLinks
    puts "___interne_links___"
    @intern.keys.sort.each { |link| puts labelled_link(link, @intern[link]) }
    puts
    puts "___externe_links___"
    puts "#{ @extern.keys.sort.join(", ") }"
    puts
  end

  # Prints the words of all scanned pages grouped by how often they occur,
  # most frequent group first.
  def showWords
    counts = Hash.new(0)
    getWords.each { |word| counts[word] += 1 }
    # Group in one pass; words keep the order of their first occurrence.
    by_count = {}
    counts.each { |word, times| (by_count[times] ||= []) << word }
    by_count.keys.sort.reverse.each do |times|
      puts "\n#{ times } mal ::\n#{ by_count[times].join(", ") }\n"
    end
  end

  # Returns all words (Array of String, duplicates included) of the text
  # content of the scanned internal pages, split at whitespace, punctuation
  # and brackets.
  def getWords
    words = []
    @intern.each_value do |info|
      next unless info[:scanned] == true
      info[:content].each do |piece|
        words.concat(piece.split(/[\s,\.\n\{\}\[\]\(\)\?\!;]+/).reject { |word| word == "" })
      end
    end
    words
  end

  # Adds +link+: links of other hosts are passed on to Website.add, unknown
  # links of this host are registered and fetched right away.
  # Returns false when the link cannot be parsed, true otherwise.
  def <<(link)
    begin
      uri = URI(link)
    rescue URI::InvalidURIError => msg
      # A mistyped link must not terminate the console.
      puts "Error :: #{msg}"
      return false
    end
    unless uri.host == @host
      self.class.add link
      return true
    end
    return true if @intern.key?(link)
    @intern[link] = new_intern_entry(uri)
    scan link
    true
  end

  # Fetches the registered internal link +link+ and stores the response,
  # the links found in it, its text content and its title.
  #
  # Returns false when the link is unknown, was handled before, has an
  # unsupported file extension or cannot be fetched; true otherwise (also
  # when the response is not text and is therefore not analyzed).
  def scan(link)
    unless @intern.key?(link)
      puts "Link wurde noch nicht hinzugefügt."
      return false
    end
    entry = @intern[link]
    uri = entry[:uri]
    # Check the file type: a path with an extension outside the supported
    # list is marked as done without being fetched.
    entry[:scanned] = true if uri.path =~ /\.[\w\d]+$/i && uri.path !~ SCANNABLE_FILE
    return false if entry[:scanned]
    # Fetch the content of the link from the web.
    begin
      print "Hole Inhalt von \"#{ link }\" :: "
      entry[:response] = Net::HTTP.get_response(uri)
      puts "OK"
      entry[:scanned] = true
    rescue
      puts "NO"
      return false
    end
    # Only the first content-type header decides whether this is text.
    headers = entry[:response].to_hash
    type_header = headers.keys.find { |name| name =~ /content.type/i }
    if type_header && headers[type_header][0] !~ /text/i
      puts "Dieser Link enthält keinen Text: \"#{ link }\""
      return true
    end
    body = entry[:response].body
    begin
      # Search for links in the body.
      print "Suche Links auf der Seite :: "
      entry[:links] = extract_links(body, uri)
      puts "OK"
    rescue
      puts "NO"
    end
    begin
      # Parse the content.
      print "Suche Inhalt der Seite :: "
      entry[:content] = extract_content(body)
      puts "OK"
    rescue
      puts "NO"
    end
    begin
      # Search for the title.
      print "Suche Titel der Seite :: "
      entry[:title] = body.scan(/<title>[^<>\/]+<\/title>/mi)[0].gsub(/<\/?title>/mi, "")
      puts "OK"
    rescue
      puts "NO"
    end
    true
  end

  private

  # Builds the record stored in @intern for a link that is not fetched yet.
  def new_intern_entry(uri)
    {
      :uri => uri,
      :scanned => false,
      :extend => false,
      :response => nil,
      :title => "",
      :links => [],
      :content => []
    }
  end

  # Reports whether some internal link is still unscanned or unextended.
  def intern_incomplete?
    total = @intern.length
    @intern.values.count { |info| info[:scanned] == true } < total ||
      @intern.values.count { |info| info[:extend] == true } < total
  end

  # Returns the line shown for an internal link: "<title> :: <link>", or a
  # tab in place of the title when no title is known.
  def labelled_link(link, info)
    if info[:title] != ""
      "#{ info[:title] } :: #{ link }"
    else
      "\t:: #{ link }"
    end
  end

  # Passes each of the external +links+ on to its host (via #<<) and marks
  # it as scanned.
  def push_extern(links)
    links.each do |link|
      self << link
      @extern[link][:scanned] = true
    end
  end

  # Returns the banner printed in front of a source line containing a
  # structural tag, or nil for every other line. The first matching tag in
  # the order body, html, head, title, h1..h5 wins.
  def source_banner(text)
    case text
    when /<body[\s>]/i then return "\n\n___BODY_______________________________\n\n"
    when /<html[\s>]/i then return "\n\n___HTML_______________________________\n\n"
    when /<head[\s>]/i then return "\n\n___HEAD_______________________________\n\n"
    end
    plain = text.gsub(/<[^<>]*>/i, "").strip
    return "\n\n___TITLE______#{ plain }________________________\n\n" if text =~ /<title[\s>]/i
    (1..5).each do |level|
      return "\n\n___HEADER_#{ level }___#{ plain }________________________\n\n" if text =~ /<h#{ level }[\s>]/i
    end
    nil
  end

  # Collects the links of +body+: every absolute http(s) URL plus every
  # relative href, resolved against this host and the directory of +uri+.
  # Returns the normalized links without duplicates.
  def extract_links(body, uri)
    links = body.scan(/https?:\/\/[^:\s"'<>#\(\)\[\]\{\},;]+/mi)
    hrefs = body.scan(/href\s*=\s*['"]\/?[^\s:'"<>#\(\)\[\]\{\},;]+/im)
    # Absolute URLs were collected above. The href pattern stops at the
    # colon, so they show up here as 'href="http' / 'href="https'. Filter
    # into a new array rather than deleting during iteration, which skipped
    # elements. NOTE(review): this also drops relative links whose name
    # starts with "http".
    hrefs = hrefs.reject { |href| href =~ /['"]https?/i }
    # Directory of the current page: its path without leading slash, file
    # name and trailing slashes.
    base_dir = uri.path.to_s.gsub(/^\//i, "").gsub(/[^\/\."']+\.[^\/\."']+/i, "").gsub(/\/+$/, "")
    hrefs.each do |href|
      target = href.gsub(/href\s*=\s*['"]/mi, "")
      if target =~ /^\//
        links << "http://#{ @host }#{ target }"
      else
        links << "http://#{ @host }/#{ base_dir }/#{ target }"
      end
    end
    links.collect { |found| normalize_link(found) }.uniq
  end

  # Removes "/dir/../" and "/./" segments and collapses double slashes in
  # everything but the "://" separator.
  def normalize_link(link)
    cleaned = link.gsub(/\/[^\/]+\/\.{2}\//i, "/").gsub(/\/\.\//i, "/")
    cleaned.split(/:\/\//).collect { |part| part.gsub(/\/{2}/, "/") }.join("://")
  end

  # Splits the part of +body+ after the last "<body" at heading markers and
  # returns the pieces with tags removed and whitespace collapsed.
  def extract_content(body)
    text = body.gsub(/^.*<body/mi, "")
    # NOTE(review): the statement this replaces read gsub(" ", " "), a no-op
    # as visible; presumably a non-breaking space was meant. Replacing the
    # HTML entity is an assumption - TODO confirm.
    text = text.gsub("&nbsp;", " ")
    pieces = text.gsub(/h1|h2|h3|h4|h5/mi, "|||").split("|||")
    pieces.collect { |piece| piece.gsub(/<[^<>]*>/mi, "").gsub(/[\s<>]+/mi, " ").strip }
  end
end
# Greet the user and start the main console, but only when this file is
# executed directly; loading it from another script just defines Website.
if __FILE__ == $PROGRAM_NAME
  puts "\n\n\n\twebAnalyzer - Herzlich Willkommen\n\n"
  Website.console
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment