Skip to content

Instantly share code, notes, and snippets.

@addisaden
Created September 8, 2012 13:07
Show Gist options
  • Save addisaden/3674778 to your computer and use it in GitHub Desktop.
Save addisaden/3674778 to your computer and use it in GitHub Desktop.
Ein Skript, das ich vor einem Jahr als Prototyp entworfen habe, um Webseiten zu analysieren
#!/usr/bin/env ruby
# encoding: utf-8
require "net/http"
require "readline"
class Website
############---------------------------------------------------------- CLASS
@@hosts ={} # "www.23g.eu" => WebsiteObj
@@sessionname = nil
# Runs the top-level interactive prompt.
#
# Every line is read through Readline (with history) and dispatched to the
# matching class-level command; unknown input prints the help text.
# The loop ends when the user types "exit" or closes the input stream.
# Returns nil.
def self.console
  loop do
    line = Readline.readline(">> ", true)
    # Readline returns nil at end of input (Ctrl-D); leave the loop instead
    # of crashing on nil.strip.
    break if line.nil?
    input = line.strip
    case input
    when /^exit/i then break
    when /^add\s+\S+$/i then add(input.split[1], true)
    when /^list$/i, /^ls$/i then list
    when /^rename$/i then renameSession
    when /^save$/i then saveSession
    when /^load$/i then loadSession
    when /^help$/i then helpConsole
    when /^reset$/i then resetSession
    when /^cd\s+\S+$/i then openHost(input.split[1])
    when /^remove\s+\S+$/i then removeHost(input.split[1])
    when /^remove\s+r\s+\S+$/i then removeRegexHost(input.split[2])
    else helpConsole
    end
  end
end
# Removes every stored host whose name matches the given pattern
# (compiled case-insensitive and multiline).
# Any error, including "nothing matched", is printed instead of raised.
def self.removeRegexHost(host)
  flags = Regexp::MULTILINE + Regexp::IGNORECASE
  matching = @@hosts.keys.sort.select { |name| Regexp.new(host, flags).match(name) }
  if matching.empty?
    raise NameError, "Keine übereinstimmenden Hosts gefunden."
  end
  matching.each { |name| @@hosts.delete(name) }
rescue => problem
  puts "Error :: #{problem}"
end
# Removes the host stored under exactly this name.
# When no such host exists, an error with a hint to the regex variant is
# printed and nothing is removed.
def self.removeHost(host)
  if @@hosts.key?(host)
    @@hosts.delete(host)
  else
    puts "Error :: #{host} konnte nicht gefunden werden. Versuchen Sie remove r <regularexpression>."
  end
end
# Enters the interactive console of the host stored under hostkey,
# or prints a notice when that host is unknown.
def self.openHost(hostkey)
  unless @@hosts.key?(hostkey)
    puts "Host #{hostkey} wurde nicht gefunden."
    return
  end
  @@hosts[hostkey].console
end
# Clears all hosts and the session name after asking for confirmation,
# optionally saving the current session first.
#
# Returns true when the session was reset, false when the user declined
# or the requested save did not succeed.
def self.resetSession
  # Only an explicit "j" / "ja" counts as consent. The previous unanchored
  # match treated any answer containing a "j" as yes.
  yes = /\Aja?\z/i
  print "Sind Sie sicher, dass Sie die Session komplett zurücksetzen wollen? (ja/nein) "
  unless gets.to_s.strip =~ yes
    puts "Zurücksetzen der Session abgebrochen."
    return false
  end
  print "Wollen Sie die aktuelle Session vorher sichern? (ja/nein) "
  if gets.to_s.strip =~ yes
    return false unless saveSession
  end
  @@hosts = {}
  @@sessionname = nil
  puts "Session wurde erfolgreich zurückgesetzt."
  true
end
# Prints the command overview of the top-level console.
# Lists every command the top-level console dispatches, including "reset",
# which was handled by the console but missing from this overview.
def self.helpConsole
  puts "\n__Hilfe__"
  puts
  puts " Befehlsübersicht:"
  puts "add <link> - Fügt einen Link in die aktuelle Datenbank"
  puts "list - Listet die aktuellen Hosts auf"
  puts "ls - Synonym für list"
  puts "cd <host> - Open <host>"
  puts "remove <host> - Löscht den angegebenen Host"
  puts "remove r <regularexpression> - Löscht die gefundenen Hosts (RegExp)"
  puts "rename - Umbennenung der Session"
  puts "save - Sichern der Session"
  puts "load - Laden einer Session"
  puts "reset - Setzt die Session zurück"
  puts "help - Gibt diesen Text aus"
  puts "exit - Beendet die Console"
end
# Asks for a new session name until a valid one is entered.
#
# A valid name consists only of word characters and is not empty; an empty
# name would produce a file called ".session". The stored name is only
# replaced once a valid one was entered. At end of input the previous
# name is kept. Returns nil.
def self.renameSession
  puts "Aktueller Sessionname: #{ @@sessionname || 'Noch kein Sessionname vorhanden.'}"
  print "Sessionname: "
  loop do
    line = gets
    # End of input: keep the old name instead of crashing on nil.strip.
    return if line.nil?
    candidate = line.strip
    if candidate =~ /\A\w+\z/
      @@sessionname = candidate
      return
    end
    print "Ungültiger Name, bitte nochmal eingeben: "
  end
end
# Serialises all hosts with Marshal into "<sessionname>.session".
#
# Asks for a session name first when none is set, and asks before
# overwriting an existing file (only an explicit "j"/"ja" confirms).
# Returns true on success, false when aborted or when writing failed.
def self.saveSession
  renameSession unless @@sessionname
  path = "#{ @@sessionname }.session"
  if File.exist?(path)
    print "Session vorhanden. Soll diese überschrieben werden? (ja/nein) "
    unless gets.to_s.strip =~ /\Aja?\z/i
      puts "Speicherung der Session abgebrochen."
      return false
    end
  end
  begin
    saveInhalt = Marshal.dump(@@hosts)
    # Marshal output is binary data: write it in binary mode and without
    # the extra newline that puts would append, so no newline translation
    # can corrupt the dump.
    File.open(path, "wb") { |f| f.write(saveInhalt) }
    puts "Speicherung der Session erfolgreich."
  rescue => msg
    puts "Etwas ist bei der Speicherung schief gelaufen.\nERR: #{msg} \n"
    return false
  end
  true
end
# Replaces the current hosts with the contents of a saved session file.
#
# Asks before discarding a non-empty session (only an explicit "j"/"ja"
# confirms), lists the available "*.session" files and loads the chosen one.
# Returns true on success, false when aborted, when the file is missing,
# or when it cannot be read back.
def self.loadSession
  unless @@hosts.empty?
    print "Aktuelle Session ist nicht leer, soll diese überschrieben werden? (ja/nein) "
    unless gets.to_s.strip =~ /\Aja?\z/i
      puts "Laden der Session abgebrochen."
      return false
    end
  end
  # List all saved sessions; strip only the trailing extension so a name
  # that itself contains ".session" is shown intact.
  sessions = Dir["*.session"].collect { |s| s.sub(/\.session\z/i, "") }
  puts sessions.join(", ")
  print "Sessionnamen wählen: "
  sname = gets.to_s.strip
  path = "#{ sname }.session"
  unless File.exist?(path)
    puts "Session nicht vorhanden."
    return false
  end
  begin
    # NOTE(security): Marshal.load can instantiate arbitrary objects.
    # Only load session files that were written by this program.
    loaded = Marshal.load(File.binread(path))
    raise TypeError, "Unerwartetes Format der Session-Datei." unless loaded.is_a?(Hash)
  rescue => msg
    # A truncated or foreign file must not take down the whole console.
    puts "Session konnte nicht geladen werden.\nERR: #{ msg }"
    return false
  end
  @@hosts = loaded
  @@sessionname = sname
  puts "Laden der Session erfolgreich."
  true
end
# Prints the names of all stored hosts, sorted and comma-separated.
def self.list
  sorted_names = @@hosts.keys.sort
  puts sorted_names.join(", ")
end
# Adds a link to the host it belongs to, creating the host record first
# when it is not known yet.
#
# link     - URL string; "http://" is prepended when no http(s) scheme is given.
# mainloop - when true, the host's console is opened afterwards.
#
# Returns false (after printing an error) when the link is not a valid URI
# or contains no host; such input used to raise and end the program.
def self.add(link,mainloop=false)
  link = "http://#{ link }" unless link =~ /^https?:\/\//i
  begin
    uri = URI(link)
  rescue URI::InvalidURIError => msg
    puts "Error :: #{ msg }"
    return false
  end
  host = uri.host
  if host.nil? || host.empty?
    puts "Error :: Kein Host in \"#{ link }\" gefunden."
    return false
  end
  print "Search for host \"#{ host }\" :: "
  if @@hosts.key?(host)
    puts "OK"
    # add link to host
    print "Add link to \"#{ host }\" :: "
    @@hosts[host] << link
    puts "OK"
  else
    puts "Not found"
    print "Save new Host \"#{ host }\" :: "
    w = Website.new host
    puts "OK"
    puts "Added link to \"#{ host }\" :: OK"
    w << link
  end
  @@hosts[host].console if mainloop
end
############---------------------------------------------------------- Object
# Creates the record for one host and registers it in the class-wide host
# table under its host name.
#
# @intern and @extern start as empty hashes; other methods of this class
# key them by link string.
def initialize(host)
  @host = host
  @intern, @extern = {}, {}
  @@hosts[@host] = self
end
# Runs the interactive prompt for this host.
#
# nesting - prompt prefix of the console this one was opened from; the own
#           host name is appended (separated by " > " when non-empty).
#
# Input is dispatched to the matching host command; unknown input prints the
# help text. The loop ends on "cd .." or when the input stream is closed.
# Returns nil.
def console(nesting="")
  nesting += " > " if nesting != ""
  nesting += "#{ @host }"
  loop do
    line = Readline.readline("(#{ nesting }) >> ", true)
    # Readline returns nil at end of input (Ctrl-D); leave this console
    # instead of crashing on nil.strip.
    break if line.nil?
    input = line.strip
    case input
    when /^cd\s+\.\./i then break
    when /^help$/i then helpConsole
    when /^scan\s+extern\s\S+$/i then scanExternHost(input.split[2], nesting)
    when /^scan\s+extern$/i then scanExtern
    when /^scan$/i then scanIntern
    when /^scan\s+complete$/i then scanCompleteIntern
    when /^scan\s+all$/i then scanAll
    when /^extend$/i then extendIntern
    when /^extend\s+\S+$/i then extendIntern(input.split[1])
    when /^extend\s+\S+\s+\S+$/i then extendIntern(input.split[1], input.split[2])
    when /^show\s+extend$/i then showExtend
    when /^show\s+links?$/i then showLinks
    when /^show\s+words?$/i then showWords
    when /^show\s+source$/i then showPageSource
    when /^show$/i then statusIntern
    when /^ls$/i then statusIntern
    when /^cd\s+\S+$/i then openHost(input.split[1], nesting)
    when /^search\s+\S+$/i then searchContent(input.split[1])
    when /^body\s+\S+$/i then searchBody(input.split[1])
    when /^bodylinks$/i then bodylinks
    when /^add\s+intern\s+\S+$/i then addInternLink(input.split[2])
    when /^add\s+\S+$/i then self << (input.split[1])
    else helpConsole
    end
  end
end
# Prints the command overview of the per-host console, one line per command.
def helpConsole
  overview = [
    "\n__Hilfe__Host__",
    "",
    " Befehlsübersicht:",
    "search <regexp> - search with regex",
    "body <regexp> - search bodys with regex",
    "bodylinks - makro zum auflisten aller href-Links",
    "cd .. - Geht wieder zum Hauptmenu",
    "cd <host> - Öffne Host",
    "show - Zeigt status des Hosts",
    "ls - Synonym für show",
    "show extend - Zeigt alle Links die mit extend aktiviert werden können",
    "show source - Seite auswählen und Quelltext anzeigen lassen",
    "show links - Zeigt die Links an",
    "show words - Zählt die Wörter und bildet ein Ranking",
    "scan - Scannt interne Links",
    "scan extern - Scannt externe Links",
    "scan extern <host> - Scannt externe Links von angegebenen Host",
    "scan complete - Scannt den kompletten internen Bereich",
    "scan all - Scannt interne und externe links",
    "extend - Pusht Links von gescannten Links in die Datenbank",
    "extend <f> - (regexp) wie extend, f => filter für links die extended werden sollen",
    "extend <f> <df> - (regexp) wie extend, f => wie oben, df => filter für links die extended werden",
    "add <link> - Einen Link hinzufügen",
    "add intern <link> - Einen internen Link hinzufügen -> Bsp. http://xyz.net/new : add intern new",
    "help - Gibt diesen Text aus"
  ]
  overview.each { |entry| puts entry }
  nil
end
# Lets the user pick one of the scanned internal links and prints its
# source with line numbers, inserting banner lines before structural tags
# (html, head, body, title, h1-h5).
#
# Fixes over the previous version:
# * the last source line is no longer dropped,
# * an answer outside the list (0, garbage, too large) is rejected; it used
#   to become index -1 and silently show the last link.
def showPageSource
  k = @intern.select { |_, v| v[:scanned] == true }.keys
  k.each_with_index { |l, i| puts "#{ i+1 }\t-> #{ l.strip }" }
  print "Bitte wählen Sie den Link aus: "
  auswahl = gets.to_i - 1
  begin
    unless (0...k.length).include?(auswahl)
      raise IndexError, "Ungültige Auswahl."
    end
    s = k[auswahl]
    puts "#{ s } :: SOURCE"
    soup = @intern[s][:response].body.split(/\r?\n/)
    soup.each_with_index { |line, i|
      # blank lines are skipped but still counted in the line number
      next unless line =~ /\S/
      text = line.gsub(/<[^<>]*>/i, "").strip
      case line
      when /<body[\s>]/i then puts "\n\n___BODY_______________________________\n\n"
      when /<html[\s>]/i then puts "\n\n___HTML_______________________________\n\n"
      when /<head[\s>]/i then puts "\n\n___HEAD_______________________________\n\n"
      when /<title[\s>]/i then puts "\n\n___TITLE______#{ text }________________________\n\n"
      else
        # lowest heading level present on the line wins, as before
        level = (1..5).find { |n| line =~ /<h#{ n }[\s>]/i }
        puts "\n\n___HEADER_#{ level }___#{ text }________________________\n\n" if level
      end
      puts "#{ i }\t|#{ line.strip }"
    }
    puts
  rescue => msg
    puts "\nError :: #{ msg }"
  end
end
# Scans all pending internal links, then pushes all pending external links.
# Returns whatever scanExtern returns.
def scanAll
  scanIntern()
  scanExtern()
end
# Alternates extendIntern and scanIntern until every internal link is both
# scanned and extended. After every second round the user is asked whether
# to go on; any answer not matching /ja?/i stops the loop.
def scanCompleteIntern
  rounds = 0
  extendIntern
  unfinished = lambda {
    total = @intern.keys.length
    scanned_n = @intern.count { |_, rec| rec[:scanned] == true }
    extended_n = @intern.count { |_, rec| rec[:extend] == true }
    scanned_n < total || extended_n < total
  }
  while unfinished.call
    extendIntern
    scanIntern
    if rounds.odd?
      print "Wollen Sie noch weiter scannen? (ja/nein) "
      unless gets =~ /ja?/i
        puts "Komplettes scannen abgebrochen."
        break
      end
    end
    rounds += 1
  end
end
# Shortcut: searches the fetched page bodies for double-quoted href attributes.
def bodylinks
  href_pattern = 'href\s*=\s*"[^"]+"'
  searchBody(href_pattern)
end
# Adds a path of this host as a new internal link.
#
# link - path relative to the host root, e.g. "new" for http://<host>/new.
#        Leading slashes are stripped so "/new" and "new" produce the same
#        link instead of "http://<host>//new".
def addInternLink(link)
  self << "http://#{ @host }/#{ link.sub(/\A\/+/, "") }"
end
# Searches the raw response bodies of all scanned internal links.
#
# regex - pattern source; compiled case-insensitive and multiline.
#
# Prints every complete match per link (line breaks replaced by spaces).
# Patterns containing capture groups work too: the whole match is reported,
# where previously the grouped result made the search fail.
# Errors such as an invalid pattern are printed, not raised.
def searchBody(regex)
  begin
    r = Regexp.new(regex, Regexp::MULTILINE+Regexp::IGNORECASE)
    allBody = {}
    @intern.each { |k, v|
      next unless v[:scanned] == true && v[:response]
      hits = []
      # With a block, scan sets the last match for each hit, which gives the
      # full matched text regardless of groups. to_s guards a missing body.
      v[:response].body.to_s.scan(r) {
        hits << Regexp.last_match[0].gsub(/\r?\n/m, " ")
      }
      allBody[k] = hits unless hits.empty?
    }
    allBody.each { |k, v|
      puts "#{ k }:\n-> #{ v.join("\n-> ") }\n\n\n"
    }
  rescue => msg
    puts "Error :: #{ msg }"
  end
end
# Searches the extracted text chunks of all scanned internal links.
#
# regex - pattern source; compiled case-insensitive and multiline.
#
# Prints a separator and then, per link, every chunk that matches (line
# breaks replaced by spaces). Errors are printed. Always returns true.
def searchContent(regex)
  begin
    pattern = Regexp.new(regex, Regexp::MULTILINE+Regexp::IGNORECASE)
    hits = {}
    @intern.each do |link, record|
      next unless record[:scanned] == true
      found = record[:content].select { |chunk| chunk =~ pattern }
      next if found.empty?
      hits[link] = found.map { |chunk| chunk.gsub(/\r?\n/m, " ") }
    end
    puts "\n\n\n***************************************\n\n\n"
    hits.each do |link, chunks|
      puts "#{ link }:\n-> #{ chunks.join("\n\n-> ") }\n\n\n"
    end
  rescue => msg
    puts "Error :: #{ msg }"
  end
  true
end
# Opens the console of another known host, passing on the current prompt
# prefix; prints a hint when the host is not known.
def openHost(hostkey, nesting="")
  unless @@hosts.key?(hostkey)
    puts "Host \"#{hostkey}\" unbekannt. Evtl sollten Sie den Befehl \"scan extern\" verwenden."
    return
  end
  @@hosts[hostkey].console(nesting)
end
# Prints a summary for this host: host name, number of extracted content
# chunks, scanned/extended counts of internal links, scanned count of
# external links and the distinct external host names.
#
# The content count is 0 when nothing has been scanned yet (it used to be
# printed as an empty value).
def statusIntern
  scanned_pages = @intern.values.select { |v| v[:scanned] == true }
  # inject with an initial value yields 0 for an empty list instead of nil
  content_total = scanned_pages.inject(0) { |sum, v| sum + v[:content].length }
  extended_n = @intern.values.count { |v| v[:extend] == true }
  extern_scanned_n = @extern.values.count { |v| v[:scanned] == true }
  extern_hosts = @extern.keys.collect { |k| URI(k).host }.uniq.sort
  puts
  puts "Hostname | #{ @host }"
  puts "Content | #{ content_total }"
  puts "interne gescannte Links | #{ scanned_pages.length }/#{ @intern.keys.length }"
  puts "interne extended Links | #{ extended_n }/#{ @intern.keys.length }"
  puts "externe gescannte Links | #{ extern_scanned_n }/#{ @extern.keys.length }"
  puts "externe Hosts | #{ extern_hosts.join(", ") }"
end
# Scans every internal link whose :scanned flag is exactly false.
# Returns the list of links that were handed to scan.
def scanIntern
  pending = @intern.keys.select { |link| @intern[link][:scanned] == false }
  pending.each { |link| scan(link) }
end
# Registers the links found on scanned internal pages as new internal or
# external links of this host.
#
# filter     - optional pattern; only scanned pages whose link matches it
#              are considered. Pages are marked as extended only when no
#              filter is given.
# linkfilter - optional pattern; only found links matching it are added.
#
# Both patterns are compiled once up front (case-insensitive, multiline).
# An invalid pattern prints an error and returns false without changing
# anything; an invalid linkfilter used to be reported once per link and
# then ignored, so every link was added. Otherwise returns the list of
# candidate links.
def extendIntern(filter=nil, linkfilter=nil)
  flags = Regexp::MULTILINE+Regexp::IGNORECASE
  begin
    page_re = filter.nil? ? nil : Regexp.new(filter, flags)
    link_re = linkfilter.nil? ? nil : Regexp.new(linkfilter, flags)
  rescue => msg
    puts "Error :: #{ msg }"
    return false
  end
  scanned = @intern.select { |k, v| v[:scanned] == true && (page_re.nil? || k =~ page_re) }
  toExtend = []
  scanned.each { |_, v|
    v[:links].each { |l| toExtend << l if link_re.nil? || l =~ link_re }
    v[:extend] = true if filter.nil?
  }
  toExtend.each { |l|
    next if @intern.key?(l) || @extern.key?(l)
    begin
      uri = URI(l)
      if uri.host == @host
        @intern[l] = {
          :uri => uri,
          :scanned => false,
          :extend => false,
          :response => nil,
          :title => "",
          :links => [],
          :content => []
        }
      else
        @extern[l] = {
          :uri => uri,
          :scanned => false
        }
      end
    rescue
      # URI() rejected the link, so nothing was stored for it.
      puts "Fehlerhafter Link \"#{ l }\" wurde gelöscht."
    end
  }
end
# Pushes every external link whose :scanned flag is exactly false through
# << and marks it as scanned afterwards.
# Returns the list of links that were processed.
def scanExtern
  pending = @extern.keys.select { |link| @extern[link][:scanned] == false }
  pending.each do |link|
    self << link
    @extern[link][:scanned] = true
  end
end
# Pushes the not-yet-scanned external links that belong to the given host
# through << and marks them as scanned. When a prompt prefix is given, the
# console of that host is opened afterwards.
def scanExternHost(host, nesting="")
  pending = @extern.keys.select do |link|
    entry = @extern[link]
    entry[:scanned] == false && entry[:uri].host == host
  end
  pending.each do |link|
    self << link
    @extern[link][:scanned] = true
  end
  openHost(host, nesting) unless nesting == ""
end
# Lists every internal link that is scanned but not yet extended, followed
# by the links found on that page and a blank line.
def showExtend
  puts "___links_to_extend___"
  waiting = @intern.select { |_, rec| rec[:scanned] == true and rec[:extend] == false }
  waiting.each do |link, rec|
    heading = rec[:title] != "" ? "#{ rec[:title] } :: #{ link }" : "\t:: #{ link }"
    puts heading
    puts rec[:links].join(", ")
    puts
  end
end
# Prints all internal links (sorted, with their page title when one is
# known) followed by the sorted, comma-separated external links.
def showLinks
  puts "___interne_links___"
  @intern.keys.sort.each do |link|
    title = @intern[link][:title]
    puts(title != "" ? "#{ title } :: #{ link }" : "\t:: #{ link }")
  end
  puts
  puts "___externe_links___"
  puts @extern.keys.sort.join(", ")
  puts
end
# Counts how often each word from getWords occurs and prints the words
# grouped by frequency, most frequent group first. Within a group the
# words keep the order of their first appearance.
def showWords
  counts = Hash.new(0)
  getWords.each { |word| counts[word] += 1 }
  by_count = {}
  counts.each { |word, n| (by_count[n] ||= []) << word }
  by_count.keys.sort.reverse.each do |n|
    puts "\n#{ n } mal ::\n#{ by_count[n].join(", ") }\n"
  end
end
# Collects all words from the content chunks of scanned internal links.
# Chunks are split on whitespace and common punctuation/brackets; empty
# fragments are dropped. Returns a flat array in page/chunk order.
def getWords
  separators = /[\s,\.\n\{\}\[\]\(\)\?\!;]+/mi
  scanned = @intern.values.select { |rec| rec[:scanned] == true }
  chunks = scanned.flat_map { |rec| rec[:content] }
  chunks.flat_map { |chunk| chunk.split(separators) }.reject { |word| word == "" }
end
# Adds a link to this host.
#
# Links that belong to another host are passed on to the class-level add.
# A link of this host is stored as a fresh, unscanned record and scanned
# right away, unless it is already known.
#
# Returns true, or false (after printing the error) when the link is not a
# valid URI; such input used to raise and end the program.
def <<(link)
  begin
    uri = URI(link)
  rescue URI::InvalidURIError => msg
    puts "Error :: #{ msg }"
    return false
  end
  unless uri.host == @host
    self.class.add link
    return true
  end
  return true if @intern.key?(link)
  @intern[link] = {
    :uri => uri,
    :scanned => false,
    :extend => false,
    :response => nil,
    :title => "",
    :links => [],
    :content => []
  }
  scan link
  true
end
# Fetches one internal link and fills its record with the response, the
# links found on the page, the extracted text chunks and the page title.
#
# Returns false when the link is unknown, was already scanned, has a file
# extension outside the accepted list, or could not be fetched; true
# otherwise (also when the response is not text and parsing is skipped).
#
# Fix: absolute hrefs used to be deleted from the href list while that list
# was being iterated, so some survived and were turned into bogus internal
# links. They are now filtered with reject.
def scan(link)
  unless @intern.key?(link)
    puts "Link wurde noch nicht hinzugefügt."
    return false
  end
  toScan = @intern[link]
  uri = toScan[:uri]
  # Paths with a file extension are only fetched for these extensions;
  # everything else is marked as scanned without a request.
  if uri.path =~ /\.[\w\d]+$/i && uri.path !~ /\.(html|htm|txt|xml|rss|php|do)$/i
    toScan[:scanned] = true
  end
  return false if toScan[:scanned]
  # Fetch the content of the link.
  begin
    print "Hole Inhalt von \"#{ link }\" :: "
    toScan[:response] = Net::HTTP.get_response(uri)
    puts "OK"
    toScan[:scanned] = true
  rescue
    puts "NO"
    return false
  end
  # Only the first content-type header decides whether the page is parsed.
  headers = toScan[:response].to_hash
  type_key = headers.keys.find { |c| c =~ /content.type/i }
  if type_key && headers[type_key][0] !~ /text/i
    puts "Dieser Link enthält keinen Text: \"#{ link }\""
    return true
  end
  begin
    # Find links in the body.
    print "Suche Links auf der Seite :: "
    body = toScan[:response].body
    toScan[:links] = body.scan(/https?:\/\/[^:\s"'<>#\(\)\[\]\{\},;]+/mi)
    # href values; the character class stops at ":", so an absolute URL shows
    # up here as just its scheme (e.g. href="http).
    hrefs = body.scan(/href\s*=\s*['"]\/?[^\s:'"<>#\(\)\[\]\{\},;]+/im)
    # Absolute URLs are already covered by the scan above; drop them here.
    hrefs = hrefs.reject { |l| l =~ /['"]https?:\/\//mi or l =~ /['"]https?/ }
    # Directory part of the current path, used as base for relative hrefs.
    dir = uri.path.gsub(/^\//i, "").gsub(/[^\/\."']+\.[^\/\."']+/i, "").gsub(/\/+$/, "")
    hrefs.each { |il|
      sl = il.gsub(/href\s*=\s*['"]/mi, "")
      if sl =~ /^\//
        toScan[:links] << "http://#{ @host }#{ sl }"
      else
        toScan[:links] << "http://#{ @host }/#{ dir }/#{ sl }"
      end
    }
    # Collapse "/x/../", "/./" and doubled slashes (outside the scheme).
    normalized = toScan[:links].collect { |l|
      l.gsub(/\/[^\/]+\/\.{2}\//i, "/").gsub(/\/\.\//i, "/").split(/:\/\//).collect { |d| d.gsub(/\/{2}/, "/") }.join("://")
    }
    toScan[:links] = normalized.uniq
    puts "OK"
  rescue
    puts "NO"
  end
  begin
    # Parse the content: cut everything up to <body, split at heading tag
    # names and strip the remaining markup from each chunk.
    print "Suche Inhalt der Seite :: "
    toScan[:content] = toScan[:response].body.gsub(/^.*<body/mi, "").gsub("&nbsp;", " ").gsub(/h1|h2|h3|h4|h5/mi, "|||").split("|||").collect { |c| c.gsub(/<[^<>]*>/mi, "").gsub(/[\s<>]+/mi, " ").strip }
    puts "OK"
  rescue
    puts "NO"
  end
  begin
    # Find the title.
    print "Suche Titel der Seite :: "
    toScan[:title] = toScan[:response].body.scan(/<title>[^<>\/]+<\/title>/mi)[0].gsub(/<\/?title>/mi, "")
    puts "OK"
  rescue
    puts "NO"
  end
  true
end
end
# Script entry point: print the welcome banner, then hand control to the
# class-level interactive console.
banner = "\n\n\n\twebAnalyzer - Herzlich Willkommen\n\n"
puts banner
Website.console
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment