A web crawler and graph builder
#!/usr/bin/env python

import subprocess
import urlparse
import urllib2
import socket
import Queue
import threading
import getopt
import sys
import os

# Non-standard modules for html parsing and graph creation
import networkx as nx
import BeautifulSoup

# Needed to plot the graph, otherwise skip
import matplotlib
matplotlib.use('TkAgg')
import matplotlib.pyplot as plt
# Shared state: a work queue of (url, depth) tuples and a list of urls already seen
max_threads = 50
next_url = Queue.Queue()
crawled_urls = []
def check_link(url):
    '''Test that the url is in the clemson.edu domain and is not a pdf file'''
    domain = '.'.join(urlparse.urlparse(url).netloc.split('.')[-2:])
    filetype = urlparse.urlparse(url).path.split('/')[-1].split('.')[-1]
    return domain == 'clemson.edu' and filetype != 'pdf'
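# Examples (hypothetical urls, illustrating the filter above):
#   check_link('http://www.clemson.edu/academics.html')  -> True
#   check_link('http://www.clemson.edu/catalog.pdf')     -> False (pdf file)
#   check_link('http://www.google.com/')                 -> False (wrong domain)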
def get_host(url):
    '''Return the IP address of the host serving the page'''
    return socket.gethostbyname(urlparse.urlparse(url).netloc)
def get_links_from_page(url):
    '''Extract a list of urls from a page.
    Uses a flag to choose between html parsers...more can be implemented
    '''
    global parser_flag
    if parser_flag == 'lynx':
        # lynx -dump lists references as "n. http://..."; grep/awk pull out the urls.
        # Note: interpolating url into a shell command is unsafe with untrusted input.
        res = subprocess.Popen('lynx -dump ' + url + ' | grep http | awk \'{print $2}\' | uniq',
                               shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        (pstdout, pstderr) = res.communicate()
        urllist = pstdout.split("\n")
    elif parser_flag == 'beautifulsoup':
        urllist = []
        try:
            # This may fail due to unicode issues, needs to be checked
            res = urllib2.urlopen(url)
            htmlpage = res.read()
        except Exception:
            return urllist
        try:
            page = BeautifulSoup.BeautifulSoup(htmlpage)
        except Exception:
            return urllist
        refs = page.findAll("a")
        for a in refs:
            try:
                link = a['href']
                if link[:4] == 'http':
                    urllist.append(link)
            except KeyError:
                # anchor tag without an href attribute
                pass
    else:
        print "Do not know how to parse the html !!! Specify a parser_flag"
        urllist = []
    return urllist
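# Example (hypothetical): with parser_flag = 'beautifulsoup',
# get_links_from_page('http://www.clemson.edu') might return a list like
# ['http://www.clemson.edu/academics/', 'http://www.clemson.edu/admissions/', ...].
# Relative links ('/about') are skipped by the link[:4] == 'http' test;
# urlparse.urljoin(url, link) could be used to resolve them instead.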
def find_links(url_tuple, graph):
    '''Crawl to a given depth using a tuple structure to tag urls with their depth'''
    global crawled_urls, next_url, max_depth
    # Debug output: current (url, depth) tuple and number of urls seen so far
    print url_tuple
    print len(crawled_urls)
    url, depth = url_tuple
    if depth < max_depth and check_link(url):
        links = get_links_from_page(url)
        for link in links:
            # These two lines create the graph
            graph.add_node(link)
            graph.add_edge(url, link)
            # If the link has not been crawled yet, queue it with additional depth
            if link not in crawled_urls:
                next_url.put((link, depth + 1))
                crawled_urls.append(link)
    return
class crawler_thread(threading.Thread):
    '''Consumer thread that gets a url from the queue and finds the links in that page'''
    def __init__(self, queue, graph):
        threading.Thread.__init__(self)
        self.to_be_crawled = queue
        self.graph = graph
    def run(self):
        while not self.to_be_crawled.empty():
            find_links(self.to_be_crawled.get(), self.graph)
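# The empty()/get() pair above is racy: a thread can see an empty queue and
# exit while another thread is still about to put() new links, which may
# explain the suspicious speedup noted in main(). A more robust consumer loop
# (a sketch, not the original design) would be:
#
#   def run(self):
#       while True:
#           try:
#               item = self.to_be_crawled.get(timeout=5)
#           except Queue.Empty:
#               break
#           find_links(item, self.graph)
#           self.to_be_crawled.task_done()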
def draw_graph(graph, graph_file_name):
    '''Draw the graph and save the .dot and .png files'''
    nx.draw(graph, with_labels=False)
    nx.write_dot(graph, os.path.join(os.getcwd(), graph_file_name + '.dot'))
    plt.savefig(os.path.join(os.getcwd(), graph_file_name + '.png'))
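# Note: draw_graph is defined here but never called below; main() repeats the
# drawing inline with an explicit layout instead.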
def usage():
    '''Print usage to stdout'''
    print '-r specifies the root url'
    print '-d specifies the depth'
    print '-p specifies the parser'
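# Example invocation (hypothetical file name):
#   python crawler.py -r http://www.clemson.edu -p beautifulsoup -d 2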
def main():
    '''Initiates the queue by putting the root url in it,
    then iterates until the queue is empty.
    A simple threaded version starts crawler_thread to empty the queue.
    Speed up seems limited and suspicious :), to be checked
    '''
    next_url.put((root_url, 0))
    crawled_urls.append(root_url)
    ip_list = []
    g = nx.Graph()
    g.add_node(root_url)
    thread_list = []
    for i in range(max_threads):
        t = crawler_thread(next_url, g)
        t.daemon = True
        t.start()
        thread_list.append(t)
    for t in thread_list:
        t.join()
    for url in crawled_urls:
        try:
            ip_list.append(get_host(url))
        except socket.error:
            # skip hosts that fail to resolve
            pass
    ip_list = list(set(ip_list))
    print "Unique hosts: %s" % len(ip_list)
    fh = open(os.path.join(os.getcwd(), 'targets.list'), 'w')
    for ip in ip_list:
        fh.write(str(ip) + '\n')
    fh.close()
    # Scale node size with degree so the hubs stand out
    nodesize = [g.degree(n) * 10 for n in g]
    pos = nx.spring_layout(g, iterations=20)
    #pos=nx.graphviz_layout(g,prog='neato')
    #pos=nx.spectral_layout(g)
    nx.draw_networkx_nodes(g, pos, node_size=nodesize, node_color='r')
    nx.draw_networkx_edges(g, pos)
    # Save before show(), otherwise the figure may be blank once the window closes
    plt.savefig("/Users/runseb/Desktop/crawl.png")
    nx.write_dot(g, "/Users/runseb/Desktop/crawl.dot")
    plt.show()
if __name__ == '__main__':
    try:
        options, remainder = getopt.getopt(sys.argv[1:], 'r:p:d:')
    except getopt.GetoptError, err:
        print str(err)
        usage()
        sys.exit(2)
    # Set defaults
    root_url = 'http://www.clemson.edu'
    parser_flag = 'beautifulsoup'
    max_depth = 2
    for opt, arg in options:
        if opt == '-r':
            root_url = arg
        elif opt == '-p':
            parser_flag = arg
        elif opt == '-d':
            max_depth = int(arg)
        else:
            usage()
            sys.exit(2)
    sys.exit(main())