A web crawler and graph builder
#!/usr/bin/env python

import subprocess
import urlparse
import urllib2
import socket
import Queue
import threading
import getopt
import sys
import os

# Non-standard modules for html parsing and graph creation
import networkx as nx
import BeautifulSoup

# Needed to plot the graph, otherwise skip
import matplotlib
matplotlib.use('TkAgg')
import matplotlib.pyplot as plt
# Shared state: a work queue of (url, depth) tuples and a list of urls already seen
max_threads = 50
next_url = Queue.Queue()
crawled_urls = []
def check_link(url):
    '''Test that the url is in the clemson.edu domain and is not a pdf file'''
    domain = '.'.join(urlparse.urlparse(url).netloc.split('.')[-2:])
    filetype = urlparse.urlparse(url).path.split('/')[-1].split('.')[-1]
    return domain == 'clemson.edu' and filetype != 'pdf'
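# Examples (hypothetical urls, illustrating the filter above):
#   check_link('http://www.clemson.edu/academics.html')  -> True
#   check_link('http://www.clemson.edu/catalog.pdf')     -> False (pdf file)
#   check_link('http://www.google.com/')                 -> False (wrong domain)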
def get_host(url):
    '''Return the IP address of the host serving the page'''
    return socket.gethostbyname(urlparse.urlparse(url).netloc)
def get_links_from_page(url):
    '''Extract a list of urls from a page.
    Uses a flag to choose between html parsers...more can be implemented
    '''
    global parser_flag
    if parser_flag == 'lynx':
        # lynx -dump lists references as "n. http://..."; grep/awk pull out the urls.
        # Note: interpolating url into a shell command is unsafe with untrusted input.
        res = subprocess.Popen('lynx -dump ' + url + ' | grep http | awk \'{print $2}\' | uniq',
                               shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        (pstdout, pstderr) = res.communicate()
        urllist = pstdout.split("\n")
    elif parser_flag == 'beautifulsoup':
        urllist = []
        try:
            # This may fail due to unicode issues, needs to be checked
            res = urllib2.urlopen(url)
            htmlpage = res.read()
        except Exception:
            return urllist
        try:
            page = BeautifulSoup.BeautifulSoup(htmlpage)
        except Exception:
            return urllist
        refs = page.findAll("a")
        for a in refs:
            try:
                link = a['href']
                if link[:4] == 'http':
                    urllist.append(link)
            except KeyError:
                # anchor tag without an href attribute
                pass
    else:
        print "Do not know how to parse the html !!! Specify a parser_flag"
        urllist = []
    return urllist
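# Example (hypothetical): with parser_flag = 'beautifulsoup',
# get_links_from_page('http://www.clemson.edu') might return a list like
# ['http://www.clemson.edu/academics/', 'http://www.clemson.edu/admissions/', ...].
# Relative links ('/about') are skipped by the link[:4] == 'http' test;
# urlparse.urljoin(url, link) could be used to resolve them instead.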
def find_links(url_tuple, graph):
    '''Crawl to a given depth using a tuple structure to tag urls with their depth'''
    global crawled_urls, next_url, max_depth
    # Debug output: current (url, depth) tuple and number of urls seen so far
    print url_tuple
    print len(crawled_urls)
    url, depth = url_tuple
    if depth < max_depth and check_link(url):
        links = get_links_from_page(url)
        for link in links:
            # These two lines create the graph
            graph.add_node(link)
            graph.add_edge(url, link)
            # If the link has not been crawled yet, queue it with additional depth
            if link not in crawled_urls:
                next_url.put((link, depth + 1))
                crawled_urls.append(link)
    return
class crawler_thread(threading.Thread):
    '''Consumer thread that gets a url from the queue and finds the links in that page'''
    def __init__(self, queue, graph):
        threading.Thread.__init__(self)
        self.to_be_crawled = queue
        self.graph = graph
    def run(self):
        while not self.to_be_crawled.empty():
            find_links(self.to_be_crawled.get(), self.graph)
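# The empty()/get() pair above is racy: a thread can see an empty queue and
# exit while another thread is still about to put() new links, which may
# explain the suspicious speedup noted in main(). A more robust consumer loop
# (a sketch, not the original design) would be:
#
#   def run(self):
#       while True:
#           try:
#               item = self.to_be_crawled.get(timeout=5)
#           except Queue.Empty:
#               break
#           find_links(item, self.graph)
#           self.to_be_crawled.task_done()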
def draw_graph(graph, graph_file_name):
    '''Draw the graph and save the .dot and .png files'''
    nx.draw(graph, with_labels=False)
    nx.write_dot(graph, os.path.join(os.getcwd(), graph_file_name + '.dot'))
    plt.savefig(os.path.join(os.getcwd(), graph_file_name + '.png'))
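# Note: draw_graph is defined here but never called below; main() repeats the
# drawing inline with an explicit layout instead.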
def usage():
    '''Print usage to stdout'''
    print '-r specifies the root url'
    print '-d specifies the depth'
    print '-p specifies the parser'
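# Example invocation (hypothetical file name):
#   python crawler.py -r http://www.clemson.edu -p beautifulsoup -d 2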
def main():
    '''Initiates the queue by putting the root url in it,
    then iterates until the queue is empty.
    A simple threaded version starts crawler_thread to empty the queue.
    Speed up seems limited and suspicious :), to be checked
    '''
    next_url.put((root_url, 0))
    crawled_urls.append(root_url)
    ip_list = []
    g = nx.Graph()
    g.add_node(root_url)
    thread_list = []
    for i in range(max_threads):
        t = crawler_thread(next_url, g)
        t.daemon = True
        t.start()
        thread_list.append(t)
    for t in thread_list:
        t.join()
    for url in crawled_urls:
        try:
            ip_list.append(get_host(url))
        except socket.error:
            # skip hosts that fail to resolve
            pass
    ip_list = list(set(ip_list))
    print "Unique hosts: %s" % len(ip_list)
    fh = open(os.path.join(os.getcwd(), 'targets.list'), 'w')
    for ip in ip_list:
        fh.write(str(ip) + '\n')
    fh.close()
    # Scale node size with degree so the hubs stand out
    nodesize = [g.degree(n) * 10 for n in g]
    pos = nx.spring_layout(g, iterations=20)
    #pos=nx.graphviz_layout(g,prog='neato')
    #pos=nx.spectral_layout(g)
    nx.draw_networkx_nodes(g, pos, node_size=nodesize, node_color='r')
    nx.draw_networkx_edges(g, pos)
    # Save before show(), otherwise the figure may be blank once the window closes
    plt.savefig("/Users/runseb/Desktop/crawl.png")
    nx.write_dot(g, "/Users/runseb/Desktop/crawl.dot")
    plt.show()
if __name__ == '__main__':
    try:
        options, remainder = getopt.getopt(sys.argv[1:], 'r:p:d:')
    except getopt.GetoptError, err:
        print str(err)
        usage()
        sys.exit(2)
    # Set defaults
    root_url = 'http://www.clemson.edu'
    parser_flag = 'beautifulsoup'
    max_depth = 2
    for opt, arg in options:
        if opt == '-r':
            root_url = arg
        elif opt == '-p':
            parser_flag = arg
        elif opt == '-d':
            max_depth = int(arg)
        else:
            usage()
            sys.exit(2)
    sys.exit(main())