kanrourou · December 31, 2018 23:12
diff --git a/really_simple_web_crawler.py b/really_simple_web_crawler.py
 import urllib2

 def get_all_links(page):
     """Extract all links from the HTML page.

     Placeholder: link extraction has not been written yet. A body that
     contains only a comment is not valid Python, so this fails loudly
     instead of leaving the function without a statement.

     Args:
         page: The HTML page to extract links from.

     Raises:
         NotImplementedError: Always, until link extraction is implemented.
     """
     raise NotImplementedError("get_all_links is not implemented yet")
  
 def get_page(link):
     """Get the HTML page for the given link.

     Placeholder: page fetching has not been written yet. A body that
     contains only a comment is not valid Python, so this fails loudly
     instead of leaving the function without a statement.

     Args:
         link: The link whose page should be fetched.

     Raises:
         NotImplementedError: Always, until page fetching is implemented.
     """
     raise NotImplementedError("get_page is not implemented yet")
  

 def addTasks(tasks, newTasks):
     """Append every item of ``newTasks`` to the end of ``tasks`` in place.

     Args:
         tasks: The list of pending tasks; it is mutated.
         newTasks: An iterable of tasks to add, kept in iteration order.
     """
     # list.extend does the same appends as a manual loop in a single call.
     tasks.extend(newTasks)


 def crawl(seed):
     """Crawl the pages reachable from ``seed``, processing each link once.

     Pending links are taken from the end of the task list, so the most
     recently discovered link is processed next.

     Args:
         seed: The link to start crawling from.
     """
     tasks = [seed]
     # A set makes the "already visited?" check constant-time; this assumes
     # links are hashable values such as URL strings.
     visited = set()
     while tasks:
         link = tasks.pop()
         if link in visited:
             continue
         page = get_page(link)
         links = get_all_links(page)
         addTasks(tasks, links)
         # Mark the link only after it was fetched and its links were queued.
         visited.add(link)
         # do something with the page...
         # maybe store in DB
	import urllib2

	def get_all_links(page):
	    """Extract all links from the HTML page.

	    Placeholder: link extraction has not been written yet. A body that
	    contains only a comment is not valid Python, so this fails loudly
	    instead of leaving the function without a statement.

	    Args:
	        page: The HTML page to extract links from.

	    Raises:
	        NotImplementedError: Always, until link extraction is implemented.
	    """
	    raise NotImplementedError("get_all_links is not implemented yet")

	def get_page(link):
	    """Get the HTML page for the given link.

	    Placeholder: page fetching has not been written yet. A body that
	    contains only a comment is not valid Python, so this fails loudly
	    instead of leaving the function without a statement.

	    Args:
	        link: The link whose page should be fetched.

	    Raises:
	        NotImplementedError: Always, until page fetching is implemented.
	    """
	    raise NotImplementedError("get_page is not implemented yet")


	def addTasks(tasks, newTasks):
	    """Append every item of ``newTasks`` to the end of ``tasks`` in place.

	    Args:
	        tasks: The list of pending tasks; it is mutated.
	        newTasks: An iterable of tasks to add, kept in iteration order.
	    """
	    # list.extend does the same appends as a manual loop in a single call.
	    tasks.extend(newTasks)


	def crawl(seed):
	    """Crawl the pages reachable from ``seed``, processing each link once.

	    Pending links are taken from the end of the task list, so the most
	    recently discovered link is processed next.

	    Args:
	        seed: The link to start crawling from.
	    """
	    tasks = [seed]
	    # A set makes the "already visited?" check constant-time; this assumes
	    # links are hashable values such as URL strings.
	    visited = set()
	    while tasks:
	        link = tasks.pop()
	        if link in visited:
	            continue
	        page = get_page(link)
	        links = get_all_links(page)
	        addTasks(tasks, links)
	        # Mark the link only after it was fetched and its links were queued.
	        visited.add(link)
	        # do something with the page...
	        # maybe store in DB