Skip to content

Instantly share code, notes, and snippets.

@kanrourou
Created December 31, 2018 23:12
Show Gist options
  • Save kanrourou/881c9eb7255d6af8061d0b88f9ea6e48 to your computer and use it in GitHub Desktop.
Save kanrourou/881c9eb7255d6af8061d0b88f9ea6e48 to your computer and use it in GitHub Desktop.
import urllib2
def get_all_links(page):
    """Extract all links from the html page.

    Collects the ``href`` value of every ``<a>`` tag, in document order.

    Args:
        page: HTML document as text, or as UTF-8 encoded bytes. ``None`` or
            an empty value yields no links.

    Returns:
        A list of href strings exactly as written in the markup. Relative
        links are not resolved and duplicates are kept.
    """
    # Function-scope import keeps this block self-contained. The fallback
    # covers Python 2, where the parser lives in the ``HTMLParser`` module.
    try:
        from html.parser import HTMLParser
    except ImportError:
        from HTMLParser import HTMLParser

    if not page:
        return []
    # The parser works on text, so decode raw bytes first; undecodable
    # bytes are replaced rather than aborting the whole extraction.
    if isinstance(page, bytes):
        page = page.decode("utf-8", "replace")

    links = []

    class _AnchorCollector(HTMLParser):
        """Parser that records the href of each anchor tag it encounters."""

        def handle_starttag(self, tag, attrs):
            # HTMLParser lower-cases tag and attribute names before calling.
            if tag != "a":
                return
            for name, value in attrs:
                # Skip valueless or empty href attributes.
                if name == "href" and value:
                    links.append(value)

    parser = _AnchorCollector()
    parser.feed(page)
    parser.close()  # flush anything still buffered by the parser
    return links
def get_page(link, timeout=10):
    """Get the html page.

    Args:
        link: URL of the page to download.
        timeout: Seconds to wait on the connection before giving up, so a
            single unresponsive host cannot stall the caller forever.

    Returns:
        The response body as text, decoded as UTF-8 with undecodable bytes
        replaced.

    Raises:
        URLError: (from ``urllib2`` on Python 2, ``urllib.error`` on
            Python 3) when the URL cannot be opened.
        ValueError: when ``link`` is not a well-formed URL.
    """
    # Function-scope imports keep this block self-contained. The fallback
    # lets the same code run on Python 3, where urllib2 was split up.
    try:
        from urllib2 import urlopen
    except ImportError:
        from urllib.request import urlopen
    from contextlib import closing

    # NOTE(review): urlopen also honours non-HTTP schemes such as file://.
    # If links come from untrusted pages, consider restricting the scheme
    # to http/https before fetching.
    # closing() guarantees the connection is released even if read() fails;
    # on Python 2 the response object is not itself a context manager.
    with closing(urlopen(link, timeout=timeout)) as response:
        raw = response.read()
    return raw.decode("utf-8", "replace")
def addTasks(tasks, newTasks):
    """Append every item of ``newTasks`` to ``tasks`` in place.

    Args:
        tasks: List of pending tasks; mutated by this call.
        newTasks: Iterable of tasks to add, appended in iteration order.

    Returns:
        None. The result is the mutation of ``tasks``.
    """
    # list.extend does the same work as an explicit append loop.
    tasks.extend(newTasks)
def crawl(seed):
    """Crawl the pages reachable from ``seed``, fetching each link once.

    Pending links are kept on a stack, so the most recently discovered
    link is fetched next (depth-first order).

    Args:
        seed: Link to start crawling from.

    Returns:
        None.
    """
    tasks = [seed]
    # A set gives O(1) membership checks; a list would make every check
    # linear in the number of pages already crawled.
    visited = set()
    while tasks:
        link = tasks.pop()
        if link in visited:
            # The same link can be queued more than once before it is
            # first fetched; skip the later copies.
            continue
        visited.add(link)
        page = get_page(link)
        links = get_all_links(page)
        # Do not queue links that are already known to be done; this keeps
        # the stack from filling with entries that would only be skipped.
        addTasks(tasks, [found for found in links if found not in visited])
        #do something with the page...
        #maybe store in DB
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment