Simple Crawler Using Python
"""
crawler.py
web link crawler
Nov 3rd 2012 Saturday night insomnia coding session
Fauzan Erich Emmerling
[email protected]
"""
import re
from urllib2 import urlopen

links = []
root_url = 'http://www.google.com/'  # sample root url only. Use any link you wish


def extract_links(url):
    counter = 0  # counts the links found in this url
    anchor_pattern = '<a'  # only inspect lines that contain an anchor element
    href_pattern = 'href="http:'  # absolute http links are easier to extract
    print 'Crawl links in ' + url
    try:
        html_data = urlopen(url)
        lines_list = html_data.readlines()
        for line in lines_list:
            anchor_element = re.search(anchor_pattern, line)
            if anchor_element:
                attributes = line.split(' ')
                for attribute in attributes:
                    href_link = re.search(href_pattern, attribute)
                    if href_link:
                        # the href value sits between the first pair of double quotes
                        link = attribute.split('"')
                        if link[1] not in links:  # skip links that were already collected
                            links.append(link[1])
                            counter = counter + 1
        print str(counter) + ' links found in ' + url
    except:
        print 'link inaccessible'


extract_links(root_url)  # crawl the base page first

# crawl every link other than the base page; links appended during the loop
# are visited as well, so pages discovered along the way get crawled too
for link in links:
    if link != root_url:
        extract_links(link)

for link in links:  # display all links found in the website
    print link

print str(len(links)) + ' total links found in the website'
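
The script above targets Python 2 (urllib2 and print statements). If you want to try the same idea on Python 3, a rough sketch is below; it is my own simplification rather than a line-for-line port: it swaps urllib2 for urllib.request and replaces the manual line/attribute splitting with a single regex over the whole page, which also picks up https links that the original 'href="http:' pattern would miss.

# Python 3 sketch of the same crawler (adaptation, not part of the original gist)
import re
from urllib.request import urlopen

links = []
root_url = 'http://www.google.com/'  # sample root url only


def extract_links(url):
    print('Crawl links in ' + url)
    counter = 0
    try:
        # read the whole page and decode the bytes before matching
        html = urlopen(url).read().decode('utf-8', errors='ignore')
        # pull out every absolute http(s) href in one pass
        for found in re.findall(r'href="(https?://[^"]+)"', html):
            if found not in links:
                links.append(found)
                counter += 1
        print('%d links found in %s' % (counter, url))
    except Exception:
        print('link inaccessible')


extract_links(root_url)  # crawl the base page first
for link in links:  # then crawl every link collected so far
    if link != root_url:
        extract_links(link)
print('%d total links found in the website' % len(links))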