Created
November 3, 2012 16:49
-
-
Save femmerling/4007878 to your computer and use it in GitHub Desktop.
Simple Crawler Using Python
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""
crawler.py
Web link crawler.
Nov 3rd 2012, Saturday night insomnia coding session.
Fauzan Erich Emmerling
[email protected]
"""
import re | |
from urllib2 import urlopen | |
# Every distinct link discovered so far, in first-seen order. extract_links()
# appends to this list and the crawl loop reads from it.
links = []
# Page the crawl starts from. Sample value only -- use any URL you wish.
root_url = 'http://www.google.com/'
def extract_links(url):
    """Fetch ``url`` and record every new absolute link found in its anchors.

    Each ``href="http:..."`` or ``href="https:..."`` target that appears
    inside an ``<a ...>`` tag and is not yet in the module-level ``links``
    list is appended to it, so ``links`` stays free of duplicates and keeps
    first-seen order. Progress messages are written to stdout.

    Args:
        url: Address of the page to download and scan.

    Returns:
        None. Results accumulate in the module-level ``links`` list. When the
        page cannot be fetched, 'link inaccessible' is printed and ``links``
        is left unchanged.
    """
    # Capture the quoted href target of an anchor tag. The whole tag is
    # matched (rather than splitting lines on spaces) so the href may follow
    # other attributes or sit on a different line than "<a"; requiring
    # whitespace right before "href" keeps attributes such as data-href out.
    anchor_href_pattern = r'<a\s(?:[^>]*?\s)?href="(https?:[^"]*)"'

    print('Crawl links in ' + url)
    try:
        html_data = urlopen(url)
        try:
            page = html_data.read()
        finally:
            # Release the connection even if reading the body fails.
            html_data.close()
    except Exception:
        # Best-effort crawl: any fetch failure just skips this page. Catching
        # Exception (not a bare except) lets Ctrl-C / SystemExit propagate.
        print('link inaccessible')
        return

    counter = 0  # number of links from this page that were not known before
    seen = set(links)  # O(1) duplicate checks instead of scanning the list
    for link in re.findall(anchor_href_pattern, page):
        if link not in seen:
            seen.add(link)
            links.append(link)
            counter += 1
    print(str(counter) + ' links found in ' + url)
# Seed the shared list with the links found on the base page.
extract_links(root_url)

# Walk ``links`` as a work queue. extract_links() appends newly discovered
# links while this loop is running, so the list is walked by index instead of
# being iterated directly: pages discovered along the way are crawled too.
# NOTE(review): nothing bounds this walk -- it only stops once no crawled page
# yields an unseen link. Confirm that an unbounded crawl is intended.
position = 0
while position < len(links):
    link = links[position]
    position += 1
    if link != root_url:  # the base page was already crawled above
        extract_links(link)

# Report every link collected, followed by the total.
for link in links:
    print(link)
print(str(len(links)) + ' total links found in the website')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment