Skip to content

Instantly share code, notes, and snippets.

@aj07mm
Created December 1, 2015 16:47
Show Gist options
  • Save aj07mm/c9f77084c8645dce1ca0 to your computer and use it in GitHub Desktop.
Save aj07mm/c9f77084c8645dce1ca0 to your computer and use it in GitHub Desktop.
recursive crawling
import re
import mechanize
#from bs4 import BeautifulSoup
class Crawler:
    """Recursively crawl a site, collecting every link nested under the start URL.

    Uses ``mechanize`` to fetch pages; only links whose URL matches
    ``<page_url>(.*)/`` (i.e. URLs one or more path segments below the page
    being crawled) are followed.
    """

    def __init__(self):
        # All distinct URLs discovered so far; doubles as the visited set.
        self.url_list = []
        # Number of pages fetched (printed as a simple progress counter).
        self.loop = 0

    def get_url_regex(self, url):
        """Return the regex used to select links nested under *url*."""
        return url + '(.*)/'

    def get_links(self, url):
        """Fetch *url* and return the URLs of all links matching the regex."""
        br = mechanize.Browser()
        br.open(url)
        return [link.url for link in br.links(url_regex=self.get_url_regex(url))]

    def crawl(self, url):
        """Depth-first crawl from *url*, recording every newly seen link.

        Fixes vs. the original:
        - fetch each page once (``get_links`` was called twice per page);
        - the original appended *all* links to ``url_list`` before testing
          ``link not in self.url_list``, so the test was always false and the
          crawl never actually recursed; now each link is recorded just
          before recursing, which also makes cycles terminate.
        """
        self.loop += 1
        links = self.get_links(url)
        print(self.loop)
        for link in links:
            if link not in self.url_list:
                # Mark as visited *before* recursing so revisits are skipped.
                self.url_list.append(link)
                self.crawl(link)
def main():
    """Crawl the target site and print every URL discovered."""
    crawler = Crawler()
    crawler.crawl("http://www.epocacosmeticos.com.br/")
    print(crawler.url_list)


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment