Skip to content

Instantly share code, notes, and snippets.

@dagoof
Created October 19, 2010 22:50
Show Gist options
  • Save dagoof/635315 to your computer and use it in GitHub Desktop.
Save dagoof/635315 to your computer and use it in GitHub Desktop.
import functools, json
from siteparse import IndividualPage
# Small sample adjacency mapping (node -> list of neighbour nodes); it is the
# default graph for Bfs.
GRAPH = {
    'a': ['b', 'c'],
    'b': ['d'],
    'c': ['d', 'a'],
    'd': ['b', 'c'],
}
import urllib2, BeautifulSoup, urlparse
class IndividualPage(object):
    """A single fetched HTML page and the anchors found in it.

    Attributes:
        source: the response object returned by ``urllib2.urlopen``; it is
            closed once the page has been parsed.
        soup: the ``BeautifulSoup`` parse tree built from the response.
        _domain: network location (``netloc``) of the URL reported by the
            response's ``geturl()``.
    """

    def __init__(self, url):
        """Fetch and parse ``url``.

        Any exception raised by ``urllib2.urlopen`` or by the parser
        propagates to the caller.
        """
        self.source = urllib2.urlopen(url)
        try:
            self.soup = BeautifulSoup.BeautifulSoup(self.source)
            self._domain = urlparse.urlsplit(self.source.geturl()).netloc
        finally:
            # Release the connection instead of waiting for garbage
            # collection.  Assumes the BeautifulSoup constructor has already
            # consumed the stream by the time it returns.
            self.source.close()

    def get_domain_specific_links(self, domain=None):
        """Yield the ``href`` of every ``<a>`` tag that points at ``domain``.

        Args:
            domain: network location to match; a falsy value means the
                page's own domain.

        Yields:
            The ``href`` value exactly as written in the page.

        NOTE: only links whose ``href`` carries its own network location can
        match a non-empty domain, so relative links such as ``/about`` are
        not reported.
        """
        # Host names are case-insensitive, so compare in lower case.
        wanted = (domain or self._domain).lower()
        for link in self.soup.findAll('a'):
            href = link.get('href')
            if urlparse.urlsplit(href or '').netloc.lower() == wanted:
                yield href
class Bfs(object):
    """Breadth-first explorer that records the adjacency it discovers.

    The graph is a dict mapping a node to a collection of neighbour nodes.
    ``explore`` repeatedly takes the oldest pending node, asks ``visit_f``
    for its neighbours, stores them in the graph as a set and queues every
    neighbour that has not been seen yet.
    """

    def __init__(self,
                 graph=None,
                 visit_f=lambda x, y: x.get(y),
                 to_visit=None,
                 visited=None):
        """Set up an exploration.

        Args:
            graph: dict of node -> neighbours, updated in place by
                ``explore``.  ``None`` means a private copy of the
                module-level ``GRAPH``.
            visit_f: callable ``(graph, node) -> iterable of neighbours``.
                The default looks the node up in the graph.
            to_visit: list of start nodes.  When empty or ``None`` the first
                key of ``graph`` (if any) is used.
            visited: list that receives the nodes in the order they are
                explored.  ``None`` means a fresh list for this instance.
        """
        if graph is None:
            # Resolved at call time and copied so that exploring never
            # mutates the shared module-level sample graph.
            graph = dict((node, set(edges)) for node, edges in GRAPH.items())
        self._visit_f = visit_f
        self._graph = graph
        # list(...) instead of keys()[:1]: works whether keys() returns a
        # list or a view.
        self._to_visit = to_visit or list(self._graph)[:1]
        # A literal [] default would be shared (and filled) by every
        # instance created without an explicit ``visited`` argument.
        self._visited = [] if visited is None else visited

    def explore(self):
        """Explore every node reachable from the pending nodes.

        Nodes are processed first-in first-out.  Each node is visited exactly
        once; its neighbours are merged into ``self._graph[node]`` (stored as
        a set) and the node is appended to the visited list.
        """
        while self._to_visit:
            # pop(0) takes the oldest entry, which is what makes the
            # traversal breadth-first; pop() would make it depth-first.
            current_node = self._to_visit.pop(0)
            if self._graph.get(current_node) is None:
                # Create the entry before visiting so that the default
                # visit_f finds an empty set rather than None.
                self._graph[current_node] = set()
            self._visited.append(current_node)
            # Visit once and keep the result: visit_f may be expensive
            # (e.g. a network fetch) and a second call could differ.
            neighbours = list(self.visit(current_node))
            known = self._graph[current_node]
            if not isinstance(known, set):
                # Caller-supplied graphs may use lists, which lack update().
                known = set(known)
                self._graph[current_node] = known
            known.update(neighbours)
            for neighbour in neighbours:
                if (neighbour not in self._visited and
                        neighbour not in self._to_visit):
                    self._to_visit.append(neighbour)

    def visit(self, node):
        """Yield the neighbours ``visit_f`` reports for ``node``."""
        for edge in self._visit_f(self._graph, node):
            yield edge
# Crawl outwards from the start page.  The graph starts empty and each node
# is a URL whose neighbours are the same-domain links found on that page.
sitebfs = Bfs(
    {},
    lambda _, y: IndividualPage(y).get_domain_specific_links(),
    ['http://www.battle.net/'],
)
sitebfs.explore()

# Dump the discovered link graph as indented JSON.  Sets are not JSON
# serialisable, so every neighbour collection is turned into a list first.
with open('hahashit.dat', 'w') as fout:
    g = {}
    for k, v in sitebfs._graph.items():
        g[k] = list(v)
    payload = json.dumps(g, indent=4)
    fout.write(payload)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment