Created
January 29, 2016 23:31
-
-
Save controversial/6de2f9c22e3fd4865f06 to your computer and use it in GitHub Desktop.
Wikipedia Scraper
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Scrapes Wikipedia. Start with the name of a page; the script follows the first
# 5 links on that page. For each of those links, it follows the first 5 links
# on *that* page, and so on. It will not stray more than 5 pages away from the
# start page. These limits can be adjusted by changing BREADTH and MAXDEPTH.
# The script writes a PNG file of your Wikipedia map.
# REQUIREMENTS: `wikipedia` and `pydot`
import wikipedia as wp | |
import pydot | |
def ascii(inp):
    """Return ``inp`` as a native ``str`` with all non-ASCII characters dropped.

    ``inp`` may be text or a byte string. Byte strings are decoded as UTF-8
    first (undecodable bytes are discarded) so that non-ASCII input no longer
    raises ``UnicodeDecodeError`` on Python 2, and the result is always plain
    text rather than the ``"b'...'"`` repr that ``str(bytes)`` yields on
    Python 3.

    NOTE: the name shadows the Python 3 builtin ``ascii``; it is kept because
    other code in this file calls it by this name.
    """
    if isinstance(inp, bytes):
        # Python 2 ``str`` / Python 3 ``bytes``: turn into text before filtering.
        inp = inp.decode("utf-8", "ignore")
    stripped = inp.encode("ascii", "ignore")
    if not isinstance(stripped, str):
        # Python 3: encode() returned bytes; convert back to text.
        stripped = stripped.decode("ascii")
    return stripped
class WikiScraper:
    """Crawl Wikipedia links outward from a start page and record them as a graph.

    Every explored page is stored in ``map`` (page name -> list of the link
    names that were followed from it), and each parent/child pair is added as
    an edge to ``graph``, an undirected ``pydot.Dot`` graph.
    """

    def __init__(self, startpage, maxbreadth=10):
        """Prepare an empty crawl.

        startpage  -- name of the page the crawl begins at.
        maxbreadth -- number of links, taken from the front of each page's
                      link list, that are followed per page.
        """
        self.map = {}
        self.startpage = startpage
        self.maxbreadth = maxbreadth
        # Replaced by start(); with 0 nothing is explored.
        self.maxdepth = 0
        self.graph = pydot.Dot(graph_type="graph")

    def _printable(self, name):
        """Return ``name`` as a native ``str`` that is safe to concatenate and print."""
        if isinstance(name, str):
            return name
        if isinstance(name, bytes):
            # Python 3 bytes object.
            return name.decode("utf-8", "replace")
        # Python 2 unicode object: emit UTF-8 so printing never hits the
        # implicit ASCII codec.
        return name.encode("utf-8")

    def connect(self, parent, children):
        """Record ``children`` as the links of ``parent`` and add one edge per child."""
        self.map[parent] = children
        for child in children:
            # Node labels are passed through the module-level ascii() helper.
            self.graph.add_edge(pydot.Edge(ascii(parent), ascii(child)))

    def explore(self, pagename, depth):
        """Recursively explore ``pagename`` and the first ``maxbreadth`` links on it.

        pagename -- title of the page to fetch.
        depth    -- depth of this page; start() passes 1 for the start page.

        Returns None. Pages at ``depth >= maxdepth``, pages already present in
        ``map``, disambiguation pages and missing pages are skipped.
        """
        # Stop at the depth limit. ">=" behaves like the previous "==" for every
        # maxdepth >= 1, but also terminates when maxdepth is 0 or negative:
        # depth starts at 1 and only grows, so "==" could never match there and
        # the crawl ran until the recursion limit.
        if depth >= self.maxdepth:
            return
        # Skip pages that have already been explored.
        if pagename in self.map:
            return
        try:
            page = wp.page(pagename)
        except wp.exceptions.DisambiguationError:
            # Disambiguation pages are deliberately skipped.
            return
        except wp.exceptions.PageError:
            # The requested page does not exist.
            print("The page " + self._printable(pagename) + " could not be found")
            return
        print("Exploring " + self._printable(pagename) + " at depth " + str(depth))
        links = page.links[:self.maxbreadth]
        self.connect(pagename, links)
        for link in links:
            self.explore(link, depth + 1)

    def start(self, maxdepth=0):
        """Run the crawl from ``startpage``, exploring pages at depths below ``maxdepth``."""
        self.maxdepth = maxdepth
        self.explore(self.startpage, 1)
if __name__ == "__main__":
    # raw_input only exists on Python 2; fall back to input on Python 3.
    try:
        read_line = raw_input
    except NameError:
        read_line = input
    # Surrounding whitespace is not part of a page title.
    STARTPAGE = read_line("Name of page to start at: ").strip()
    if not STARTPAGE:
        raise SystemExit("No start page given")
    BREADTH = 5   # links followed per page
    MAXDEPTH = 5  # depth limit handed to WikiScraper.start()
    w = WikiScraper(STARTPAGE, BREADTH)
    w.start(MAXDEPTH)
    # Titles such as "AC/DC" contain path separators; replace them so the
    # image is written to the current directory rather than a missing
    # subdirectory.
    OUTFILE = STARTPAGE.replace("/", "_").replace("\\", "_") + ".png"
    w.graph.write_png(OUTFILE)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment