This simple Python script charts a path from any Wikipedia entry to the philosophy page. Like Six Degrees of Kevin Bacon, but for philosophy nerds.
#!/usr/bin/env python3
__author__ = "phi10s"
'''It is hypothesized that if you recursively click the first link
in the main body of any Wikipedia entry, you will eventually reach
the Philosophy entry. This script maps the path from an arbitrary
entry to the Philosophy entry, and notes the number of hops required.
It's like Six Degrees of Kevin Bacon, but for philosophy nerds.
-phi10s
'''
import sys

import requests
from bs4 import BeautifulSoup
query = sys.argv[1]
wiki_base_url = "https://en.wikipedia.org"
initial_url = wiki_base_url + "/wiki/" + query.strip()
loglist = []   # URLs visited so far, used to detect cycles
phil_dist = 0  # pages visited on the main path; hops = phil_dist - 1
removes = 0    # links skipped in order to escape cycles
print(initial_url)

# Recursive, because it's more philosophically interesting than iteration
def crawl(url, linknum):
    global phil_dist
    global removes
    if linknum == 0:
        phil_dist += 1
    response = requests.get(url)
    loglist.append(url)
    soup = BeautifulSoup(response.content, "lxml")
    page_title = soup.select('#firstHeading')[0].text
    if page_title == "Philosophy":
        print("Philosophy!")
        print("\n[*] Distance from %s to philosophy is %i hops!\n" % (query, phil_dist - 1))
        sys.exit(0)
    text = soup.select('#mw-content-text')
    # atags = text[0].select('p a')
    paragraphs = text[0].select('p')
    # Skip short paragraphs until we reach the first substantial one.
    paragraph = paragraphs[0]
    index = 0
    while len(paragraph.text) < 200:
        index += 1
        paragraph = paragraphs[index]
    atags = paragraph.select('a')
    # print(atags[0])
    hrefs = [atag.get('href') for atag in atags]
    '''The clunky and inelegant bit of code below is an attempt to filter out
    links in the etymology section, as this is about the relation of concepts
    in the main body. Wikipedia page HTML is not ideal for parsing this in an
    elegant manner, but there is probably a better way to do it.'''
    links = [href for href in hrefs if href is not None and "/wiki/" in href
             and ":" not in href and "Greek" not in href and "Latin" not in href
             and "English" not in href and "Literal_translation" not in href]
    # print(links[0])
    new_page_href = links[linknum]
    new_page_url = wiki_base_url + new_page_href
    print(new_page_url)
    if new_page_url in loglist:
        # Already-visited page: retry this page with its next link to break the cycle.
        print("[-] Oh no, an infinite loop! Moving to next link.")
        removes += 1
        crawl(url, linknum + 1)
        return
    # print(page_title + "\n| %i" % phil_dist)
    crawl(new_page_url, 0)

crawl(initial_url, 0)
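
A quick usage sketch, assuming the script is saved as wiki_philosophy.py (the filename is hypothetical) and that requests, beautifulsoup4, and lxml are installed:

pip install requests beautifulsoup4 lxml
python3 wiki_philosophy.py Kevin_Bacon

The argument is the article title exactly as it appears in the Wikipedia URL, so multi-word titles use underscores. The script prints each URL it visits until it reaches Philosophy; a page with no qualifying links in its first substantial paragraph will raise an uncaught IndexError.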