This simple Python script charts a path from any Wikipedia entry to the philosophy page. Like Six Degrees of Kevin Bacon, but for philosophy nerds.
#!/usr/bin/env python3
__author__ = "phi10s"
'''It is hypothesized that if you recursively click the first link
in the main body of any Wikipedia entry, you will eventually reach
the Philosophy entry. This script maps the path from an arbitrary
entry to the Philosophy entry, and notes the number of hops required.
It's like Six Degrees of Kevin Bacon, but for philosophy nerds.
-phi10s
'''
import sys

import requests
from bs4 import BeautifulSoup
query = sys.argv[1]
wiki_base_url = "https://en.wikipedia.org"
initial_url = wiki_base_url + "/wiki/" + query.strip()
loglist = []   # URLs visited so far, used to detect cycles
phil_dist = 0  # pages visited on the main path; hops = phil_dist - 1
removes = 0    # links skipped in order to escape cycles
print(initial_url)

# Recursive, because it's more philosophically interesting than iteration
def crawl(url, linknum):
    global phil_dist
    global removes
    if linknum == 0:
        phil_dist += 1
    response = requests.get(url)
    loglist.append(url)
    soup = BeautifulSoup(response.content, "lxml")
    page_title = soup.select('#firstHeading')[0].text
    if page_title == "Philosophy":
        print("Philosophy!")
        print("\n[*] Distance from %s to philosophy is %i hops!\n" % (query, phil_dist - 1))
        sys.exit(0)
    text = soup.select('#mw-content-text')
    # atags = text[0].select('p a')
    paragraphs = text[0].select('p')
    # Skip short paragraphs until we reach the first substantial one.
    paragraph = paragraphs[0]
    index = 0
    while len(paragraph.text) < 200:
        index += 1
        paragraph = paragraphs[index]
    atags = paragraph.select('a')
    # print(atags[0])
    hrefs = [atag.get('href') for atag in atags]
    '''The clunky and inelegant bit of code below is an attempt to filter out
    links in the etymology section, as this is about the relation of concepts
    in the main body. Wikipedia page HTML is not ideal for parsing this in an
    elegant manner, but there is probably a better way to do it.'''
    links = [href for href in hrefs if href is not None and "/wiki/" in href
             and ":" not in href and "Greek" not in href and "Latin" not in href
             and "English" not in href and "Literal_translation" not in href]
    # print(links[0])
    new_page_href = links[linknum]
    new_page_url = wiki_base_url + new_page_href
    print(new_page_url)
    if new_page_url in loglist:
        # Already-visited page: retry this page with its next link to break the cycle.
        print("[-] Oh no, an infinite loop! Moving to next link.")
        removes += 1
        crawl(url, linknum + 1)
        return
    # print(page_title + "\n| %i" % phil_dist)
    crawl(new_page_url, 0)

crawl(initial_url, 0)
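
A quick usage sketch, assuming the script is saved as wiki_philosophy.py (the filename is hypothetical) and that requests, beautifulsoup4, and lxml are installed:

pip install requests beautifulsoup4 lxml
python3 wiki_philosophy.py Kevin_Bacon

The argument is the article title exactly as it appears in the Wikipedia URL, so multi-word titles use underscores. The script prints each URL it visits until it reaches Philosophy; a page with no qualifying links in its first substantial paragraph will raise an uncaught IndexError.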