Last active
April 27, 2020 10:21
-
-
Save m4lvin/37bf473ee4e662fa031fc267c67464a9 to your computer and use it in GitHub Desktop.
Crawl the Mathematics Genealogy Project backwards, starting with two IDs and stopping when a common ancestor is found.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python | |
''' | |
Crawl the Mathematics Genealogy Project backwards, starting with two IDs and stopping when a common ancestor is found. | |
The output without the lines starting with "[" is a graph in DOT format for graphviz.
Based on a script by filip (2012-07-25) from https://github.com/eakbas/mathgen/ | |
Edited by m4lvin (2019-05-08). | |
''' | |
import re | |
import sys | |
import time | |
import requests | |
import queue | |
# Change these to the two IDs for which you want to find a common ancestor.
# Each ID is the value that gets appended to `prefix` to form a person's page URL.
idA = "258902"
idB = "235210"
# Base URL of a person's page; crawl() appends the ID to it.
prefix = "https://genealogy.math.ndsu.nodak.edu/id.php?id="
def get_page(url):
    """Fetch *url* and return the response body as text.

    This is best-effort: if the request fails (connection error, timeout,
    or any other ``requests`` error) the sentinel string
    "get_page(url) failed" is returned instead of raising, so a single bad
    page does not abort the whole crawl.

    Only ``requests.RequestException`` is caught. A bare ``except`` would
    also swallow ``KeyboardInterrupt`` and ``SystemExit``, making a long
    crawl impossible to stop cleanly with Ctrl-C.
    """
    try:
        # The timeout keeps one unresponsive connection from hanging the
        # crawl forever; requests has no timeout by default.
        response = requests.get(url, timeout=30)
        return response.text
    except requests.RequestException:
        return "get_page(url) failed"
def remove_excess_space(s):
    """Trim *s* and collapse every run of whitespace into a single space."""
    trimmed = s.strip()
    collapsed = re.sub(r'\s+', " ", trimmed)
    return collapsed
def get_name(page):
    """Extract the text that precedes ``</h2>`` in *page*.

    The match is taken from the first line that contains ``</h2>``: it runs
    from the start of that line up to the line's last ``</h2>``. Returns the
    string "NOT FOUND" when *page* contains no ``</h2>`` at all.
    """
    match = re.search(r'(.*)</h2>', page)
    if match is None:
        return "NOT FOUND"
    return match.group(1)
def get_year(page):
    """Extract the first four-digit number directly followed by ``</span>``.

    Returns the four digits as a string, or "NOT FOUND" when *page* has no
    such occurrence.
    """
    match = re.search(r'(\d{4})</span>', page)
    if match is None:
        return "NOT FOUND"
    return match.group(1)
def get_advisor_ids(page):
    """Collect the IDs linked after advisor-style labels in *page*.

    Two label patterns are searched, in this order: labels containing
    "Advisor" and labels containing "otor". For each, the digits of the
    following ``id.php?id=...`` link on the same line are captured.
    Returns one list of ID strings: all "Advisor" hits first, then all
    "otor" hits.
    """
    patterns = (
        r'Advisor.*?:.*?<a href="id\.php\?id=(\d*?)">',
        r'otor.*?:.*?<a href="id\.php\?id=(\d*?)">',
    )
    ids = []
    for pattern in patterns:
        ids.extend(re.findall(pattern, page))
    return ids
def crawl(cache, todo, math_id):
    """Visit the person *math_id* once and queue their advisors.

    cache   -- dict mapping already-visited IDs to "name\\nyear" strings;
               updated in place.
    todo    -- queue that receives the IDs of every advisor found.
    math_id -- ID (string) of the person to visit.

    If *math_id* is already in *cache* nothing is fetched or printed.
    Otherwise the person's page is downloaded, a node line and one edge line
    per advisor are printed, and the advisors are put on *todo*.
    Returns the cached "name\\nyear" string for *math_id*.
    """
    if math_id in cache:
        return cache[math_id]

    page = get_page(prefix + math_id)
    name = remove_excess_space(get_name(page))
    year = remove_excess_space(get_year(page))
    print('{0} "{1} {2}"'.format(math_id, name, year))
    cache[math_id] = name+"\n"+year

    for advisor_id in get_advisor_ids(page):
        print('{0} -> {1};'.format(advisor_id, math_id))
        todo.put(advisor_id)

    # be nice to MathGenealogy servers and wait a second after each query
    time.sleep(1)
    return cache[math_id]
def connected(c1, c2):
    """Report whether the two caches share at least one key.

    When they do, "connected!" and the set of shared keys are printed and
    True is returned; otherwise nothing is printed and False is returned.
    """
    shared = c1.keys() & c2.keys()
    if not shared:
        return False
    print("connected!")
    print(shared)
    return True
if __name__ == '__main__':
    # One cache (visited people) and one work queue per starting person.
    cache1 = {}
    cache2 = {}
    todo1 = queue.Queue()
    todo2 = queue.Queue()
    todo1.put(idA)
    todo2.put(idB)
    while True:
        # Check for a common ancestor first, so a connection made by the
        # most recent crawl step is always reported, even when that step
        # emptied the queues.
        if connected(cache1, cache2):
            break
        # Keep going while EITHER side still has work. One side running out
        # of ancestors does not mean the other side cannot still reach them.
        if todo1.empty() and todo2.empty():
            print("[no common ancestor found]")
            break
        print ("[current status:", todo1.qsize(), todo2.qsize())
        sys.stdout.flush()
        # Alternate between the two sides. Each get() is guarded so it never
        # blocks on an empty queue.
        if not todo1.empty():
            crawl(cache1,todo1,todo1.get())
        if not todo2.empty():
            crawl(cache2,todo2,todo2.get())
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment