Created
December 3, 2022 12:44
-
-
Save ibaaj/da199911ec2b32c13d3389d245bb0cf5 to your computer and use it in GitHub Desktop.
parsing mathgenealogy
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import json
import pprint
import re
import time
from urllib.parse import parse_qs, urlsplit

import requests
from bs4 import BeautifulSoup, SoupStrainer
# Module-level crawl state shared by the script below.

# Maps each id found in a scraped link to the name text of that link.
ALLAUTHORS = {}
# (page id, linked id) pairs collected while crawling.
EDGES = []
# Entries recorded once a page has been fetched; used to skip repeat fetches.
ALREADYSCRAPED = []
IDCURRENT = 125567  # starting from Langevin, # https://www.mathgenealogy.org/id.php?id=125567
# Work list of ids whose pages still have to be fetched.
REMAININGIDTOSCRAP = []
def parseIdsAndGetIdsName(idstart):
    """Fetch one mathgenealogy.org page and collect the people it links to.

    Only links inside the first ``<p>`` whose ``style`` attribute matches
    ``text-align: center; line-height: 2.75ex`` are considered (the original
    code calls this paragraph ``textAdvisor`` -- presumably the advisor line;
    confirm against the site markup).

    Args:
        idstart: Id of the page to fetch. It is converted with ``str`` and
            appended to the ``id.php?id=`` URL.

    Returns:
        A dict mapping each linked id (the string value of the link's ``id``
        query parameter) to the link text with whitespace runs collapsed to
        single spaces. Returns an empty dict when the page has no matching
        paragraph. Links without an ``href`` or without an ``id`` parameter
        are skipped instead of discarding the whole result.

    Raises:
        requests.RequestException: If the HTTP request fails or times out.
    """
    page = requests.get(
        "https://www.mathgenealogy.org/id.php?id=" + str(idstart),
        # Without a timeout a stalled connection would block the crawl forever.
        timeout=30,
    )
    soup = BeautifulSoup(page.content, 'html.parser')
    textAdvisor = soup.find('p', style=re.compile(r'text-align: center; line-height: 2.75ex'))
    if textAdvisor is None:
        # No matching paragraph on this page: nothing to report.
        return {}

    idsNames = {}
    for link in textAdvisor.find_all('a', href=True):
        query = parse_qs(urlsplit(link['href']).query)
        if 'id' not in query:
            # Not a person link (no usable id parameter); ignore it.
            continue
        person_id = query['id'][0]
        # str.split() with no argument also splits on non-breaking spaces, so
        # this normalises doubled or exotic spacing inside names.
        name = " ".join(link.get_text().split())
        print("id = " + str(person_id))
        print("name = " + str(name))
        idsNames[person_id] = name
    return idsNames
# Seed the crawl with the people linked from the starting page.
IdsNameStart = parseIdsAndGetIdsName(IDCURRENT)
# Mark the starting page itself as fetched. Ids scraped from links are
# strings, so the integer start id is stored as a string to compare equal.
ALREADYSCRAPED.append(str(IDCURRENT))
ALLAUTHORS.update(IdsNameStart)
REMAININGIDTOSCRAP.extend(IdsNameStart)
# Drain the work list: fetch each pending page once, record the people it
# links to, and queue those people for fetching in turn.
while REMAININGIDTOSCRAP:
    print("len of remaining ids to scrap: " + str(len(REMAININGIDTOSCRAP)))
    popId = REMAININGIDTOSCRAP.pop()
    if popId in ALREADYSCRAPED:
        # Already fetched earlier; an id can be queued before it is visited.
        continue
    IdsNameNew = parseIdsAndGetIdsName(popId)
    ALREADYSCRAPED.append(popId)
    for idscraped in IdsNameNew:
        ALLAUTHORS[idscraped] = IdsNameNew[idscraped]
        if idscraped not in REMAININGIDTOSCRAP:
            REMAININGIDTOSCRAP.append(idscraped)
        if (popId, idscraped) not in EDGES:
            EDGES.append((popId, idscraped))
    # Pause after every request so the site is not hammered.
    # NOTE(review): the original indentation was lost; confirm the sleep was
    # meant to run once per fetched page.
    time.sleep(1)
# Persist the crawl results as JSON: the id -> name mapping and the list of
# (page id, linked id) pairs (tuples are serialised as two-element arrays).
with open("authors.json", "w") as f:
    json.dump(ALLAUTHORS, f)
with open('edges.txt', 'w') as f:
    json.dump(EDGES, f)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment