Created
January 20, 2021 12:42
-
-
Save gullyn/73a607e70970d0b151bac5e7c2c2a7e5 to your computer and use it in GitHub Desktop.
analyze
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import json, requests, re | |
def main():
    """Compute the recursive page length for every entry in countries.json.

    Reads ``countries.json`` from the working directory, and for each entry
    prints its label and recursive page length and appends a
    ``label,length`` line to ``results.txt``.

    Each entry is indexed as ``entry[0]`` (label) and ``entry[1]`` (a string
    containing ``wiki/<page title>``) -- presumably a country name and its
    Wikipedia URL; verify against the data file.
    """
    # Kept from the original script: prints the recursive length of one
    # known page before the batch run starts.
    print(page_length("Ivory_Coast", True))

    # JSON text is UTF-8 by specification; do not depend on the locale default.
    with open("countries.json", "r", encoding="utf-8") as source:
        countries = json.load(source)

    for country in countries:
        # Everything after "wiki/" is the page title used by the API call.
        link = country[1].split("wiki/")[1]
        length = page_length(link, True)
        print(country[0], length)
        # Append and close per entry so finished rows are on disk even if a
        # later request fails.
        with open("results.txt", "a") as results:
            results.write(f"{country[0]},{length}\n")
def page_length(title, recursive=False):
    """Return the length of the rendered HTML of an English Wikipedia page.

    The page is fetched through the MediaWiki ``action=parse`` API and the
    length is ``len()`` of the returned HTML string.

    Args:
        title: Page title as it appears after ``/wiki/`` in a page URL.
        recursive: When true, also add the (non-recursive) length of every
            page returned by ``get_links`` for this page.

    Returns:
        The page's HTML length, plus the linked pages' lengths when
        ``recursive`` is true.

    Raises:
        requests.HTTPError: The API answered with an HTTP error status.
        requests.Timeout: The API did not answer within the timeout.
        KeyError: The JSON response has no ``["parse"]["text"]["*"]`` entry.
    """
    # The title is interpolated verbatim: callers take it from page hrefs,
    # so passing it through ``params=`` could encode it a second time.
    req = requests.get(
        f"https://en.wikipedia.org/w/api.php?action=parse&page={title}&format=json",
        timeout=30,  # without a timeout a stalled connection blocks forever
    )
    # Fail with a clear HTTP error instead of a confusing JSON/Key error.
    req.raise_for_status()
    content = json.loads(req.content)["parse"]["text"]["*"]
    sum_len = len(content)
    if not recursive:
        return sum_len
    # Linked pages are measured one level deep only (recursive stays False).
    for link in get_links(content, title):
        sum_len += page_length(link)
    return sum_len
# A "Main article(s): " label followed by a run of /wiki/ anchors separated
# by ", ", " and " or ", and ". The whole run is captured as one group.
_MAIN_ARTICLES_RE = re.compile(
    r'Main articles?: '
    r'((?:<a href="/wiki/[^"]+"[^>]*>.+?</a>,? ?(?:and)? ?)+)'
)
# Pulls the page title out of each anchor inside a captured run.
_WIKI_HREF_RE = re.compile(r'<a href="/wiki/([^"]+)"')


def get_links(content, title):
    """Return the "Main article(s)" link targets that mention ``title``.

    Scans ``content`` (page HTML) for ``Main article: `` / ``Main articles: ``
    labels and collects the ``/wiki/`` targets of the anchors that directly
    follow each label. Every anchor in the run is collected, including the
    middle ones when three or more are listed.

    Args:
        content: HTML text to scan.
        title: Title of the page being scanned. Only targets containing the
            part of ``title`` before its first space (compared
            case-insensitively) are kept.

    Returns:
        Matching link targets in document order, with any ``#fragment``
        removed. Duplicates are preserved.
    """
    # Same filter key as before: text before the first space, lower-cased.
    key = title.split(" ")[0].lower()
    new_matches = []
    for anchors in _MAIN_ARTICLES_RE.findall(content):
        for target in _WIKI_HREF_RE.findall(anchors):
            if key in target.lower():
                # Drop the section fragment so the target is a page title.
                new_matches.append(target.split("#")[0])
    return new_matches
# Run the batch only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment