Created
October 4, 2024 23:29
-
-
Save Jdsleppy/533bb3db6097ea7b432d9828312e1c1b to your computer and use it in GitHub Desktop.
Scrape scimagojr.com and make a CSV of top 10 journals in each category
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3
## requirements.txt
# anyio==4.6.0
# beautifulsoup4==4.12.3
# certifi==2024.8.30
# h11==0.14.0
# httpcore==1.0.5
# httpx==0.27.2
# idna==3.10
# sniffio==1.3.1
# soupsieve==2.6
import csv
import dataclasses
import hashlib
import itertools
import pathlib

import httpx
from bs4 import BeautifulSoup, Tag
def request(url: str) -> str:
    """Fetch *url* and return the response body, caching it on disk.

    The cache key is the MD5 digest of the URL (non-cryptographic use);
    a cached body under ``cache/`` is returned without touching the
    network. Raises ``httpx.HTTPStatusError`` on a 4xx/5xx response.
    """
    digest = hashlib.md5(url.encode(), usedforsecurity=False).hexdigest()
    response_file = pathlib.Path("cache", digest)
    if response_file.exists():
        return response_file.read_text()
    # httpx's raise_for_status() returns the response, so this chains.
    response = httpx.get(url).raise_for_status()
    # Create the cache directory on first use — the original crashed on
    # write_text() when ./cache did not already exist.
    response_file.parent.mkdir(parents=True, exist_ok=True)
    response_file.write_text(response.text)
    return response.text
@dataclasses.dataclass
class Category:
    """A scimagojr.com subject category, identified by its numeric code."""

    name: str
    code: int

    def url(self) -> str:
        """Return the CSV-export (out=xls) ranking URL for this category."""
        return f"https://www.scimagojr.com/journalrank.php?category={self.code}&out=xls"

    def filepath(self) -> pathlib.Path:
        """Return the local cache path for url(), keyed by the URL's MD5 digest."""
        digest = hashlib.md5(self.url().encode(), usedforsecurity=False).hexdigest()
        return pathlib.Path("cache") / digest
def main():
    """Scrape scimagojr.com and write the top-10 journals per subject
    category to ``output.csv`` (columns: category, title, ISSN1, ISSN2)."""
    homepage = BeautifulSoup(
        request("https://www.scimagojr.com/journalrank.php"),
        "html.parser",
    )
    # The second "dropdown" div inside the ranking controls holds the
    # subject-category list; each <a> carries its numeric data-code.
    ranking_controls = homepage.find(id="rankingcontrols")
    categories_dropdown: Tag = ranking_controls.find_all("div", class_="dropdown")[1]
    categories_list: Tag = categories_dropdown.find("ul")
    categories_list_items: list[Tag] = categories_list.find_all("li")
    categories_links: list[Tag] = [li.find("a") for li in categories_list_items]
    categories = [
        Category(name=a.text, code=int(a.attrs["data-code"])) for a in categories_links
    ]
    # Code 0 appears to be the "all categories" pseudo-entry; skip it.
    categories = [c for c in categories if c.code != 0]

    # Warm the on-disk cache. Use Category.url() so the cached filename is
    # guaranteed to match filepath() below — the original rebuilt the URL
    # string by hand here, which would break silently if the two drifted.
    for c in categories:
        print(f"Fetching {c.name}")
        request(c.url())

    all_top_tens: list[dict] = []
    for c in categories:
        with c.filepath().open() as f:
            reader = csv.DictReader(f, delimiter=";")
            for journal in itertools.islice(reader, 10):
                # The "Issn" column holds one or two comma-separated ISSNs.
                issns = journal["Issn"].split(", ")
                all_top_tens.append(
                    {
                        "category": c.name,
                        "title": journal["Title"],
                        "ISSN1": issns[0],
                        "ISSN2": issns[1] if len(issns) > 1 else "",
                    }
                )

    # newline="" per the csv-module docs (avoids blank rows on Windows);
    # explicit fieldnames so an empty result still writes a header instead
    # of raising IndexError on all_top_tens[0].
    with pathlib.Path("output.csv").open("w", newline="") as f:
        writer = csv.DictWriter(f, fieldnames=["category", "title", "ISSN1", "ISSN2"])
        writer.writeheader()
        writer.writerows(all_top_tens)


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment