Skip to content

Instantly share code, notes, and snippets.

@Jdsleppy
Created October 4, 2024 23:29
Show Gist options
  • Save Jdsleppy/533bb3db6097ea7b432d9828312e1c1b to your computer and use it in GitHub Desktop.
Scrape scimagojr.com and make a CSV of top 10 journals in each category
#!/usr/bin/env python3
## requirements.txt
# anyio==4.6.0
# beautifulsoup4==4.12.3
# certifi==2024.8.30
# h11==0.14.0
# httpcore==1.0.5
# httpx==0.27.2
# idna==3.10
# sniffio==1.3.1
# soupsieve==2.6
import csv
import dataclasses
import httpx
import hashlib
import itertools
import pathlib
from bs4 import BeautifulSoup, Tag
def request(url: str) -> str:
    """Fetch *url* and return its body text, caching it on disk.

    Responses are cached under ``cache/<md5(url)>`` so repeated runs do not
    re-hit scimagojr.com. MD5 is used only as a cache key, not for security.

    Raises:
        httpx.HTTPStatusError: if the server returns a 4xx/5xx response.
    """
    url_hash = hashlib.md5(url.encode(), usedforsecurity=False).hexdigest()
    response_file = pathlib.Path("cache", url_hash)
    if response_file.exists():
        return response_file.read_text()
    # httpx's raise_for_status() returns the response, so this chains.
    response = httpx.get(url).raise_for_status()
    # Bug fix: create the cache directory on first run, otherwise
    # write_text() raises FileNotFoundError.
    response_file.parent.mkdir(parents=True, exist_ok=True)
    response_file.write_text(response.text)
    return response.text
@dataclasses.dataclass
class Category:
    """A scimagojr.com subject category identified by its numeric code."""

    # Human-readable category name as shown in the site's dropdown.
    name: str
    # Numeric category code used in journalrank.php query strings.
    code: int

    def url(self) -> str:
        """Return the CSV-export ranking URL for this category."""
        return f"https://www.scimagojr.com/journalrank.php?category={self.code}&out=xls"

    def filepath(self) -> pathlib.Path:
        """Return the on-disk cache path where this category's export lands.

        Mirrors the cache-key scheme used by ``request()``: the MD5 digest
        of the URL, stored under ``cache/``.
        """
        digest = hashlib.md5(self.url().encode(), usedforsecurity=False).hexdigest()
        return pathlib.Path("cache") / digest
def main():
    """Scrape scimagojr.com and write the top-10 journals per category to output.csv."""
    homepage = BeautifulSoup(
        request("https://www.scimagojr.com/journalrank.php"),
        "html.parser",
    )
    # The second "dropdown" div inside the ranking controls holds the
    # subject-category list; each <a> carries its numeric code in data-code.
    ranking_controls = homepage.find(id="rankingcontrols")
    categories_dropdown: Tag = ranking_controls.find_all("div", class_="dropdown")[1]
    categories_list: Tag = categories_dropdown.find("ul")
    categories_list_items: list[Tag] = categories_list.find_all("li")
    categories_links: list[Tag] = [li.find("a") for li in categories_list_items]
    categories = [
        Category(name=a.text, code=int(a.attrs["data-code"])) for a in categories_links
    ]
    # Code 0 is the "all categories" pseudo-entry; skip it.
    categories = [c for c in categories if c.code != 0]

    # Prefetch every category export so it lands in the on-disk cache that
    # Category.filepath() points at. Fix: use c.url() instead of repeating
    # the URL literal, so fetch URL and cache key cannot drift apart.
    for c in categories:
        print(f"Fetching {c.name}")
        request(c.url())

    all_top_tens: list[dict] = []
    for c in categories:
        with c.filepath().open() as f:
            reader = csv.DictReader(f, delimiter=";")
            # The export is pre-sorted by rank; the first 10 rows are the top 10.
            for journal in itertools.islice(reader, 10):
                # "Issn" holds one or two comma-separated ISSNs.
                issns = journal["Issn"].split(", ")
                all_top_tens.append(
                    {
                        "category": c.name,
                        "title": journal["Title"],
                        "ISSN1": issns[0],
                        "ISSN2": issns[1] if len(issns) > 1 else "",
                    }
                )

    # newline="" as required by the csv module docs; fixed fieldnames avoid
    # an IndexError on all_top_tens[0] when no rows were collected.
    with pathlib.Path("output.csv").open("w", newline="") as f:
        writer = csv.DictWriter(f, fieldnames=["category", "title", "ISSN1", "ISSN2"])
        writer.writeheader()
        writer.writerows(all_top_tens)


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment