Created
October 4, 2024 23:29
-
-
Save Jdsleppy/533bb3db6097ea7b432d9828312e1c1b to your computer and use it in GitHub Desktop.
Scrape scimagojr.com and make a CSV of top 10 journals in each category
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3
## requirements.txt
# anyio==4.6.0
# beautifulsoup4==4.12.3
# certifi==2024.8.30
# h11==0.14.0
# httpcore==1.0.5
# httpx==0.27.2
# idna==3.10
# sniffio==1.3.1
# soupsieve==2.6
import csv
import dataclasses
import hashlib
import itertools
import pathlib

import httpx
from bs4 import BeautifulSoup, Tag
def request(url: str) -> str:
    """Fetch *url* and return the response body, caching it on disk.

    The cache key is the MD5 digest of the URL (non-cryptographic use);
    a cached body under ``cache/`` is returned without touching the
    network. Raises ``httpx.HTTPStatusError`` on a 4xx/5xx response.
    """
    digest = hashlib.md5(url.encode(), usedforsecurity=False).hexdigest()
    response_file = pathlib.Path("cache", digest)
    if response_file.exists():
        return response_file.read_text()
    # httpx's raise_for_status() returns the response, so this chains.
    response = httpx.get(url).raise_for_status()
    # Create the cache directory on first use — the original crashed on
    # write_text() when ./cache did not already exist.
    response_file.parent.mkdir(parents=True, exist_ok=True)
    response_file.write_text(response.text)
    return response.text
@dataclasses.dataclass
class Category:
    """A scimagojr.com subject category, identified by its numeric code."""

    name: str
    code: int

    def url(self) -> str:
        """Return the CSV-export (out=xls) ranking URL for this category."""
        return f"https://www.scimagojr.com/journalrank.php?category={self.code}&out=xls"

    def filepath(self) -> pathlib.Path:
        """Return the local cache path for url(), keyed by the URL's MD5 digest."""
        digest = hashlib.md5(self.url().encode(), usedforsecurity=False).hexdigest()
        return pathlib.Path("cache") / digest
def main():
    """Scrape scimagojr.com and write the top-10 journals per subject
    category to ``output.csv`` (columns: category, title, ISSN1, ISSN2)."""
    homepage = BeautifulSoup(
        request("https://www.scimagojr.com/journalrank.php"),
        "html.parser",
    )
    # The second "dropdown" div inside the ranking controls holds the
    # subject-category list; each <a> carries its numeric data-code.
    ranking_controls = homepage.find(id="rankingcontrols")
    categories_dropdown: Tag = ranking_controls.find_all("div", class_="dropdown")[1]
    categories_list: Tag = categories_dropdown.find("ul")
    categories_list_items: list[Tag] = categories_list.find_all("li")
    categories_links: list[Tag] = [li.find("a") for li in categories_list_items]
    categories = [
        Category(name=a.text, code=int(a.attrs["data-code"])) for a in categories_links
    ]
    # Code 0 appears to be the "all categories" pseudo-entry; skip it.
    categories = [c for c in categories if c.code != 0]

    # Warm the on-disk cache. Use Category.url() so the cached filename is
    # guaranteed to match filepath() below — the original rebuilt the URL
    # string by hand here, which would break silently if the two drifted.
    for c in categories:
        print(f"Fetching {c.name}")
        request(c.url())

    all_top_tens: list[dict] = []
    for c in categories:
        with c.filepath().open() as f:
            reader = csv.DictReader(f, delimiter=";")
            for journal in itertools.islice(reader, 10):
                # The "Issn" column holds one or two comma-separated ISSNs.
                issns = journal["Issn"].split(", ")
                all_top_tens.append(
                    {
                        "category": c.name,
                        "title": journal["Title"],
                        "ISSN1": issns[0],
                        "ISSN2": issns[1] if len(issns) > 1 else "",
                    }
                )

    # newline="" per the csv-module docs (avoids blank rows on Windows);
    # explicit fieldnames so an empty result still writes a header instead
    # of raising IndexError on all_top_tens[0].
    with pathlib.Path("output.csv").open("w", newline="") as f:
        writer = csv.DictWriter(f, fieldnames=["category", "title", "ISSN1", "ISSN2"])
        writer.writeheader()
        writer.writerows(all_top_tens)


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment