Last active
January 6, 2025 13:13
-
-
Save Pangoraw/2604e88c3743eb9fd53f1643cf4e5a8f to your computer and use it in GitHub Desktop.
Download articles from the web when Zotero has not synced them.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!python | |
""" | |
Downloads Zotero files that are not synced on this laptop. | |
""" | |
import os | |
from pathlib import Path | |
import argparse | |
import sqlite3 | |
import urllib.request | |
from dataclasses import dataclass | |
# Default directory searched for zotero.sqlite; `~` is expanded to the user's
# home. Presumably the data directory of a snap-packaged Zotero install
# (TODO confirm). Can be overridden with the --zotero_dir option.
ZOTERO_DIR = Path("~/snap/zotero-snap/common/Zotero/").expanduser()
@dataclass
class ArticleFile:
    """A PDF attachment of a Zotero library item, as read from ``zotero.sqlite``.

    Fields are declared in the column order of the query that builds
    instances, so a result row can be splatted straight into the constructor.
    """

    itemID: int  # ID of the parent library item
    attachmentItemID: int  # ID of the attachment item itself
    title: str  # title value of the parent item
    path: str  # attachment path as stored in the database, e.g. "storage:paper.pdf"
    key: str  # key of the attachment item; names its folder under storage/

    def file_path(self, zotero_dir: Path) -> Path:
        """Return where the attachment is expected on disk.

        The location is ``<zotero_dir>/storage/<key>/<name>``, where ``<name>``
        is ``path`` with its ``storage:`` prefix removed.
        """
        prefix = "storage:"
        # Only strip the prefix when it is really there, instead of blindly
        # chopping the first characters off an unprefixed path.
        if self.path.startswith(prefix):
            name = self.path[len(prefix):]
        else:
            name = self.path
        return zotero_dir / "storage" / self.key / name

    def exists(self, zotero_dir: Path) -> bool:
        """Return True if the attachment file is already present on disk."""
        return self.file_path(zotero_dir).exists()

    def download(self, url: str, zotero_dir: Path):
        """Download ``url`` to this attachment's location under ``zotero_dir``.

        Missing parent directories (including ``storage/`` itself) are created.
        The data is first written to a sibling ``.part`` file and renamed once
        complete, so an interrupted download never leaves a truncated file
        that ``exists`` would later accept as a finished download.

        Raises:
            Whatever ``urllib.request.urlretrieve`` raises when the download
            fails (for example ``urllib.error.URLError``).
        """
        target = self.file_path(zotero_dir)
        target.parent.mkdir(parents=True, exist_ok=True)
        partial = target.with_name(target.name + ".part")
        try:
            urllib.request.urlretrieve(url, partial)
            partial.replace(target)
        finally:
            # After a successful rename the partial file is gone; this only
            # removes leftovers from a failed or interrupted download.
            if partial.exists():
                partial.unlink()
def get_files_articles(conn):
    """List the PDF attachments whose stored path starts with ``storage:``.

    Runs a single query on ``conn`` and wraps every result row
    (itemID, attachmentItemID, title, path, key) in an ``ArticleFile``.
    The title is the parent item's value for field 1.
    """
    query = """
        -- Getting file paths
        SELECT items.itemID, attachmentItems.itemID as attachmentItemID, value as title, itemAttachments.path, attachmentItems.key FROM items
        LEFT JOIN itemData, itemDataValues, itemAttachments, items attachmentItems
        WHERE itemData.itemID = items.itemID
        AND itemData.fieldID = 1
        AND itemData.valueID = itemDataValues.valueID
        AND itemAttachments.path LIKE 'storage:%'
        AND itemAttachments.parentItemID = items.itemID
        AND attachmentItems.itemID = itemAttachments.itemID
        AND itemAttachments.contentType = 'application/pdf';
        """
    rows = conn.execute(query)
    return [ArticleFile(*row) for row in rows]
def get_url(conn, articles):
    """Look up the stored URL of each article's attachment item.

    Returns a list aligned with ``articles``: for every article, the value of
    field 13 recorded for its ``attachmentItemID``, or ``None`` when the
    database holds no such value.
    """
    query = """
        -- Get download url
        SELECT value as url FROM items
        LEFT JOIN itemData, itemDataValues
        WHERE itemData.fieldID = 13
        AND itemData.valueID = itemDataValues.valueID
        AND itemData.itemID = items.itemID
        -- AND itemDataValues.value LIKE '%arxiv%'
        AND items.itemID = ?;
        """

    def lookup(article):
        # One query per article; only the first matching row is used.
        row = conn.execute(query, (article.attachmentItemID,)).fetchone()
        return None if row is None else row[0]

    return [lookup(article) for article in articles]
def fetch_and_download(filters, zotero_dir: Path):
    """Download every missing attachment that passes all ``filters``.

    Args:
        filters: callables that take an ``ArticleFile`` and return a truthy
            value for articles to keep; an article must pass all of them.
        zotero_dir: directory containing ``zotero.sqlite``; downloaded files
            are placed below it.

    Articles already present on disk and articles without a stored URL are
    skipped. A failed download is reported on its progress line and does not
    stop the remaining downloads.
    """
    conn = sqlite3.connect(zotero_dir / "zotero.sqlite")
    try:
        articles = [
            a
            for a in get_files_articles(conn)
            if not a.exists(zotero_dir) and all(f(a) for f in filters)
        ]
        urls = get_url(conn, articles)
    finally:
        # All queries are done at this point; release the database before the
        # (potentially slow) downloads start.
        conn.close()

    for article, url in zip(articles, urls):
        if url is None:
            continue
        # Flush so the title is visible while the download is in progress.
        print(f"Downloading {article.title}...", end="", flush=True)
        try:
            article.download(url, zotero_dir)
        except (OSError, ValueError) as err:
            # OSError covers urllib's URLError/HTTPError and file-system
            # errors; ValueError is raised for malformed URLs.
            print(f"❌ ({err})", end="")
        else:
            print("✅", end="")
        finally:
            print()
    print("Done!")
def main():
    """Parse the command-line options and download the matching articles."""
    cli = argparse.ArgumentParser(prog="zotero downloader")
    cli.add_argument(
        "--filter",
        default="",
        required=False,
        help="Filter for the articles to download",
    )
    cli.add_argument(
        "--zotero_dir",
        type=Path,
        default=ZOTERO_DIR,
        help="Path to directory containing zotero.sqlite",
    )
    options = cli.parse_args()

    def title_matches(article):
        # Case-insensitive substring match; the empty default matches every title.
        return options.filter.lower() in article.title.lower()

    fetch_and_download([title_matches], options.zotero_dir)
# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
Author
Pangoraw
commented
May 5, 2022
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment