Skip to content

Instantly share code, notes, and snippets.

@Pangoraw
Last active January 6, 2025 13:13
Show Gist options
  • Save Pangoraw/2604e88c3743eb9fd53f1643cf4e5a8f to your computer and use it in GitHub Desktop.
Save Pangoraw/2604e88c3743eb9fd53f1643cf4e5a8f to your computer and use it in GitHub Desktop.
Download articles from the web when Zotero has not synced them.
#!python
"""
Downloads Zotero files that are not synced on this laptop.
"""
import os
from pathlib import Path
import argparse
import sqlite3
import urllib.request
from dataclasses import dataclass
# Default Zotero data directory, used when --zotero_dir is not given.
# NOTE(review): the path looks like the layout of the zotero-snap package;
# other installs presumably need --zotero_dir -- confirm.
_DEFAULT_ZOTERO_DIR = "~/snap/zotero-snap/common/Zotero/"
ZOTERO_DIR = Path(_DEFAULT_ZOTERO_DIR).expanduser()
@dataclass
class ArticleFile:
    """One PDF attachment together with the title of its parent item.

    The fields are filled positionally from the rows returned by
    ``get_files_articles``, so their order must match that query's column
    order.
    """

    # ID of the parent item (``items.itemID``).
    itemID: int
    # ID of the attachment item itself.
    attachmentItemID: int
    # Title value of the parent item.
    title: str
    # Attachment path as stored in the database, prefixed with ``storage:``.
    path: str
    # Key of the attachment item; used as the directory name under storage/.
    key: str

    def file_path(self, zotero_dir: Path) -> Path:
        """Return where the attachment lives on disk below ``zotero_dir``."""
        # Drop the leading "storage:" marker to get the bare file name.
        return zotero_dir / "storage" / self.key / self.path[len("storage:") :]

    def exists(self, zotero_dir: Path) -> bool:
        """Return True when the attachment file is already present locally."""
        return self.file_path(zotero_dir).exists()

    def download(self, url: str, zotero_dir: Path) -> None:
        """Fetch ``url`` and store it at this attachment's local path.

        The payload is first written to a sibling ``*.part`` file and only
        moved into place once the transfer has finished, so an interrupted
        download never leaves a truncated file that ``exists`` would
        mistake for a synced attachment.

        Raises:
            Whatever ``urllib.request.urlretrieve`` raises (for example
            ``urllib.error.URLError``); the partial file is removed first.
        """
        target = self.file_path(zotero_dir)
        # parents=True: "storage/" itself may be missing on a fresh machine.
        target.parent.mkdir(parents=True, exist_ok=True)

        partial = target.with_name(target.name + ".part")
        try:
            urllib.request.urlretrieve(url, partial)
        except BaseException:
            # Also covers Ctrl-C: clean up, then let the error propagate.
            if partial.exists():
                partial.unlink()
            raise
        partial.replace(target)
def get_files_articles(conn):
    """Return an ``ArticleFile`` for every stored PDF attachment.

    Runs a single query on ``conn`` that pairs each item with its
    ``storage:``-prefixed ``application/pdf`` attachments and with the
    item's value for field 1 (selected as ``title``).
    """
    # Column order must match the positional fields of ArticleFile.
    query = (
        "\n"
        "-- Getting file paths\n"
        "SELECT items.itemID, attachmentItems.itemID as attachmentItemID, "
        "value as title, itemAttachments.path, attachmentItems.key FROM items\n"
        "LEFT JOIN itemData, itemDataValues, itemAttachments, items attachmentItems\n"
        "WHERE itemData.itemID = items.itemID\n"
        "AND itemData.fieldID = 1\n"
        "AND itemData.valueID = itemDataValues.valueID\n"
        "AND itemAttachments.path LIKE 'storage:%'\n"
        "AND itemAttachments.parentItemID = items.itemID\n"
        "AND attachmentItems.itemID = itemAttachments.itemID\n"
        "AND itemAttachments.contentType = 'application/pdf';\n"
    )
    rows = conn.execute(query)
    return [ArticleFile(*row) for row in rows]
def get_url(conn, articles):
    """Look up the stored URL of each article's attachment item.

    Returns a list parallel to ``articles``: the value of field 13 for the
    item whose ID is ``article.attachmentItemID``, or ``None`` when the
    database has no such value.
    """
    query = (
        "\n"
        "-- Get download url\n"
        "SELECT value as url FROM items\n"
        "LEFT JOIN itemData, itemDataValues\n"
        "WHERE itemData.fieldID = 13\n"
        "AND itemData.valueID = itemDataValues.valueID\n"
        "AND itemData.itemID = items.itemID\n"
        "-- AND itemDataValues.value LIKE '%arxiv%'\n"
        "AND items.itemID = ?;\n"
    )

    def lookup(article):
        # One query per article; only the first matching row is used.
        row = conn.execute(query, (article.attachmentItemID,)).fetchone()
        return row[0] if row is not None else None

    return [lookup(article) for article in articles]
def fetch_and_download(filters, zotero_dir: Path):
    """Download every missing PDF attachment that passes all ``filters``.

    Args:
        filters: Predicates called with an ``ArticleFile``; an article is
            kept only when every predicate returns a truthy value.
        zotero_dir: Directory containing ``zotero.sqlite`` and ``storage/``.

    Raises:
        Any exception raised by ``ArticleFile.download`` propagates to the
        caller after the current progress line has been terminated; the
        remaining articles are then not attempted.
    """
    conn = sqlite3.connect(zotero_dir / "zotero.sqlite")
    try:
        articles = [
            a
            for a in get_files_articles(conn)
            if not a.exists(zotero_dir) and all(f(a) for f in filters)
        ]
        urls = get_url(conn, articles)
    finally:
        # The database is only needed for the two lookups above; release it
        # before the (potentially slow) downloads start.
        conn.close()

    for article, url in zip(articles, urls):
        if url is None:
            # No URL recorded for this attachment: nothing to fetch.
            continue
        # flush=True so the progress line shows up while the download runs
        # instead of sitting in the stdout buffer until the newline.
        print(f"Downloading {article.title}...", end="", flush=True)
        try:
            article.download(url, zotero_dir)
            print("✅", end="")
        finally:
            # Always end the progress line, even when the download failed.
            print()
    print("Done!")
def main():
    """Parse the command line and download the matching missing articles.

    ``--filter`` is a case-insensitive substring matched against article
    titles (the empty default matches everything); ``--zotero_dir``
    overrides the default Zotero data directory.
    """
    parser = argparse.ArgumentParser(prog="zotero downloader")
    parser.add_argument(
        "--filter",
        required=False,
        default="",
        help="Filter for the articles to download",
    )
    parser.add_argument(
        "--zotero_dir",
        default=ZOTERO_DIR,
        type=Path,
        help="Path to directory containing zotero.sqlite",
    )
    options = parser.parse_args()

    needle = options.filter.lower()

    def title_matches(article):
        # Case-insensitive substring match on the title.
        return needle in article.title.lower()

    fetch_and_download([title_matches], options.zotero_dir)


# Run only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
@Pangoraw
Copy link
Author

Pangoraw commented May 5, 2022

 $ python download_zotero.py --filter="Attention"
Downloading Training data-efficient image transformers & distillation through attention...✅
Downloading Attention Is All You Need...✅
Done!     

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment