Created
July 28, 2025 04:16
-
-
Save JotaRata/1aa3c0ab1036cff5349b648b918e4cea to your computer and use it in GitHub Desktop.
Organize rule for Astronomy papers
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# organize configuration file | |
# https://organize.readthedocs.io | |
rules: | |
- name: Move and organize Scientific papers from the Downloads directory | |
locations: ~/Downloads | |
subfolders: false | |
filters: | |
# Only PDF files are considered.
- extension: pdf | |
# Content heuristic: the extracted text must contain both "abstract" and
# "introduction" (case-insensitive; letter-spaced forms such as
# "A B S T R A C T" also match). The whole text is captured as the named
# group `text`, which the python filter reads via filecontent.get("text").
- filecontent: (?si)^(?=.*(?:abstract|a\s*b\s*s\s*t\s*r\s*a\s*c\s*t))(?=.*(?:introduction|i\s*n\s*t\s*r\s*o\s*d\s*u\s*c\s*t\s*i\s*o\s*n))(?P<text>.*) | |
- python: | | |
import re, requests, unicodedata | |
import xml.etree.ElementTree as ET | |
def parse_names(authors):
    """Build the author part of the file name from the first usable author.

    The result has the form ``<LastWord>_<FirstInitial>`` (for example
    ``"Jane Doe"`` becomes ``"Doe_J"``), with apostrophes removed and
    accented letters reduced to their unaccented base characters.

    Args:
        authors: Iterable of author name strings. Only the first entry
            that contains at least one word is used.

    Returns:
        The formatted name, or ``'Unknown'`` when no entry contains a name.
    """
    # Only one author ends up in the file name, so stop at the first entry
    # that has any content. Blank entries are skipped instead of raising
    # IndexError.
    for author in authors:
        parts = author.split()
        if parts:
            break
    else:
        return 'Unknown'
    name = f'{parts[-1]}_{parts[0][0]}'.replace("'", '')
    # NFKD splits accented letters into base letter + combining mark; dropping
    # the combining marks leaves the unaccented base characters.
    decomposed = unicodedata.normalize('NFKD', name)
    return ''.join(c for c in decomposed if not unicodedata.combining(c))
def parse_title(title):
    """Turn a paper title into a file-name-friendly token.

    Markup tags (``<scp>``, ``<i>``, ``</sub>`` ...) are stripped, characters
    that are path separators or invalid in Windows file names are removed,
    and every run of whitespace (including line breaks) becomes a single
    underscore.

    Args:
        title: Title string as returned by the metadata service.

    Returns:
        The sanitized title with words joined by ``_``.
    """
    # Drop inline markup such as <scp>...</scp> before touching '/' or '<'.
    title = re.sub(r'</?[A-Za-z][^<>]*>', '', title)
    # Remove characters that would break or split the destination path.
    title = re.sub(r'[:/\\?*"<>|]', '', title)
    # split() with no argument collapses any whitespace run, so wrapped
    # titles no longer fuse words or produce doubled underscores.
    return '_'.join(title.split())
text = filecontent.get("text") | |
cutoff = re.search(r'\b(references|r\s*e\s*f\s*e\s*r\s*e\s*n\s*c\s*e\s*s|bibliography)\b', text, re.IGNORECASE) | |
if cutoff: | |
text = text[:cutoff.start()] | |
doi_match = re.search(r'10\.\d{4,9}/[-._;()/:a-zA-Z0-9]+', text) | |
arxiv_match = re.search(r'arxiv[:\s]?\d{4}\.\d{4,5}(v\d+)?', text, re.I) | |
if not arxiv_match: | |
arxiv_match = re.search(r'arxiv[:\s]?astro-ph/\d{7}(v\d+)?', text, re.I) | |
if arxiv_match: | |
arxiv = arxiv_match.group().split(":")[-1] | |
print('Found ArXiv', arxiv) | |
resp = requests.get(f'http://export.arxiv.org/api/query?id_list={arxiv}') | |
if resp.status_code == 200: | |
prefix = '{http://www.w3.org/2005/Atom}' | |
root = ET.fromstring(resp.text) | |
entry = root.find(prefix+'entry') | |
title = entry.find(prefix+'title').text.strip() | |
authors = [a.find(prefix+'name').text for a in entry.findall(prefix+'author')] | |
year = entry.find(prefix+'published').text[:4] | |
if len(authors) > 2: | |
authors = authors[:2] + ['et al.'] | |
return {'title': parse_title(title), 'authors': parse_names(authors), 'year' : year, 'code': arxiv } | |
elif doi_match: | |
doi = doi_match.group().rstrip('.,;\'"') | |
print('Found DOI', doi) | |
resp = requests.get(f'https://api.crossref.org/works/{doi}', timeout=5) | |
if resp.status_code == 200: | |
data = resp.json()['message'] | |
title = data.get('title', [' '])[0] | |
authors = [f"{a.get('given', '')} {a.get('family', '')}".strip() for a in data.get('author', [])] | |
year = data.get('issued', {}).get('date-parts', [[None]])[0][0] | |
return {'title': parse_title(title), 'authors': parse_names(authors), 'year' : year, 'code': doi } | |
raise Exception('No valid arXiv or DOI found in main content') | |
actions: | |
# The {python.authors}, {python.year} and {python.title} placeholders are
# filled from the dict returned by the python filter above.
- move: "~/Downloads/Papers/{python.authors}_{python.year}__{python.title}.pdf" |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Organize Scientific Papers from the Downloads Directory
Automatically manage all those scientific papers you downloaded and forgot to read.
This script identifies potential scientific papers by scanning PDF contents for keywords like "abstract" and "introduction". It then looks for a DOI or arXiv ID in the text. To keep things clean, it stops parsing once it hits the bibliography.
Once a DOI or arXiv ID is found, it queries Crossref or arXiv to retrieve the paper's metadata (title, authors, year) and renames the file accordingly. The PDF is then moved to your dedicated papers folder with a clean, informative filename.
Requirements:
Organize v3.0.0 or higher
https://github.com/tfeldmann/organize
Installation
Download the file, save it as paper-cleanup.yaml (the name used by the commands under Usage), and move it to:
$HOME/.config/organize
Usage
Create a folder for your papers. By default, the script uses
$HOME/Downloads/Papers
. If you want to change that, update the path in the config file accordingly. To preview the actions without modifying any files, run:
organize sim paper-cleanup
To execute the cleanup process, run:
organize run paper-cleanup