@JotaRata
Created July 28, 2025 04:16
Organize rule for Astronomy papers
# organize configuration file
# https://organize.readthedocs.io
rules:
  - name: Move and organize Scientific papers from the Downloads directory
    locations: ~/Downloads
    subfolders: false
    filters:
      - extension: pdf
      # A paper must contain both "abstract" and "introduction" (possibly
      # letter-spaced by the PDF text extraction); capture the full text.
      - filecontent: (?si)^(?=.*(?:abstract|a\s*b\s*s\s*t\s*r\s*a\s*c\s*t))(?=.*(?:introduction|i\s*n\s*t\s*r\s*o\s*d\s*u\s*c\s*t\s*i\s*o\s*n))(?P<text>.*)
      - python: |
          import re, requests, unicodedata
          import xml.etree.ElementTree as ET

          def parse_names(authors):
              # First author's surname and first initial (e.g. "Doe_J"),
              # with apostrophes and diacritics stripped for a clean filename
              names = [f'{a.split()[-1]}_{a.split()[0][0]}' for a in authors][0].replace("'", '')
              names = ''.join(c for c in unicodedata.normalize('NFKD', names)
                              if not unicodedata.combining(c))
              return names

          def parse_title(title):
              # Remove characters that are awkward in filenames
              return title.replace('\n', '').replace(':', '').replace(' ', '_').replace('/', '').replace('<scp>', '')

          text = filecontent.get("text")
          # Stop at the bibliography so identifiers of *cited* papers are ignored
          cutoff = re.search(r'\b(references|r\s*e\s*f\s*e\s*r\s*e\s*n\s*c\s*e\s*s|bibliography)\b', text, re.IGNORECASE)
          if cutoff:
              text = text[:cutoff.start()]
          doi_match = re.search(r'10\.\d{4,9}/[-._;()/:a-zA-Z0-9]+', text)
          arxiv_match = re.search(r'arxiv[:\s]?(\d{4}\.\d{4,5}(?:v\d+)?)', text, re.I)
          if not arxiv_match:
              # Old-style identifiers, e.g. astro-ph/0101001
              arxiv_match = re.search(r'arxiv[:\s]?(astro-ph/\d{7}(?:v\d+)?)', text, re.I)
          if arxiv_match:
              arxiv = arxiv_match.group(1)
              print('Found ArXiv', arxiv)
              resp = requests.get(f'http://export.arxiv.org/api/query?id_list={arxiv}', timeout=5)
              if resp.status_code == 200:
                  prefix = '{http://www.w3.org/2005/Atom}'
                  root = ET.fromstring(resp.text)
                  entry = root.find(prefix + 'entry')
                  title = entry.find(prefix + 'title').text.strip()
                  authors = [a.find(prefix + 'name').text for a in entry.findall(prefix + 'author')]
                  year = entry.find(prefix + 'published').text[:4]
                  if len(authors) > 2:
                      authors = authors[:2] + ['et al.']
                  return {'title': parse_title(title), 'authors': parse_names(authors), 'year': year, 'code': arxiv}
          elif doi_match:
              doi = doi_match.group().rstrip('.,;\'"')
              print('Found DOI', doi)
              resp = requests.get(f'https://api.crossref.org/works/{doi}', timeout=5)
              if resp.status_code == 200:
                  data = resp.json()['message']
                  title = data.get('title', [' '])[0]
                  authors = [f"{a.get('given', '')} {a.get('family', '')}".strip() for a in data.get('author', [])]
                  year = data.get('issued', {}).get('date-parts', [[None]])[0][0]
                  return {'title': parse_title(title), 'authors': parse_names(authors), 'year': year, 'code': doi}
          raise Exception('No valid arXiv or DOI found in main content')
    actions:
      - move: "~/Downloads/Papers/{python.authors}_{python.year}__{python.title}.pdf"
Organize Scientific Papers from the Downloads Directory

Automatically manage all those scientific papers you downloaded and forgot to read.

This rule identifies likely scientific papers by scanning PDF contents for keywords such as "abstract" and "introduction". It then looks for a DOI or arXiv ID in the text. Parsing stops at the bibliography so that identifiers of cited papers are not picked up by mistake.
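The keyword filter and the bibliography cutoff can be sketched in plain Python. The `sample` string below is made-up text for illustration; the real rule also matches letter-spaced variants of the keywords produced by PDF extraction:

```python
import re

# Simplified versions of the two filters used by the rule:
# the content must contain both keywords, and parsing stops at the references.
CONTENT_RE = re.compile(r'(?si)^(?=.*abstract)(?=.*introduction)(?P<text>.*)')
CUTOFF_RE = re.compile(r'\b(references|bibliography)\b', re.IGNORECASE)

sample = ("Abstract We study X. 1. Introduction ... "
          "doi:10.1093/mnras/stu123 References [1] Some cited paper")

m = CONTENT_RE.search(sample)
text = m.group('text')
cutoff = CUTOFF_RE.search(text)
if cutoff:
    # Keep only the main body, dropping the reference list
    text = text[:cutoff.start()]

print('10.1093' in text)       # the paper's own DOI survives
print('cited' in text.lower()) # the reference list is gone
```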

Once a DOI or arXiv ID is found, it queries Crossref or arXiv to retrieve the paper's metadata (title, authors, year) and renames the file accordingly. The PDF is then moved to your dedicated papers folder with a clean, informative filename.
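The filename construction from Crossref-style metadata works roughly like this. The `message` dict below is hypothetical sample data shaped like a Crossref `/works` response, not a real record:

```python
import unicodedata

# Hypothetical metadata in the shape of Crossref's /works "message" field
message = {
    'title': ['Galaxy Evolution: A Review'],
    'author': [{'given': 'María', 'family': 'Pérez'},
               {'given': 'John', 'family': 'Doe'}],
    'issued': {'date-parts': [[2024]]},
}

def parse_names(authors):
    # First author's surname plus first initial, diacritics stripped
    names = [f'{a.split()[-1]}_{a.split()[0][0]}' for a in authors][0].replace("'", '')
    return ''.join(c for c in unicodedata.normalize('NFKD', names)
                   if not unicodedata.combining(c))

def parse_title(title):
    # Remove characters that are awkward in filenames
    return title.replace('\n', '').replace(':', '').replace(' ', '_').replace('/', '')

title = message['title'][0]
authors = [f"{a.get('given', '')} {a.get('family', '')}".strip()
           for a in message['author']]
year = message['issued']['date-parts'][0][0]
filename = f"{parse_names(authors)}_{year}__{parse_title(title)}.pdf"
print(filename)  # Perez_M_2024__Galaxy_Evolution_A_Review.pdf
```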

Requirements:

Organize v3.0.0 or higher
https://github.com/tfeldmann/organize

Installation

Download the file and move it to:
$HOME/.config/organize
(the commands below assume the file is saved as paper-cleanup.yaml)

Usage

Create a folder for your papers. By default, the script uses $HOME/Downloads/Papers. If you want to change that, update the path in the config file accordingly.
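Only the `move` action needs to change to use a different destination; `~/Documents/Papers` below is just an example path:

```yaml
actions:
  - move: "~/Documents/Papers/{python.authors}_{python.year}__{python.title}.pdf"
```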

To preview the actions without modifying any files, run:
organize sim paper-cleanup

To execute the cleanup process, run:
organize run paper-cleanup
