-
-
Save violetguos/b4ce98ac6831542004cecac492fef894 to your computer and use it in GitHub Desktop.
A simple tool that prefixes downloaded arxiv paper PDFs with the paper title, placing it in front of the id. It also removes duplicate downloads of the same arxiv paper.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""A simple tool to add the name of downloaded paper pdf's in front of the id. | |
(Written by [email protected]) | |
If there are multiple downloads of same paper, replaces the original with the | |
latest download. This can be useful in a downloads folder filled with copies. | |
For instance: | |
""" | |
import glob | |
import itertools | |
import os | |
import re | |
import shutil | |
from argparse import ArgumentParser | |
from collections import defaultdict | |
from pathlib import Path | |
from typing import Dict, Iterable, List, Optional | |
import arxiv | |
# Matches modern arxiv ids ("YYMM.NNNNN", optional "vN" version suffix) and
# legacy ids like "math.GT/0309136". The separator dot is escaped so that an
# arbitrary character (e.g. "1234x5678") no longer matches between the digits.
arxiv_id_regex = r"(\d{4}\.\d{4,5}|[a-z\-]+(\.[A-Z]{2})?\/\d{7})(v\d+)?"
def get_new_path(current_path: Path, paper_title: str) -> Path:
    """Build the renamed path, of the form "<slugified-title>_<arxiv-id>.pdf".

    The title is lowercased, every non-alphabetic character becomes a word
    separator, and the resulting words are dash-joined before the original
    stem (the arxiv id) is appended.
    """
    slug_source = "".join(c if c.isalpha() else " " for c in paper_title.lower())
    slug = "-".join(slug_source.split())
    new_stem = f"{slug}_{current_path.stem}"
    return current_path.with_name(new_stem + current_path.suffix)
def get_arxiv_id_from_filename(paper_pdf_path: Path) -> Optional[str]:
    """Extract the arxiv id embedded in a pdf filename, if any.

    Searches the file stem for the module-level ``arxiv_id_regex`` pattern
    defined above.

    Args:
        paper_pdf_path (Path): A Path to a pdf file.

    Returns:
        Optional[str]: The associated arxiv ID if available, else None.
    """
    found = re.search(arxiv_id_regex, paper_pdf_path.stem)
    return found.group(0) if found else None
def get_id_to_title(arxiv_ids: Iterable[str]) -> Dict[str, str]:
    """Given some arxiv ids, returns a dict from arxiv id to paper title.

    Args:
        arxiv_ids (Iterable[str]): An iterable of arxiv ids.

    Returns:
        Dict[str, str]: Dict from arxiv id to paper title.
    """
    # Materialize once: the iterable is consumed both by the query call and
    # by the matching loop below, which would silently yield nothing on the
    # second pass if a generator were passed in.
    arxiv_ids = list(arxiv_ids)
    arxiv_id_to_title: Dict[str, str] = {}
    for arxiv_result in arxiv.query(id_list=arxiv_ids):
        title: str = arxiv_result.title
        result_id: str = arxiv_result.id
        # There might be some 'v1' 'v2' etc string at the end of the arxiv id.
        # Find the arxiv id from the given list that matches with the result id.
        for arxiv_id in arxiv_ids:
            if arxiv_id in result_id:
                arxiv_id_to_title[arxiv_id] = title
                break
        else:
            print("Couldn't find an arxiv id that matches with the result arxiv id of", result_id)
            continue
    return arxiv_id_to_title
def remove_generated_files(previously_created_files: Dict[str, List[Path]], dryrun: bool):
    """Undo operation: restore the original "<arxiv-id>.pdf" filenames.

    WARNING: This will remove duplicates of files. For example, when given:
        /1234.12345.pdf
        /some-paper-title_1234.12345.pdf
        /foo-something_1234.12345.pdf
    will remove the lowermost two files.

    Args:
        previously_created_files (Dict[str, List[Path]]): A Dict mapping arxiv
            id to a list of Paths of files generated by this script.
        dryrun (bool): When True, only print what would happen.
    """
    prefix = "Would Undo: " if dryrun else "Undo: "
    for arxiv_id, generated_paths in previously_created_files.items():
        for generated_path in generated_paths:
            original_path = generated_path.with_name(arxiv_id + generated_path.suffix)
            if not dryrun:
                if original_path.is_file():
                    # The plain-id file still exists: just drop the generated copy.
                    os.remove(generated_path)
                else:
                    # Rename the generated file back to the plain-id name.
                    generated_path.replace(original_path)
            print(prefix, original_path, "<--", generated_path, sep="\t")
def main(paths: List[Path], replace_files: bool, force: bool, undo: bool, dryrun: bool):
    """Rename downloaded arxiv pdfs to "<paper-title>_<arxiv-id>.pdf".

    Args:
        paths (List[Path]): Candidate pdf paths to inspect.
        replace_files (bool): Move files to their new names instead of copying.
        force (bool): Overwrite destination files that already exist.
        undo (bool): Restore the original "<arxiv-id>.pdf" names, then exit.
        dryrun (bool): Only print what would happen; make no file changes.
    """
    # Maps from arxiv id to pdf files which only have this id as name.
    files_with_only_id_as_name: Dict[str, Path] = {}
    # Dictionary of the autogenerated pdf files for each arxiv id.
    # NOTE: there may be more than one PDF for the same arxiv id.
    previously_created_files: Dict[str, List[Path]] = defaultdict(list)
    start: str = ""
    for path in paths:
        ## Or, for the cool kids (Python 3.8+ walrus):
        # if (match := re.match(arxiv_id_regex, path.name)):
        arxiv_id = get_arxiv_id_from_filename(path)
        if not arxiv_id:
            print("Ignoring pdf file at path: ", path)
        # If multiple downloads of same paper, replaces the original with the latest version.
        elif path.stem.startswith(arxiv_id):
            # Browsers append " (1)", " (2)", ... on repeated downloads.
            if re.search(r"(\(\d+\))", path.stem):
                old_path = path
                new_path = path.with_name(arxiv_id + path.suffix)
                if dryrun:
                    start = "Would replace:"
                else:
                    start = "Replacing:"
                    old_path.replace(new_path)
                print(start, path, " --> ", new_path, sep="\t")
                # NOTE(review): in dryrun mode `path` now points at a file that
                # was never created; later steps are also dry, so this only
                # affects the printed paths — confirm if dryrun output matters.
                path = new_path
            files_with_only_id_as_name[arxiv_id] = path
        elif path.stem.endswith(arxiv_id):
            # File doesn't start with the arxiv id, but ends with it.
            # It is therefore a previously created file!
            previously_created_files[arxiv_id].append(path)
    if undo:
        remove_generated_files(previously_created_files, dryrun)
        exit()
    # Get the paper titles associated with each arxiv id:
    arxiv_id_to_title = get_id_to_title(files_with_only_id_as_name.keys())
    for arxiv_id, title in arxiv_id_to_title.items():
        current_path = files_with_only_id_as_name[arxiv_id]
        new_path = get_new_path(current_path, title)
        # Only creates/replaces the pdf files if the new path doesn't already
        # exist, except when the "--force" flag was passed.
        if not force and (new_path.exists() and new_path.is_file()):
            print("Skipping already-existing Path:", new_path, sep="\t")
            continue
        if replace_files:
            if dryrun:
                start = "Would replace:"
            else:
                start = "Replaced:"
                current_path.replace(new_path)
        else:
            if dryrun:
                start = "Would copy:"
            else:
                start = "Copied:"
                shutil.copy(current_path, new_path)
        print(start, current_path, " --> ", new_path, sep="\t")
if __name__ == "__main__":
    parser = ArgumentParser(description=__doc__)
    # Each "paths" argument is expanded via glob, yielding a list per pattern.
    parser.add_argument("paths", type=lambda s: glob.glob(s), nargs="+",
                        help="Paths or glob pattern of pdf files to change.")
    parser.add_argument("--replace-files", default=False, action="store_true",
                        help=("Whether to replace the files, or simply copy them to new paths. "
                              "By default, only creates copies of the files and stores them at "
                              "their new destinations."))
    parser.add_argument("-f", "--force", default=False, action="store_true",
                        help=("Whether or not to ignore/overwrite existing files when copying/replacing."))
    parser.add_argument("--undo", default=False, action="store_true",
                        help="Whether or not to undo the operation (i.e, recreate the original <xxxx.xxxxxx.pdf> files)")
    parser.add_argument("-q", "--query", default=False, action="store_true",
                        help="Query only: doesn't create/replace any files, just shows what would happen.")
    args = parser.parse_args()
    # Flatten the per-pattern glob lists into a single list of Paths.
    paths: List[Path] = list(map(Path, itertools.chain(*args.paths)))
    replace_files: bool = args.replace_files
    # --undo implies --force so restored files can overwrite leftovers.
    force: bool = args.force or args.undo
    undo: bool = args.undo
    dryrun: bool = args.query
    main(paths, replace_files, force, undo, dryrun)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment