Last active
September 6, 2024 18:58
-
-
Save daeh/abc6d46d897b58a657699fa1a408573e to your computer and use it in GitHub Desktop.
Import Papers 3 library into Zotero
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
# -*- coding: utf-8 -*- | |
"""Script to facilitate the import of a Readcube Papers 3 library into Zotero | |
__Purpose of this script__ | |
If you export your Readcube (Mekentosj) Papers3 library as a BibTeX file, the file paths to the PDFs are not formatted | |
correctly for Zotero to import them. | |
The specific issues include that: | |
* Papers3 does not export the file paths in a way that Zotero can understand. | |
* Papers3 does not export the paths to supplementary files, so only the primary PDF is imported into Zotero. | |
* Papers3 will export the primary PDF multiple times so you'll end up with multiple copies of the same PDF in Zotero. | |
* Papers3 includes superfluous supplementary files that you typically don't want to import into Zotero (e.g. *.html and | |
*.webarchive files). | |
This script will take the BibTeX file you exported from Papers3 and modify the file paths so that they can be imported into | |
Zotero. | |
__Usage__ | |
This script takes as input a BibTeX library exported from readcube/mekentosj Papers3 and outputs a BibTeX library for Zotero | |
to import. | |
The script preserves your Papers citekeys, adds supplementary files from the Papers3 Library, removes duplicate links to | |
PDFs, and removes extraneous *.html and *.webarchive files that are often created by importing articles into Papers from | |
a web browser. | |
__Instructions__ | |
* Make sure to have Better BibTeX pre-installed to Zotero if you want to preserve the Papers citekeys. | |
* Export your Papers3 library as a *.bib file. | |
Export > BibTeX Library | |
Make sure to set the "BibTex Record" option to "Complete". This will cause papers to include the paths to the main PDF | |
(or whatever) file in the *.bib export | |
* Run this script with python 3.7 or higher to generate the file, 'zotero_import.bib', in the same location as the BibTeX | |
library export. | |
* You can pass the script the paths to the Papers3 library and the BibTeX library export as command line arguments, | |
e.g.: | |
python Papers3_to_Zotero.py --papers "~/Documents/Library.papers3" --bibtex "~/Desktop/Library.bib" | |
* Or you can modify the script by updating the 'papers_lib_hardcoded' and 'bibtex_lib_hardcoded' variables with the | |
paths to your Papers3 library and the BibTeX library that you just exported. E.g.: | |
papers_lib_hardcoded = "~/Documents/User Library/Library.papers3" ### Path to Papers3 Library | |
bibtex_lib_hardcoded = "~/Desktop/full_library_export.bib" ### Path to Papers BibTeX library export | |
* Running the script will generate a new BibTeX file, 'zotero_import.bib', in the same location as the BibTeX library | |
export. | |
* Import the 'zotero_import.bib' file that gets generated with Zotero. | |
* Be sure to check the 'Import errors found:' file if Zotero generates one (if it exists, it will be in whatever folder you | |
imported the library to; sort by title to find it). | |
* Also check that special characters in titles and journal names were imported correctly. Sometimes '{\&}' in the | |
zotero_import.bib will be imported as '<span class="nocase">&</span>'. I'm not sure why or when this happens. You can | |
search for "</span>" to check. | |
__NOTE__ | |
The Collections groupings are not preserved with this method. This is one way to manually get your Papers3 Collections into | |
Zotero after following the above instructions: | |
* Export each collection as a BibTex library ("Export" set to "Selected Collection" and "BibTex Record" set to "Standard"). | |
This will prevent any file paths from being included in the *.bib file. | |
* Import that *.bib file directly to Zotero with the option to "Place imported collections and items into new collection" | |
selected. | |
* Then merge the duplicate records. That will give you a new collection with links to the right papers from your Zotero library. | |
* In this strategy, you have to do that for each one of your Papers3 Collections. Not ideal but maybe tolerable. | |
__Author__ | |
Dae Houlihan | |
__Source__ | |
https://gist.github.com/daeh/abc6d46d897b58a657699fa1a408573e | |
""" | |
import argparse | |
import re | |
import sys | |
from pathlib import Path | |
from warnings import warn | |
def _find_supplementary_files(primary_file_path):
    """Return the extra attachments Papers3 stored next to a primary file.

    Looks in the "Supplemental" and "Media" directories that sit beside
    ``primary_file_path`` and collects every regular file except ``*.html`` /
    ``*.webarchive`` files and the primary file itself.  Each directory's
    entries are sorted so the generated BibTeX is reproducible between runs.
    """
    supp_files = []
    for dir_extra in ("Supplemental", "Media"):
        supp_dir = primary_file_path.parent / dir_extra
        # is_dir() (rather than exists()) so a stray *file* with this name
        # does not make iterdir() raise NotADirectoryError.
        if not supp_dir.is_dir():
            continue
        supp_files.extend(
            x
            for x in sorted(supp_dir.iterdir())
            if x.is_file()
            and x.suffix not in (".html", ".webarchive")
            and str(x) != str(primary_file_path)
        )
    return supp_files


def main(papers=None, bibtex=None):
    """Rewrite the ``file`` fields of a Papers3 BibTeX export for Zotero.

    Every line that starts with ``file = {`` is normalised: the doubled braces
    are removed, repeated links to the same file are collapsed into one, and
    links to the files found in the neighbouring "Supplemental" and "Media"
    directories are appended.  All other lines are copied unchanged.  The
    result is written to ``zotero_import.bib`` next to the BibTeX export.

    Args:
        papers: Path to the ``*.papers3`` library directory.  When ``None``
            the hard-coded ``papers_lib_hardcoded`` value below is used.
        bibtex: Path to the ``*.bib`` file exported from Papers3.  When
            ``None`` the hard-coded ``bibtex_lib_hardcoded`` value is used.

    Returns:
        ``0`` once the output file has been written.

    Raises:
        Exception: If the library path does not end in ``.papers3`` or is not
            a directory, or if the BibTeX path is not an existing ``.bib`` file.
        AssertionError: If a ``file`` field still references more than one
            file after de-duplication, or cannot be parsed.

    A ``UserWarning`` is issued for every linked file that is not on disk;
    those paths are also listed at the end of the run.
    """
    ################################################
    ### Update these paths or pass via command line:
    ################################################

    ### Path to Papers3 Library ###
    papers_lib_hardcoded = "~/Documents/Library.papers3"

    ### Path to the BibTeX export of the Papers3 Library ###
    bibtex_lib_hardcoded = "~/Desktop/library.bib"

    ################################################

    papers_lib = papers_lib_hardcoded if papers is None else papers
    bibtex_lib = bibtex_lib_hardcoded if bibtex is None else bibtex

    papers_library = Path(papers_lib).expanduser()
    bibtex_library = Path(bibtex_lib).expanduser()

    if not str(papers_library).endswith(".papers3"):
        raise Exception(
            f"The variable 'papers_library' should end in with '.papers3' but is rather: \n\t{str(papers_library)}"
        )

    if not papers_library.is_dir():
        raise Exception(
            f"The path you provided to the Papers3 library does not seem to exist or is not a directory: \n\t{str(papers_library)}"
        )

    if not (bibtex_library.is_file() and bibtex_library.suffix == ".bib"):
        raise Exception(
            f"The path you provided to the BibTeX Library file you exported from Papers3 does not seem to exist or is not '.bib' file: \n\t{str(bibtex_library)}"
        )

    # The library path is spliced into regular expressions, so escape *every*
    # metacharacter (not only parentheses) to match it literally.
    library_prefix = re.escape(str(papers_library)) + r"/"

    # Compile once: these patterns do not change between lines.
    double_brace_re = re.compile(r"^file = {{(.*?)}},?", flags=re.M)
    # Collapse any number of identical, ';'-separated copies of one link.
    duplicate_re = re.compile(r"^file = {(.*?)(?:;\1)+},?", flags=re.M)
    # group(1): path relative to the library; group(2): MIME type.
    primary_path_re = re.compile(r"^file = {.*?:" + library_prefix + r"(.*?\..*?):(.*?/.*?)},?")
    # group(1): the whole field up to (excluding) the closing brace.
    primary_entry_re = re.compile(r"(^file = {.*?:" + library_prefix + r".*?\..*?:application/.*?)},?")

    out, missing = [], []

    # Read as UTF-8 explicitly so input and output use the same encoding
    # regardless of the platform default.
    with open(bibtex_library, "r", encoding="utf-8") as btlib:
        for line in btlib:
            if not line.startswith("file = {"):
                out.append(line)
                continue

            newline = double_brace_re.sub(r"file = {\1},", line)
            newline = duplicate_re.sub(r"file = {\1},", newline)

            # Explicit raises instead of `assert`, which is stripped under -O.
            if ";" in newline:
                raise AssertionError(
                    f"Expected the entry to reference only one file after removing duplicates:: \n{newline}"
                )

            filepath_relative = primary_path_re.search(newline)
            if filepath_relative is None:
                raise AssertionError(
                    f"Unable to match regex expression:: \n{primary_path_re.pattern} \nwith entry from BibTex:: \n{newline}"
                )

            primary_file_path = papers_library / filepath_relative.group(1)
            if not primary_file_path.is_file():
                warn(f"The linked file was not found: {primary_file_path}", UserWarning)
                missing.append(primary_file_path)

            supp_files = _find_supplementary_files(primary_file_path)
            if supp_files:
                primary_line = primary_entry_re.search(newline)
                if primary_line is None:
                    raise AssertionError(
                        f"Unable to match regex expression:: \n{primary_entry_re.pattern} \nwith entry from BibTex:: \n{newline}"
                    )
                parts = [primary_line.group(1)]
                for x in supp_files:
                    print(f"adding supplementary file for {x.name}")
                    # Path.suffix keeps its leading dot; drop it so the type
                    # reads "application/pdf" rather than "application/.pdf".
                    subtype = x.suffix.lstrip(".")
                    parts.append(f"{x.stem} Supp{x.suffix}:{x}:application/{subtype}")
                newline = ";".join(parts) + "},\n"

            out.append(newline)

    ### New BibTeX record to import into Zotero
    modified_lib = bibtex_library.parent / "zotero_import.bib"
    with open(modified_lib, "w", encoding="utf-8") as outfile:
        outfile.writelines(out)

    if missing:
        print("\n\nList of missing files::\n")
        for mf in missing:
            print(mf)
        print(
            f"\n\nScript completed but {len(missing)} files referenced in the BibTeX library were not located. They are listed above."
        )
    else:
        print(
            f"\n\nScript appears to have completed successfully. You can now import this file into Zotero (make sure Better BibTeX is already installed): \n\t{str(modified_lib)}"
        )

    return 0
def _cli(argv=None):
    """Parse the command-line options into keyword arguments for ``main``.

    Args:
        argv: Optional list of argument strings to parse.  When ``None``
            (the default) argparse reads ``sys.argv[1:]``.

    Returns:
        A dict holding only the options that were actually supplied
        (``"papers"`` and/or ``"bibtex"``).  Omitted options are left out
        entirely because ``argument_default`` is ``argparse.SUPPRESS``, so
        ``main`` falls back to its own defaults for them.
    """
    parser = argparse.ArgumentParser(
        description=__doc__,
        # The module docstring is pre-formatted, multi-paragraph text; the raw
        # formatter shows it as written instead of re-wrapping it into one
        # block.  No defaults are displayed either way, since every default is
        # SUPPRESS.
        formatter_class=argparse.RawDescriptionHelpFormatter,
        argument_default=argparse.SUPPRESS,
    )
    parser.add_argument("-p", "--papers", help="Path to Papers3 Library")
    parser.add_argument("-b", "--bibtex", help="Path to the BibTeX export")
    return vars(parser.parse_args(argv))
if __name__ == "__main__":
    # Forward only the options given on the command line to main() and use
    # its return value as the process exit status.
    cli_kwargs = _cli()
    exit_status = main(**cli_kwargs)
    sys.exit(exit_status)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Wow. 5 years is a much longer lifespan for this than I expected. Glad it was useful!