Created
March 31, 2020 02:47
-
-
Save borice/e10e16a609b8fcb04f66c0382caf66e9 to your computer and use it in GitHub Desktop.
Example code for converting between the pairtree and stubbytree directory structures used by the HTRC Extracted Features dataset
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
# Note: depends on the `pairtree` package (`pip install pairtree`)
import os | |
import pairtree.pairtree_path as ppath | |
def stubby_to_pairtree(path: str, ef_ext: str = '.json.bz2') -> str:
    """Convert a stubbytree file path into the equivalent pairtree file path.

    The input is expected to have the shape
    ``[base/]<lib dir>/<stub dir>/<lib_id>.<clean_volid><ef_ext>``.
    The result has the shape
    ``[base/]<lib_id>/pairtree_root/<pairtree dirs>/<clean_volid>/<file name>``.
    The library id and the volume id are both read from the file name, not
    from the directory names.

    Args:
        path: Stubbytree path of the file to convert.
        ef_ext: Extension that follows the volume id in the file name.

    Returns:
        The pairtree path, anchored at the same base directory as ``path``.

    Raises:
        AssertionError: If ``path`` does not end with ``ef_ext`` or its
            directory part has fewer than two components.
        ValueError: If the file name has no ``.`` separating the library id
            from the volume id.
    """
    assert path.endswith(ef_ext), f'{path!r} does not end with {ef_ext!r}'
    stub_dir, filename = os.path.split(path)
    assert len(stub_dir.split(os.sep)) >= 2, (
        f'{path!r} must contain at least a library and a stub directory'
    )

    # Slice by explicit length: `filename[:-0]` would be empty if ef_ext == ''.
    stem = filename[:len(filename) - len(ef_ext)]
    lib_id, clean_volid = stem.split('.', 1)
    volid = ppath.id_decode(clean_volid)

    # Go up two levels (stub directory, then library directory) to reach the
    # base. dirname() keeps the leading separator of an absolute path, so a
    # tree located directly under the filesystem root stays absolute.
    base_dir = os.path.dirname(os.path.dirname(stub_dir))
    pairtree_root = os.path.join(base_dir, lib_id, 'pairtree_root')
    pairtree_dir = ppath.id_to_dirpath(volid, pairtree_root)
    return os.path.join(pairtree_dir, clean_volid, filename)
def pairtree_to_stubby(path: str, ef_ext: str = '.json.bz2') -> str:
    """Convert a pairtree file path into the equivalent stubbytree file path.

    The input is expected to have the shape
    ``[base/]<lib_id>/pairtree_root/.../<lib_id>.<clean_volid><ef_ext>``.
    The result is ``[base/]<lib_id>/<stub>/<file name>``, where ``<stub>`` is
    every third character of ``clean_volid``.

    Args:
        path: Pairtree path of the file to convert.
        ef_ext: Extension that follows the volume id in the file name.

    Returns:
        The stubbytree path, anchored at the same base directory as ``path``.

    Raises:
        AssertionError: If ``path`` does not end with ``ef_ext`` or does not
            contain ``pairtree_root``.
        ValueError: If the file name has no ``.`` separating the library id
            from the volume id, or if the directory part of ``path`` has no
            ``<lib_id>/pairtree_root/`` component pair matching the library
            id found in the file name.
    """
    assert path.endswith(ef_ext), f'{path!r} does not end with {ef_ext!r}'
    assert 'pairtree_root' in path, f"{path!r} does not contain 'pairtree_root'"

    pairtree_dir, filename = os.path.split(path)
    # Slice by explicit length: `filename[:-0]` would be empty if ef_ext == ''.
    stem = filename[:len(filename) - len(ef_ext)]
    lib_id, clean_volid = stem.split('.', 1)

    # Locate "<lib_id>/pairtree_root/" on a path-component boundary, so that
    # e.g. "myloc/pairtree_root/" is not mistaken for library "loc".
    marker = lib_id + os.sep + 'pairtree_root' + os.sep
    if pairtree_dir.startswith(marker):
        root = ''
    else:
        idx = pairtree_dir.find(os.sep + marker)
        if idx < 0:
            # An unchecked -1 would be used as a slice end and silently
            # produce a wrong path, so fail loudly instead.
            raise ValueError(
                f'{path!r} has no {marker!r} directory matching its file name'
            )
        # Keep the separator that precedes the library directory.
        root = pairtree_dir[:idx + 1]

    stubby_path = os.path.join(lib_id, clean_volid[::3])
    return os.path.join(root, stubby_path, filename)
# Round-trip self-check: the same volume expressed in both layouts.
# `s` is the stubbytree form, `p` the pairtree form.
s = 'loc/a+30795/loc.ark+=13960=t70v90g5f.json.bz2'
p = 'loc/pairtree_root/ar/k+/=1/39/60/=t/70/v9/0g/5f/ark+=13960=t70v90g5f/loc.ark+=13960=t70v90g5f.json.bz2'

# stubbytree -> pairtree must reproduce `p`.
pair = stubby_to_pairtree(s)
print(pair)
assert p == pair

# pairtree -> stubbytree must reproduce `s`.
stubby = pairtree_to_stubby(p)
print(stubby)
assert s == stubby
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment