Skip to content

Instantly share code, notes, and snippets.

@tkojima0107
Last active June 3, 2025 08:40
Show Gist options
  • Select an option

  • Save tkojima0107/c8ad6e500da2d2fcb9265f6ecb856adb to your computer and use it in GitHub Desktop.

Select an option

Save tkojima0107/c8ad6e500da2d2fcb9265f6ecb856adb to your computer and use it in GitHub Desktop.
A file rename script for IEEE CPS proceedings
import json
import re
import sys
import os
from argparse import ArgumentParser,SUPPRESS
from pathvalidate import sanitize_filename
import shutil
# Literal prefix that the data file must start with; the main script checks
# for it and strips it to expose the JSON payload that follows.
START_STR = "var webpub = {data: "
# Text passed to ArgumentParser as the program description (shown by --help).
HELP_MSG="""
The IEEE Conference Publishing Services (CPS) provides Proceedings in a webpub format which is fine for viewing in a browser. However, accessing them through any other means is inconvenient as the file names appear hashed. This script organizes the file names and directories to make them more readable.
Prerequisites:
This script should be run with Python 3.
It uses a package named 'pathvalidate' for checking file names.
Please install it by running: $ pip3 install pathvalidate.
Usage:
Execute this script in the directory where 'index.html' is located.
Before running, create a destination directory for the output.
Run the script using: $ python3 rename.py data/data.js export
The first argument specifies the JavaScript file containing the proceedings information, which is likely in 'data/data.js'.
The second argument is the path to the created destination directory, 'export' in this example.
The script will save the PDFs, organized by section, in the 'export' directory.
"""
def parse():
    """Parse the command-line arguments of the rename script.

    Returns:
        argparse.Namespace with:
          * ``data``       -- path to the ``data.js`` file (str)
          * ``export``     -- path to the destination directory
          * ``conf_index`` -- int index of the conference to process, or
            ``None`` when ``--conf-index`` was not given (the caller then
            falls back to index 0).
    """
    # Imported locally so this function does not depend on extra file-level
    # imports. HELP_MSG is a pre-formatted, multi-line text; the default
    # formatter would re-wrap it into a single paragraph and lose the
    # "Prerequisites:" / "Usage:" layout.
    from argparse import RawDescriptionHelpFormatter

    parser = ArgumentParser(
        description=HELP_MSG,
        formatter_class=RawDescriptionHelpFormatter,
    )
    parser.add_argument('data', type=str, help='path to data.js')
    parser.add_argument('export', help='export directory')
    parser.add_argument('--conf-index', type=int, help="Specify the index of the conference to process (default: 0). This is useful if the data file contains multiple conferences.")
    return parser.parse_args()
def copyfile(out_dir, element):
    """Copy the PDF of one proceedings entry into ``out_dir``.

    Args:
        out_dir: Destination directory path (string).
        element: Mapping describing the entry. ``element["articleLocation"]``
            is the source file path and ``element["text"]`` is used, after
            sanitizing, as the destination file name (``.pdf`` is appended).
    """
    source_pdf = element["articleLocation"]
    pdf_name = sanitize_filename(element["text"]) + ".pdf"
    target_pdf = "/".join((out_dir, pdf_name))
    shutil.copyfile(source_pdf, target_pdf)
def _export_group(dir_name, items):
    """Copy every entry of ``items`` into ``dir_name``, creating it if needed.

    An already existing directory is reused so the script can be re-run.
    Exits with status 1 when ``dir_name`` exists but is not a directory.
    """
    if not os.path.exists(dir_name):
        os.mkdir(dir_name)
    elif not os.path.isdir(dir_name):
        print(f"Error: {dir_name} is already exist but not a directory.")
        sys.exit(1)
    for item in items:
        copyfile(dir_name, item)


if __name__ == '__main__':
    args = parse()

    # open data file
    try:
        # JSON payloads are UTF-8; do not rely on the platform default.
        with open(args.data, "r", encoding="utf-8") as f:
            json_str = f.read()
    except FileNotFoundError:
        print("No such file:", args.data)
        sys.exit(1)
    except PermissionError:
        print("Permission Error, Cannot open:", args.data)
        sys.exit(1)

    # check export dir
    if not os.path.exists(args.export):
        print("No such directory:", args.export)
        sys.exit(1)
    if not os.path.isdir(args.export):
        print("Not directory:", args.export)
        sys.exit(1)
    if not os.access(args.export, os.W_OK):
        print("Not writable:", args.export)
        sys.exit(1)

    # check data: the file must begin with the literal JavaScript prefix.
    # A plain string comparison is used because START_STR contains "{",
    # which is a regex metacharacter.
    if not json_str.startswith(START_STR):
        print(args.data, "is not valid data file of CPS proceedings")
        sys.exit(1)

    # make readable as json: drop the prefix and the closing "};" of the
    # JavaScript assignment (tolerating trailing whitespace/newlines).
    payload = json_str[len(START_STR):].rstrip()
    if payload.endswith("};"):
        payload = payload[:-2]

    # load json
    try:
        data = json.loads(payload)
    except json.JSONDecodeError:
        print(args.data, "is not valid data file of CPS proceedings")
        sys.exit(1)

    conferences = data["conferences"]
    if len(conferences) > 1 and args.conf_index is None:
        print("Warning: the data contains multiple conference data")
        print("Defaulting to the first conference data.")
        print("Please specify the index of the conference to process using --conf-index")
        print("This script use the first conference data")

    conf_index = args.conf_index if args.conf_index is not None else 0
    try:
        proc = conferences[conf_index]
    except IndexError:
        print(f"Error: --conf-index {conf_index} is out of range "
              f"(the data contains {len(conferences)} conference(s)).")
        sys.exit(1)

    title = proc["title"]
    print("Title:", title)

    # frontMatter / backMatter
    for part in ("frontMatter", "backMatter"):
        if part in proc:
            _export_group(os.path.join(args.export, part), proc[part])

    # each sections (guarded by the "sections" key itself, not "backMatter")
    for sec in proc.get("sections", []):
        sec_title = sanitize_filename(sec["title"])
        _export_group(os.path.join(args.export, sec_title), sec["lineItems"])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment