tkojima0107 · June 3, 2025 08:40
diff --git a/rename_cps.py b/rename_cps.py
 import json
 import re
 import sys
 import os
 from argparse import ArgumentParser,SUPPRESS
 from pathvalidate import sanitize_filename
 import shutil

 # Prefix that a CPS data.js file starts with; the JSON payload follows it.
 START_STR = "var webpub = {data: "

 # Description text passed to the argument parser and shown by --help.
 HELP_MSG="""
 The IEEE Conference Publishing Services (CPS) provides Proceedings in a webpub format which is fine for viewing in a browser. However, accessing them through any other means is inconvenient as the file names appear hashed. This script organizes the file names and directories to make them more readable.

 Prerequisites:
    This script should be run with Python 3.
    It uses a package named 'pathvalidate' for checking file names.
    Please install it by running: $ pip3 install pathvalidate.

 Usage:
    Execute this script in the directory where 'index.html' is located.
    Before running, create a destination directory for the output.
    Run the script using: $ python3 rename_cps.py data/data.js export
    The first argument specifies the JavaScript file containing the proceedings information, which is likely in 'data/data.js'.
    The second argument is the path to the created destination directory, 'export' in this example.
    The script will save the PDFs, organized by section, in the 'export' directory.
 """

 def parse():
    """Parse the command-line arguments of the script.

    Returns:
        argparse.Namespace with the attributes ``data`` (path to data.js),
        ``export`` (destination directory) and ``conf_index`` (int, or None
        when --conf-index is not given).
    """
    # Local import so this function does not depend on an edit to the
    # file-level import list.
    from argparse import RawDescriptionHelpFormatter

    # HELP_MSG is pre-formatted multi-line text. The default formatter would
    # re-wrap it into a single paragraph and lose the Prerequisites/Usage
    # layout, so keep the description verbatim.
    parser = ArgumentParser(description=HELP_MSG,
                            formatter_class=RawDescriptionHelpFormatter)
    parser.add_argument('data', type=str, help='path to data.js')
    parser.add_argument('export', help='export directory')
    # No default here on purpose: the caller distinguishes "not given" (None)
    # from an explicit index to decide whether to print a warning.
    parser.add_argument('--conf-index', type=int, help="Specify the index of the conference to process (default: 0). This is useful if the data file contains multiple conferences.")
    return parser.parse_args()

 def copyfile(out_dir, element):
    """Copy one proceedings item into ``out_dir`` under a readable name.

    Args:
        out_dir: existing directory that receives the copy.
        element: mapping whose "articleLocation" entry is the path of the
            source file and whose "text" entry is the title used (after
            sanitizing) as the destination file name.

    Raises:
        OSError: propagated from shutil.copyfile, e.g. when the source file
            does not exist.
    """
    src = element["articleLocation"]
    # sanitize_filename removes characters that are not valid in file names.
    file_name = sanitize_filename(element["text"]) + ".pdf"
    # NOTE: an item whose sanitized title equals an earlier one in the same
    # directory overwrites that earlier copy.
    shutil.copyfile(src, os.path.join(out_dir, file_name))

 if __name__ == '__main__':
    # Entry point: read the CPS data file, select one conference, and copy
    # its PDFs into one directory per group under the export directory.
    args = parse()

    # open data file
    try:
        # Decode explicitly instead of relying on the platform default
        # (assumes data.js is UTF-8, as is usual for web assets -- confirm).
        with open(args.data, "r", encoding="utf-8") as f:
            json_str = f.read()
    except FileNotFoundError:
        print("No such file:", args.data)
        sys.exit(1)
    except PermissionError:
        print("Permission Error, Cannot open:", args.data)
        sys.exit(1)

    # check export dir
    if not os.path.exists(args.export):
        print("No such directory:", args.export)
        sys.exit(1)
    if not os.path.isdir(args.export):
        print("Not directory:", args.export)
        sys.exit(1)
    if not os.access(args.export, os.W_OK):
        print("Not writable:", args.export)
        sys.exit(1)

    # check data: the file must begin with the JavaScript assignment prefix.
    # A plain string test avoids interpreting START_STR as a regex pattern.
    if not json_str.startswith(START_STR):
        print(args.data, "is not valid data file of CPS proceedings")
        sys.exit(1)

    # make readable as json: drop the prefix and the closing "};"
    json_str = json_str[len(START_STR):].rstrip()
    if json_str.endswith("};"):
        json_str = json_str[:-2]

    # load json
    try:
        data = json.loads(json_str)
    except json.JSONDecodeError as err:
        print(args.data, "is not valid data file of CPS proceedings:", err)
        sys.exit(1)

    conferences = data["conferences"]
    # Warn only when the first conference is picked implicitly.
    if len(conferences) > 1 and args.conf_index is None:
        print("Warning: the data contains multiple conference data")
        print("Defaulting to the first conference data.")
        print("Please specify the index of the conference to process using --conf-index")
    conf_index = args.conf_index if args.conf_index is not None else 0
    try:
        proc = conferences[conf_index]
    except IndexError:
        print(f"Invalid --conf-index {conf_index}: the data contains {len(conferences)} conference(s)")
        sys.exit(1)

    print("Title:", proc["title"])

    # Collect (directory name, items) pairs in output order:
    # frontMatter, backMatter, then each section.
    groups = []
    for key in ("frontMatter", "backMatter"):
        if key in proc:
            groups.append((key, proc[key]))
    # Sections are handled whenever present, independent of backMatter.
    for sec in proc.get("sections", []):
        groups.append((sanitize_filename(sec["title"]), sec["lineItems"]))

    for name, items in groups:
        dir_name = os.path.join(args.export, name)
        # An existing directory is reused so the script can be run again;
        # anything else at that path is an error.
        if not os.path.exists(dir_name):
            os.mkdir(dir_name)
        elif not os.path.isdir(dir_name):
            print(f"Error: {dir_name} is already exist but not a directory.")
            sys.exit(1)
        for item in items:
            copyfile(dir_name, item)
	import json
	import re
	import sys
	import os
	from argparse import ArgumentParser,SUPPRESS
	from pathvalidate import sanitize_filename
	import shutil

	# Prefix that a CPS data.js file starts with; the JSON payload follows it.
	START_STR = "var webpub = {data: "

	# Description text passed to the argument parser and shown by --help.
	HELP_MSG="""
	The IEEE Conference Publishing Services (CPS) provides Proceedings in a webpub format which is fine for viewing in a browser. However, accessing them through any other means is inconvenient as the file names appear hashed. This script organizes the file names and directories to make them more readable.

	Prerequisites:
	    This script should be run with Python 3.
	    It uses a package named 'pathvalidate' for checking file names.
	    Please install it by running: $ pip3 install pathvalidate.

	Usage:
	    Execute this script in the directory where 'index.html' is located.
	    Before running, create a destination directory for the output.
	    Run the script using: $ python3 rename_cps.py data/data.js export
	    The first argument specifies the JavaScript file containing the proceedings information, which is likely in 'data/data.js'.
	    The second argument is the path to the created destination directory, 'export' in this example.
	    The script will save the PDFs, organized by section, in the 'export' directory.
	"""

	def parse():
	    """Parse the command-line arguments of the script.

	    Returns:
	        argparse.Namespace with the attributes ``data`` (path to data.js),
	        ``export`` (destination directory) and ``conf_index`` (int, or
	        None when --conf-index is not given).
	    """
	    # Local import so this function does not depend on an edit to the
	    # file-level import list.
	    from argparse import RawDescriptionHelpFormatter

	    # HELP_MSG is multi-line text; keep it verbatim instead of letting the
	    # default formatter re-wrap it into a single paragraph.
	    parser = ArgumentParser(description=HELP_MSG,
	                            formatter_class=RawDescriptionHelpFormatter)
	    parser.add_argument('data', type=str, help='path to data.js')
	    parser.add_argument('export', help='export directory')
	    # No default on purpose: None means "option not given" to the caller.
	    parser.add_argument('--conf-index', type=int, help="Specify the index of the conference to process (default: 0). This is useful if the data file contains multiple conferences.")
	    return parser.parse_args()

	def copyfile(out_dir, element):
	    """Copy one proceedings item into ``out_dir`` under a readable name.

	    Args:
	        out_dir: existing directory that receives the copy.
	        element: mapping whose "articleLocation" entry is the path of the
	            source file and whose "text" entry is the title used (after
	            sanitizing) as the destination file name.

	    Raises:
	        OSError: propagated from shutil.copyfile, e.g. when the source
	            file does not exist.
	    """
	    src = element["articleLocation"]
	    # sanitize_filename removes characters that are not valid in file names.
	    file_name = sanitize_filename(element["text"]) + ".pdf"
	    shutil.copyfile(src, os.path.join(out_dir, file_name))

	if __name__ == '__main__':
	    # Entry point: read the CPS data file, select one conference, and copy
	    # its PDFs into one directory per group under the export directory.
	    args = parse()

	    # open data file
	    try:
	        # Decode explicitly instead of relying on the platform default
	        # (assumes data.js is UTF-8, as is usual for web assets -- confirm).
	        with open(args.data, "r", encoding="utf-8") as f:
	            json_str = f.read()
	    except FileNotFoundError:
	        print("No such file:", args.data)
	        sys.exit(1)
	    except PermissionError:
	        print("Permission Error, Cannot open:", args.data)
	        sys.exit(1)

	    # check export dir
	    if not os.path.exists(args.export):
	        print("No such directory:", args.export)
	        sys.exit(1)
	    if not os.path.isdir(args.export):
	        print("Not directory:", args.export)
	        sys.exit(1)
	    if not os.access(args.export, os.W_OK):
	        print("Not writable:", args.export)
	        sys.exit(1)

	    # check data: the file must begin with the JavaScript assignment prefix.
	    # A plain string test avoids interpreting START_STR as a regex pattern.
	    if not json_str.startswith(START_STR):
	        print(args.data, "is not valid data file of CPS proceedings")
	        sys.exit(1)

	    # make readable as json: drop the prefix and the closing "};"
	    json_str = json_str[len(START_STR):].rstrip()
	    if json_str.endswith("};"):
	        json_str = json_str[:-2]

	    # load json
	    try:
	        data = json.loads(json_str)
	    except json.JSONDecodeError as err:
	        print(args.data, "is not valid data file of CPS proceedings:", err)
	        sys.exit(1)

	    conferences = data["conferences"]
	    # Warn only when the first conference is picked implicitly.
	    if len(conferences) > 1 and args.conf_index is None:
	        print("Warning: the data contains multiple conference data")
	        print("Defaulting to the first conference data.")
	        print("Please specify the index of the conference to process using --conf-index")
	    conf_index = args.conf_index if args.conf_index is not None else 0
	    try:
	        proc = conferences[conf_index]
	    except IndexError:
	        print(f"Invalid --conf-index {conf_index}: the data contains {len(conferences)} conference(s)")
	        sys.exit(1)

	    print("Title:", proc["title"])

	    # Collect (directory name, items) pairs in output order:
	    # frontMatter, backMatter, then each section.
	    groups = []
	    for key in ("frontMatter", "backMatter"):
	        if key in proc:
	            groups.append((key, proc[key]))
	    # Sections are handled whenever present, independent of backMatter.
	    for sec in proc.get("sections", []):
	        groups.append((sanitize_filename(sec["title"]), sec["lineItems"]))

	    for name, items in groups:
	        dir_name = os.path.join(args.export, name)
	        # An existing directory is reused so the script can be run again;
	        # anything else at that path is an error.
	        if not os.path.exists(dir_name):
	            os.mkdir(dir_name)
	        elif not os.path.isdir(dir_name):
	            print(f"Error: {dir_name} is already exist but not a directory.")
	            sys.exit(1)
	        for item in items:
	            copyfile(dir_name, item)
No results found