Created
April 25, 2024 11:24
-
-
Save sagorbrur/8c8c16d29bb0806753505ad0c13ca049 to your computer and use it in GitHub Desktop.
Rename files to the Hugging Face (HF) shard naming format, e.g. `chunk-00001-of-00002.jsonl`
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Example input: chunk files named "<base>_<number>.<extension>".
file_names = ['chunk_1.jsonl', 'chunk_2.jsonl']
def convert_filenames(filenames):
    """Rename chunk files to the ``<base>-<index>-of-<total>.<ext>`` shard format.

    Each input name is expected to look like ``<base>_<number>.<ext>``. The
    trailing ``_<number>`` is dropped and replaced by the 1-based position of
    the name in ``filenames`` (not by the number embedded in the name), with
    both the position and the total count zero-padded to five digits.

    Args:
        filenames: Iterable of file name strings, in the desired shard order.

    Returns:
        A list of new file names, in the same order as the input.
    """
    # Materialize once so generators work and the total count is known.
    filenames = list(filenames)
    total_files = len(filenames)
    new_file_names = []
    for index, filename in enumerate(filenames, start=1):
        # Split on the LAST dot so only the final extension is carried over;
        # a name without any dot has no extension at all.
        stem, dot, extension = filename.rpartition('.')
        if not dot:
            stem = filename
        # Split on the LAST underscore so base names that themselves contain
        # underscores (e.g. "train_data_3") keep their full prefix; a stem
        # without an underscore is used unchanged as the base name.
        base_name, underscore, _ = stem.rpartition('_')
        if not underscore:
            base_name = stem
        suffix = f".{extension}" if dot else ""
        new_file_names.append(
            f"{base_name}-{index:05d}-of-{total_files:05d}{suffix}"
        )
    return new_file_names
# Convert the sample filenames and print the resulting shard-style names.
converted_file_names = convert_filenames(file_names)
print(converted_file_names)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment