Created
December 23, 2024 04:55
-
-
Save mim-Armand/edec604c6b0cab54e3ccc4c451680a09 to your computer and use it in GitHub Desktop.
txt to word-per-line
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os
import sys
# Configurable variables: file names used when the corresponding
# command-line argument is not supplied.
DEFAULT_INPUT_FILE = "001_001.txt"
DEFAULT_OUTPUT_FILE = "unique_words_001_001.txt"
def extract_unique_words(input_file, output_file):
    """Write the unique, lower-cased words of a text file, one per line.

    The input is split on whitespace; each token has leading and trailing
    punctuation (``,.!?"'()[]{}``) stripped and is lower-cased. Tokens that
    are empty after stripping (i.e. consisted only of punctuation) are
    skipped. The remaining words are de-duplicated, sorted, and written to
    ``output_file`` separated by newlines (no trailing newline).

    Errors are reported on stdout rather than raised.

    Args:
        input_file: Path of the text file to read (decoded as UTF-8).
        output_file: Path of the file to write (encoded as UTF-8).
    """
    # Only the read is guarded by FileNotFoundError, so a missing output
    # directory is not misreported as a missing input file.
    try:
        with open(input_file, "r", encoding="utf-8") as f:
            text = f.read()
    except FileNotFoundError:
        print(f"Error: The file {input_file} does not exist.")
        return
    except Exception as e:
        print(f"An error occurred: {e}")
        return

    punctuation = ",.!?\"'()[]{}"
    normalized = (word.strip(punctuation).lower() for word in text.split())
    # Drop tokens that were pure punctuation; they strip down to "".
    unique_words = sorted({word for word in normalized if word})

    try:
        with open(output_file, "w", encoding="utf-8") as f:
            f.write("\n".join(unique_words))
    except Exception as e:
        print(f"An error occurred: {e}")
        return

    print(f"Unique words have been written to {output_file}")
if __name__ == "__main__":
    # Positional arguments override the defaults: the first is the input
    # path, the second is the output path. Either may be omitted.
    input_file = sys.argv[1] if len(sys.argv) > 1 else DEFAULT_INPUT_FILE
    output_file = sys.argv[2] if len(sys.argv) > 2 else DEFAULT_OUTPUT_FILE
    extract_unique_words(input_file, output_file)
# Run either way:
# python txt_to_wordsfile.py
# python txt_to_wordsfile.py path/to/input.txt path/to/output.txt
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment