Created
January 17, 2022 16:17
-
-
Save martinpi/52fe006e33adc06ab233b4d300dcb2ec to your computer and use it in GitHub Desktop.
A cleaning script for Gutenberg texts. Cleans a whole folder full of text files. Pretty rough around the edges but seems to work.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import getopt
import os
import sys
from pathlib import Path

from gutenberg_cleaner import *
# getopt option specs.  A trailing ":" (short form) or "=" (long form) marks
# an option that requires a value; without the "=" the long forms --input and
# --output would not accept a value.
SHORT_OPTIONS = "hi:o:s"
LONG_OPTIONS = ["help", "input=", "output=", "strip"]

USAGE = (
    "usage: [-h] [-i INPUT_DIR] [-o OUTPUT_FILE] [-s]\n"
    "  -h, --help         show this help and exit\n"
    "  -i, --input DIR    folder containing the .txt files (default: .)\n"
    "  -o, --output FILE  file to write the cleaned text to "
    "(default: output.txt)\n"
    "  -s, --strip        join the lines of a paragraph into one line"
)


def parse_arguments(argument_list):
    """Parse the command-line arguments of the cleaning script.

    Args:
        argument_list: The arguments without the program name
            (i.e. ``sys.argv[1:]``).

    Returns:
        A tuple ``(input_dir, output_filename, strip, show_help)``.

    Raises:
        getopt.GetoptError: If an unknown option is given or a required
            option value is missing.
    """
    input_dir = "."
    output_filename = "output.txt"
    strip = False
    show_help = False

    arguments, _values = getopt.getopt(
        argument_list, SHORT_OPTIONS, LONG_OPTIONS
    )
    for current_argument, current_value in arguments:
        if current_argument in ("-h", "--help"):
            show_help = True
        elif current_argument in ("-i", "--input"):
            print("Reading from:", current_value)
            input_dir = current_value
        elif current_argument in ("-o", "--output"):
            print("Writing to: ", current_value)
            output_filename = current_value
        elif current_argument in ("-s", "--strip"):
            print("Stripping newlines")
            strip = True

    return input_dir, output_filename, strip, show_help


def format_cleaned_text(cleaned_content, strip):
    """Turn the cleaned text of one book into the text to be written out.

    Lines starting with ``[deleted]`` are dropped (presumably the marker
    the cleaner puts on removed lines -- confirm against gutenberg_cleaner).

    Args:
        cleaned_content: The cleaned text of one book.
        strip: If true, trailing whitespace is removed from each line and
            non-empty lines are joined with a single space, while an empty
            line becomes a newline.  If false, every kept line is emitted
            unchanged, followed by a newline.

    Returns:
        The formatted text.
    """
    pieces = []
    for line in cleaned_content.splitlines():
        if line.startswith('[deleted]'):
            continue
        if not strip:
            pieces.append(line + "\n")
            continue
        stripped = line.rstrip()
        # An empty line marks a paragraph break; anything else is joined
        # to the following line with a space.
        pieces.append(stripped + " " if stripped else "\n")
    # Build the result once instead of growing a string line by line.
    return "".join(pieces)


def collect_cleaned_text(input_dir, strip):
    """Clean every ``.txt`` file in *input_dir* and concatenate the results.

    Files are processed in sorted order so that the output is reproducible;
    ``Path.iterdir`` itself yields entries in arbitrary order.

    Args:
        input_dir: Folder to read the text files from.
        strip: Passed through to ``format_cleaned_text``.

    Returns:
        The formatted text of all files, concatenated.
    """
    pieces = []
    for entry in sorted(Path(input_dir).iterdir()):
        # Skip everything that is not a regular .txt file (including
        # directories that merely happen to be named "*.txt").
        if not entry.name.endswith(".txt") or not entry.is_file():
            continue
        print("Processing file: ", entry.name)
        # NOTE(review): the file is read with the platform default
        # encoding, as before -- confirm this matches the input texts.
        with entry.open('r') as input_file:
            # super_cleaner(book: str, min_token: int = 5, max_token: int = 600) -> str
            cleaned_content = super_cleaner(input_file.read())
        pieces.append(format_cleaned_text(cleaned_content, strip))
    return "".join(pieces)


def main(argument_list=None):
    """Run the cleaning script.

    Args:
        argument_list: Command-line arguments without the program name;
            defaults to ``sys.argv[1:]``.

    Returns:
        The process exit code: 0 on success, 2 on a command-line error.
    """
    if argument_list is None:
        argument_list = sys.argv[1:]

    try:
        input_dir, output_filename, strip, show_help = parse_arguments(
            argument_list
        )
    except getopt.GetoptError as err:
        # Report the problem and stop instead of silently carrying on
        # with the default settings.
        print(str(err), file=sys.stderr)
        print(USAGE, file=sys.stderr)
        return 2

    if show_help:
        print(USAGE)
        return 0

    final_content = collect_cleaned_text(input_dir, strip)
    with open(output_filename, "w") as output_file:
        output_file.write(final_content)
    return 0


if __name__ == "__main__":
    sys.exit(main())
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment