Last active
March 2, 2021 12:20
-
-
Save jRimbault/6dd409a92a99e0c4f4b724e2bfcca384 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
import argparse | |
import csv | |
import os | |
import sys | |
from pathlib import Path | |
def main(args):
    """Scan a directory tree for suspicious CSV lines and write a report.

    Every file with a ``.csv`` suffix found below ``args.directory`` is handed
    to ``collect_errors``; each ``(line_number, line)`` pair it yields becomes
    one ``filename;line;content`` row of the report written to ``args.output``.

    Args:
        args: namespace providing ``directory`` (root of the scan) and
            ``output`` (path of the report file to create or overwrite).
    """
    # The walk below is lazy and only starts once the report has been opened
    # for writing. Without this exclusion a report living inside the scanned
    # tree (e.g. the default ``results.csv`` when scanning the working
    # directory) would be read back while it is still being written.
    report = Path(args.output).resolve()

    candidates = (
        Path(root, name)
        for root, _, names in os.walk(args.directory)
        for name in names
    )
    csv_walker = (
        path.absolute()
        for path in candidates
        if path.suffix == ".csv" and path.resolve() != report
    )
    errors = (
        (path, line_number, line)
        for path in csv_walker
        for line_number, line in collect_errors(path)
    )
    # newline="" lets the csv module control line endings itself.
    with open(args.output, "w", newline="") as out:
        writer = csv.writer(
            out, dialect="excel", delimiter=";", quoting=csv.QUOTE_NONNUMERIC
        )
        writer.writerow(("filename", "line", "content"))
        writer.writerows(errors)
def collect_errors(path):
    """Yield ``(line_number, text)`` for each data line containing ``?``.

    Line numbers are 1-based and count the header, which is the first line
    of the file and is never reported. The yielded text has its trailing
    whitespace (including the line ending) removed.
    """
    with open(path) as handle:
        numbered = enumerate(handle, start=1)
        next(numbered, None)  # discard the header line, if there is one
        for number, text in numbered:
            if "?" in text:
                yield number, text.rstrip()
def parse_args(argv):
    """Parse the command line of the error-collecting script.

    Args:
        argv: argument strings, without the program name.

    Returns:
        An ``argparse.Namespace`` with ``directory`` (defaults to the current
        directory) and ``output`` (defaults to ``results.csv``).
    """
    parser = argparse.ArgumentParser()
    # A positional only honours ``default`` when it is made optional with
    # nargs="?"; without it argparse requires the argument and the default
    # is never used.
    parser.add_argument(
        "directory", help="directory with the CSVs", default=os.curdir, nargs="?"
    )
    parser.add_argument("output", help="output file", default="results.csv", nargs="?")
    return parser.parse_args(argv)
# Script entry point: parse the command line (minus the program name) and run.
if __name__ == "__main__":
    cli_args = parse_args(sys.argv[1:])
    main(cli_args)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
import argparse | |
import csv | |
import sys | |
from collections import defaultdict | |
from pathlib import Path | |
def main(args):
    """Apply every correction listed in ``args.fixed_csv``.

    The corrections are grouped per file by ``get_corrections``; each group
    is then handed to ``fix_csv`` together with the file it belongs to.
    """
    per_file = get_corrections(args.fixed_csv)
    for target in per_file:
        fix_csv(target, per_file[target])
def get_corrections(path):
    """Read the corrections file and group its rows per target file.

    The file is a ``;``-delimited CSV with a header row providing the
    ``filename``, ``line`` and ``content`` columns.

    Args:
        path: location of the corrections CSV.

    Returns:
        Whatever ``make_corrections_bag`` builds from the
        ``(filename, line, content)`` triples read from the file.
    """
    # The csv module expects files opened with newline="" so that it can
    # handle line endings (including ones embedded in quoted fields) itself.
    with open(path, newline="") as file:
        reader = csv.DictReader(
            file, dialect="excel", delimiter=";", quoting=csv.QUOTE_NONNUMERIC
        )
        # With QUOTE_NONNUMERIC the reader turns unquoted fields into floats,
        # hence the explicit int() on the line number.
        return make_corrections_bag(
            (row["filename"], int(row["line"]), row["content"]) for row in reader
        )
def make_corrections_bag(iterable):
    """Group ``(file, line_number, line)`` triples by file.

    Returns a ``defaultdict`` mapping each file to a dict of
    ``{line_number - 1: line}``, i.e. keyed by 0-based line index. When the
    same file and line number occur more than once, the last triple wins.
    """
    grouped = defaultdict(dict)
    for filename, number, text in iterable:
        replacements = grouped[filename]
        index = number - 1  # 1-based line number -> 0-based index
        replacements[index] = text
    return grouped
def fix_csv(path, fixes):
    """Write a corrected copy of ``path`` next to it.

    The copy has the same name with its suffix replaced by ``.fixed.csv``.
    Each line whose 0-based index is a key of ``fixes`` is replaced by the
    mapped text; all other lines are copied. Trailing whitespace is stripped
    from every line written, replaced or not.
    """
    source = Path(path)
    target = source.with_suffix(".fixed.csv")
    print(f"Fixing {source.name} in {target.name}")
    with open(source) as original, open(target, "w") as corrected:
        for index, text in enumerate(original):
            chosen = fixes.get(index, text)
            corrected.write(chosen.rstrip() + "\n")
def parse_args(argv):
    """Parse the command line of the correction-applying script.

    ``argv`` holds the argument strings without the program name; the result
    is a namespace with a single required ``fixed_csv`` attribute.
    """
    cli = argparse.ArgumentParser()
    cli.add_argument("fixed_csv", help="file with the correction")
    namespace = cli.parse_args(argv)
    return namespace
# Script entry point: parse the command line (minus the program name) and run.
if __name__ == "__main__":
    cli_args = parse_args(sys.argv[1:])
    main(cli_args)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment