Created
June 29, 2020 23:11
-
-
Save liquidcarbon/ada19cad1b2edb19052accda06799c2d to your computer and use it in GitHub Desktop.
Remove newline and carriage return characters to fix broken lines in a large data export
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import sys | |
def fix(file, sep, nf, output):
    """Check and repair prematurely terminated lines in a tabular file.

    A record is considered complete when it contains at least ``nf - 1``
    separators. Lines with fewer separators are treated as fragments of a
    record that was broken by a stray line break; consecutive fragments are
    concatenated until the accumulated text holds enough separators.
    Lines consisting only of a newline are skipped.

    :param file: path of the input file
    :param sep: delimiter string whose occurrences are counted on each line
    :param nf: expected number of fields per record
    :param output: path of the output file
    :return: None
    """
    min_seps = nf - 1
    repaired = []   # complete records, each terminated by '\n'
    pending = ''    # fragments of the record currently being stitched together

    with open(file, 'r') as f:
        for i, line in enumerate(f, start=1):
            if line == '\n':
                continue
            # The accumulated fragments already form a full record: emit it
            # before looking at the current line. The emptiness check avoids
            # writing spurious blank records when nf <= 1.
            if pending and pending.count(sep) >= min_seps:
                repaired.append(pending + '\n')
                pending = ''
            # remove carriage returns, check for the number of separators
            cleaned = line.replace('\r', '')
            if line.count(sep) < min_seps:
                # Strip only the line terminator; the last line of a file may
                # have none, and slicing [:-1] would eat a data character.
                pending += cleaned.rstrip('\n')
            else:
                repaired.append(cleaned)
            # progress indicator for large exports
            if i % 50000 == 0:
                print(i)

    # Flush whatever is still being accumulated at end of file; otherwise a
    # record repaired from the final fragments would be lost.
    if pending:
        repaired.append(pending + '\n')

    with open(output, 'w') as f:
        f.writelines(repaired)
if __name__ == '__main__':
    # Command line: INPUT_FILE SEPARATOR NUMBER_OF_FIELDS OUTPUT_FILE
    file = sys.argv[1]
    # SEPARATOR is either a numeric character code (parsed by int() in
    # base 10 and converted with chr()) or the literal delimiter itself.
    try:
        sep = chr(int(sys.argv[2]))
    except (TypeError, ValueError):
        # int() raises ValueError for non-numeric text such as ',' and chr()
        # raises ValueError for an out-of-range code; in both cases use the
        # argument verbatim as the delimiter.
        sep = sys.argv[2]
    nf = int(sys.argv[3])
    output = sys.argv[4]
    fix(file, sep, nf, output)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment