Skip to content

Instantly share code, notes, and snippets.

@liquidcarbon
Created June 29, 2020 23:11
Show Gist options
  • Save liquidcarbon/ada19cad1b2edb19052accda06799c2d to your computer and use it in GitHub Desktop.
Save liquidcarbon/ada19cad1b2edb19052accda06799c2d to your computer and use it in GitHub Desktop.
Remove newline and carriage return characters to fix broken lines in a large data export
import sys
def fix(file, sep, nf, output):
    """Check and fix prematurely terminated lines in a tabular file.

    A record that was split across several physical lines (for example by a
    newline embedded in a field) is glued back together: consecutive short
    lines are concatenated until the accumulated text holds at least
    ``nf - 1`` separators. Blank lines are dropped. Lines that already hold
    at least ``nf - 1`` separators are passed through with carriage returns
    removed.

    :param file: path of the input file
    :param sep: delimiter string (the command-line entry point converts a
        numeric character code to a character before calling this function)
    :param nf: expected number of fields per record
    :param output: path of the output file
    :return: None
    """
    with open(file, 'r') as f:
        broken = f.readlines()

    repaired = []       # finished records, ready to be written out
    pending = ''        # fragments of the record currently being reassembled
    min_seps = nf - 1   # a complete record has at least this many separators

    for lineno, line in enumerate(broken, start=1):
        if line == '\n':
            continue
        # The fragments gathered so far already form a full record: emit it
        # before deciding what to do with the current line.
        if pending.count(sep) >= min_seps:
            repaired.append(pending + '\n')
            pending = ''
        cleaned = line.replace('\r', '')
        if line.count(sep) < min_seps:
            # Drop only a genuine trailing newline; the last line of a file
            # may not have one, and blindly slicing would eat a data character.
            if cleaned.endswith('\n'):
                cleaned = cleaned[:-1]
            pending += cleaned
        else:
            repaired.append(cleaned)
        if lineno % 50000 == 0:
            print(lineno)  # progress indicator for large inputs

    # Flush whatever is still being reassembled when the input ends;
    # otherwise the final repaired record would be silently lost.
    if pending:
        if repaired and not repaired[-1].endswith('\n'):
            repaired[-1] += '\n'
        repaired.append(pending + '\n')

    with open(output, 'w') as f:
        f.writelines(repaired)
def parse_sep(arg):
    """Turn the command-line delimiter argument into the delimiter string.

    An argument that parses as an integer accepted by ``chr`` is treated as a
    decimal character code (e.g. ``'9'`` gives a tab); anything else is used
    verbatim as the delimiter.

    :param arg: raw command-line argument
    :return: the delimiter to pass to ``fix``
    """
    try:
        return chr(int(arg))
    except (TypeError, ValueError, OverflowError):
        # int() raises ValueError for a non-numeric string such as ',' and
        # chr() raises ValueError/OverflowError for out-of-range codes.
        return arg


if __name__ == '__main__':
    # Arguments: INPUT_FILE SEPARATOR NUM_FIELDS OUTPUT_FILE
    file = sys.argv[1]
    sep = parse_sep(sys.argv[2])
    nf = int(sys.argv[3])
    output = sys.argv[4]
    fix(file, sep, nf, output)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment