Last active
January 3, 2018 20:14
-
-
Save johnludwigm/205ebd6a9e556baa0f6b2462dd380ef0 to your computer and use it in GitHub Desktop.
Explains and solves IMDb tsv file problem.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import csv | |
import sys | |
def getmaxsize():
    """Return the csv module's current maximum field size.

    ``csv.field_size_limit()`` is called without an argument so the limit is
    only reported, never modified.  Passing ``sys.maxsize`` (as this function
    used to) replaced the limit as a hidden side effect and can raise
    ``OverflowError`` on platforms where a C long is only 32 bits wide.

    Returns:
        The field size limit currently in effect, as an int.
    """
    return csv.field_size_limit()
def setmaxsize(size=500 * 1024 * 1024):
    """Set the csv module's maximum field size to ``size``.

    The default is 500 * 1024 * 1024 (500 MiB).  The previous limit that
    ``csv.field_size_limit`` hands back is discarded; nothing is returned.
    """
    csv.field_size_limit(size)
def fixrow(row, delimiter="\t"):
    """Yield the entries of ``row``, re-splitting any that still contain ``delimiter``.

    Intended for rows that did not split properly, i.e. where one entry
    still holds several columns joined by the delimiter.

    Args:
        row: A list or tuple of entries; each is a string or None.
        delimiter: The separator to split on.  It is now honoured; previously
            a hard-coded tab was used regardless of this argument.

    Yields:
        Every piece of every entry, in order.  Entries without a ``split``
        method (such as None) are yielded unchanged.
    """
    for piece in row:
        try:
            parts = piece.split(delimiter)
        except AttributeError:
            # Can't split None; pass it through untouched.
            yield piece
        else:
            yield from parts
def itertsv(inputtsv, delimiter="\t", encoding="utf-8", newline="",
            nullchar="\\N", numentries=9, omitheader=True):
    """Iterate over a tsv file, yielding each row as a tuple.

    Entries equal to ``nullchar`` are replaced by None.  A row whose length
    differs from ``numentries`` is assumed to contain entries that
    ``csv.reader`` failed to split, so each of its entries is split again on
    ``delimiter`` before being yielded.

    Args:
        inputtsv: Path of the file to read.
        delimiter: Column separator, used both by ``csv.reader`` and when
            re-splitting malformed rows.
        encoding: Text encoding passed to ``open``.
        newline: ``newline`` argument passed to ``open``.
        nullchar: Marker string that stands for a missing value.
        numentries: Number of columns a well-formed row has.
        omitheader: If true, the first row is skipped.

    Yields:
        One tuple of str/None per data row.
    """
    with open(inputtsv, "r", encoding=encoding, newline=newline) as tsvread:
        reader = csv.reader(tsvread, delimiter=delimiter)
        if omitheader:
            # Supply a default: on an empty file a bare next() raises
            # StopIteration, which inside a generator turns into RuntimeError.
            next(reader, None)
        for row in reader:
            if len(row) != numentries:
                # Entries coming from csv.reader are always str, so they can
                # be split directly.  Splitting happens before the null
                # substitution so that a null marker hidden inside a merged
                # entry is converted as well.
                row = [piece for entry in row for piece in entry.split(delimiter)]
            yield tuple(None if entry == nullchar else entry for entry in row)
def findbadtsv(inputtsv, delimiter="\t", encoding="utf-8", newline="",
               nullchar="\\N", numentries=9, omitheader=True):
    """Yield only the rows that do not have exactly ``numentries`` entries.

    Rows are yielded as ``csv.reader`` produced them (no re-splitting), with
    entries equal to ``nullchar`` replaced by None, so the malformed data can
    be inspected.

    Args:
        inputtsv: Path of the file to read.
        delimiter: Column separator handed to ``csv.reader``.
        encoding: Text encoding passed to ``open``.
        newline: ``newline`` argument passed to ``open``.
        nullchar: Marker string that stands for a missing value.
        numentries: Number of columns a well-formed row has.
        omitheader: If true, the first row is skipped.

    Yields:
        One tuple of str/None per row whose length is not ``numentries``.
    """
    with open(inputtsv, "r", encoding=encoding, newline=newline) as tsvread:
        reader = csv.reader(tsvread, delimiter=delimiter)
        if omitheader:
            # Supply a default: on an empty file a bare next() raises
            # StopIteration, which inside a generator turns into RuntimeError.
            next(reader, None)
        for row in reader:
            if len(row) == numentries:
                continue
            yield tuple(None if entry == nullchar else entry for entry in row)
if __name__ == "__main__":
    # Raise the csv field size limit before reading, then print every row
    # that does not have the expected number of columns.
    # findbadtsv is the function defined in this file that yields malformed
    # rows; the name previously used here, baditertsv, is not defined.
    setmaxsize()
    for row in findbadtsv("/.../title.basics.tsv.gz.csv"):
        print(row)
""" | |
The troublemakers were: | |
('tt2347742', 'tvEpisode', 'No sufras por la alergia esta primavera\tNo sufras por la alergia esta primavera', '0', '2004', 'NULL', 'NULL', 'NULL') | |
('tt2409954', 'tvEpisode', 'A Corrida:\tA Corrida:', '0', '2011', 'NULL', 'NULL', 'Comedy') | |
('tt7321230', 'tvEpisode', "1st Guest Bedroom Takes It's Toll\t1st Guest Bedroom Takes It's Toll", '0', '2017', 'NULL', 'NULL', 'Game-Show,Reality-TV') | |
('tt7406458', 'tvEpisode', "Happy New Year! This New Year's Tournament Is Where We Get Serious!\tHappy New Year! This New Year's Tournament Is Where We Get Serious!", '0', '2016', 'NULL', 'NULL', 'Action,Adventure,Animation') | |
Calling itertsv on the file yields corrected rows. | |
""" |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment