Skip to content

Instantly share code, notes, and snippets.

@johnludwigm
Last active January 3, 2018 20:14
Show Gist options
  • Save johnludwigm/205ebd6a9e556baa0f6b2462dd380ef0 to your computer and use it in GitHub Desktop.
Save johnludwigm/205ebd6a9e556baa0f6b2462dd380ef0 to your computer and use it in GitHub Desktop.
Explains and solves IMDb tsv file problem.
import csv
import sys
def getmaxsize():
    """Return the csv module's current maximum field size.

    Calling ``csv.field_size_limit()`` with no argument only reads the
    limit. Passing ``sys.maxsize`` (as was done previously) *sets* the limit
    as a side effect and can raise ``OverflowError`` on platforms where
    ``sys.maxsize`` does not fit in a C long.
    """
    return csv.field_size_limit()
def setmaxsize(size=500 * 1024 * 1024):
    """Set the csv module's maximum field size to ``size``.

    The default is 500 * 1024 * 1024. The previous limit reported by
    ``csv.field_size_limit`` is discarded; nothing is returned.
    """
    csv.field_size_limit(size)
def fixrow(row, delimiter="\t"):
    """Yield the entries of ``row``, re-splitting any that still contain ``delimiter``.

    Call with a list or tuple. Entries that have no ``split`` method (for
    example ``None``) are yielded unchanged.
    """
    for piece in row:
        try:
            # Honour the caller's delimiter instead of a hard-coded tab.
            parts = piece.split(delimiter)
        except AttributeError:
            # Can't split None (or any other non-string entry).
            yield piece
        else:
            yield from parts
def itertsv(inputtsv, delimiter="\t", encoding="utf-8", newline="",
            nullchar="\\N", numentries=9, omitheader=True):
    """Iterate over a tsv file, yielding each row as a tuple.

    Entries equal to ``nullchar`` are replaced with ``None``. A row whose
    length differs from ``numentries`` is first passed through ``fixrow`` so
    that entries still containing ``delimiter`` are split apart.

    Parameters:
        inputtsv: path of the file to read.
        delimiter: field separator given to ``csv.reader`` and ``fixrow``.
        encoding, newline: forwarded to ``open``.
        nullchar: text that marks a missing value.
        numentries: number of entries a well-formed row has.
        omitheader: when true, the first row is skipped.
    """
    with open(inputtsv, "r", encoding=encoding, newline=newline) as tsvread:
        reader = csv.reader(tsvread, delimiter=delimiter)
        if omitheader:
            # A default avoids StopIteration on an empty file, which would
            # surface as RuntimeError inside a generator.
            next(reader, None)
        for row in reader:
            if len(row) != numentries:
                row = fixrow(row, delimiter)
            # Substitute after any re-split so recovered pieces that equal
            # nullchar also become None.
            yield tuple(None if entry == nullchar else entry for entry in row)
def findbadtsv(inputtsv, delimiter="\t", encoding="utf-8", newline="",
               nullchar="\\N", numentries=9, omitheader=True):
    """Yield the rows that do not have exactly ``numentries`` entries.

    Each yielded row is a tuple in which entries equal to ``nullchar`` have
    been replaced with ``None``; the rows are otherwise left as the csv
    reader produced them.

    Parameters:
        inputtsv: path of the file to read.
        delimiter: field separator given to ``csv.reader``.
        encoding, newline: forwarded to ``open``.
        nullchar: text that marks a missing value.
        numentries: number of entries a well-formed row has.
        omitheader: when true, the first row is skipped.
    """
    with open(inputtsv, "r", encoding=encoding, newline=newline) as tsvread:
        reader = csv.reader(tsvread, delimiter=delimiter)
        if omitheader:
            # A default avoids StopIteration on an empty file, which would
            # surface as RuntimeError inside a generator.
            next(reader, None)
        for row in reader:
            if len(row) != numentries:
                yield tuple(None if entry == nullchar else entry for entry in row)
# Raise the csv field-size limit before reading, then print every row that
# does not have the expected number of entries.
setmaxsize()
# The generator defined in this file is findbadtsv; "baditertsv" does not
# exist and raised NameError.
for row in findbadtsv("/.../title.basics.tsv.gz.csv"):
    print(row)
"""
The troublemakers were:
('tt2347742', 'tvEpisode', 'No sufras por la alergia esta primavera\tNo sufras por la alergia esta primavera', '0', '2004', 'NULL', 'NULL', 'NULL')
('tt2409954', 'tvEpisode', 'A Corrida:\tA Corrida:', '0', '2011', 'NULL', 'NULL', 'Comedy')
('tt7321230', 'tvEpisode', "1st Guest Bedroom Takes It's Toll\t1st Guest Bedroom Takes It's Toll", '0', '2017', 'NULL', 'NULL', 'Game-Show,Reality-TV')
('tt7406458', 'tvEpisode', "Happy New Year! This New Year's Tournament Is Where We Get Serious!\tHappy New Year! This New Year's Tournament Is Where We Get Serious!", '0', '2016', 'NULL', 'NULL', 'Action,Adventure,Animation')
Calling itertsv on the file yields corrected rows.
"""
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment