Skip to content

Instantly share code, notes, and snippets.

@jacopofar
Created June 11, 2017 15:54
Show Gist options
  • Save jacopofar/a190eaa4e2d03ed572d9e22c1d2da6a0 to your computer and use it in GitHub Desktop.
Save jacopofar/a190eaa4e2d03ed572d9e22c1d2da6a0 to your computer and use it in GitHub Desktop.
Process the OpenSubtitles2012 dataset and produce a tab-separated CSV file (subs.csv)
import os
import gzip
import xml.sax
import csv
subs_dir = 'OpenSubtitles2012'
OUTPUT_FILE = 'subs.csv'
FIELDNAMES = ['movie_id', 'year', 'utterance_id', 'text']


class SubtitleFileHandler(xml.sax.ContentHandler):
    """SAX handler that writes one row per non-blank text node.

    Each row carries the movie id and year given at construction time, the
    ``id`` attribute of the most recently opened ``<s>`` element, and the
    stripped text.

    Args:
        writer: object with a ``writerow(dict)`` method (a ``csv.DictWriter``).
        movie_id: value stored in the ``movie_id`` column.
        year: value stored in the ``year`` column.
    """

    def __init__(self, writer, movie_id, year):
        super().__init__()
        self.writer = writer
        self.movie_id = movie_id
        self.year = year
        self.current_id = '0'
        # SAX is allowed to deliver a single text node in several
        # characters() calls (e.g. around entity references), so the pieces
        # are buffered here and only written at the next element boundary.
        self._chunks = []

    def characters(self, content):
        """Buffer a piece of character data until the next element boundary."""
        self._chunks.append(content)

    def startElement(self, name, attrs):
        """Flush pending text, then remember the id of a new ``<s>`` element.

        Raises:
            KeyError: if an ``<s>`` element has no ``id`` attribute.
        """
        # Flush first: text seen before this tag belongs to the previous id.
        self._flush_text()
        if name == 's':
            self.current_id = attrs['id']

    def endElement(self, name):
        """Flush the text accumulated inside the element being closed."""
        self._flush_text()

    def _flush_text(self):
        """Write the buffered text as one row, unless it is only whitespace."""
        text = ''.join(self._chunks).strip()
        self._chunks.clear()
        if text:
            self.writer.writerow({
                'movie_id': self.movie_id,
                'year': self.year,
                'utterance_id': self.current_id,
                'text': text,
            })


def _movie_id_and_year(directory):
    """Return ``(movie_id, year)`` taken from the last two path components."""
    directory = os.path.normpath(directory)
    movie_id = os.path.basename(directory)
    year = os.path.basename(os.path.dirname(directory))
    return movie_id, year


def _parse_subtitle_file(path, writer, movie_id, year):
    """Parse one gzip-compressed XML file and write its rows to *writer*."""
    with gzip.open(path) as gz_file:
        parser = xml.sax.make_parser()
        parser.setContentHandler(SubtitleFileHandler(writer, movie_id, year))
        parser.parse(gz_file)


def export_subtitles(source_dir=subs_dir, output_path=OUTPUT_FILE):
    """Walk *source_dir* and write every subtitle text node to a TSV file.

    Only one file per directory is processed (the first in sorted order,
    ignoring ``.DS_Store``); the others are reported and skipped. Files that
    fail to parse are reported and skipped as well.

    Args:
        source_dir: root of the tree, laid out as ``.../<year>/<movie_id>/<file>``.
        output_path: path of the tab-separated file to create or overwrite.
    """
    # newline='' is what the csv module expects (avoids '\r\r\n' on Windows);
    # an explicit encoding keeps non-ASCII subtitles from depending on the
    # platform default.
    with open(output_path, 'w', newline='', encoding='utf-8') as tsvfile:
        tsv_writer = csv.DictWriter(
            tsvfile, delimiter='\t', fieldnames=FIELDNAMES)
        tsv_writer.writeheader()

        # traverse root directory, and list directories as dirs and files as files
        for root, dirs, files in os.walk(source_dir):
            # os.walk order is arbitrary; sorting in place makes both the
            # traversal and the choice of "first" file reproducible.
            dirs.sort()
            # Drop .DS_Store before picking the first file, otherwise it can
            # take the first slot and the real subtitle would be skipped.
            candidates = sorted(
                name for name in files if '.DS_Store' not in name)
            if not candidates:
                continue

            # some movie have more than one subtitles file, keep the first one
            for extra in candidates[1:]:
                skipped = os.path.join(root, extra)
                print(f'skipping {skipped} because another file for the same movie was examined')

            current_file = os.path.join(root, candidates[0])
            print(current_file)
            movie_id, year = _movie_id_and_year(root)
            try:
                _parse_subtitle_file(current_file, tsv_writer, movie_id, year)
            except Exception as e:
                # some files seem to have errors. There are so few compared
                # to the total that I just skip them
                print(e)
                print(f'the error was in file {current_file}')


if __name__ == '__main__':
    export_subtitles()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment