Created
June 11, 2017 15:54
-
-
Save jacopofar/a190eaa4e2d03ed572d9e22c1d2da6a0 to your computer and use it in GitHub Desktop.
Process the OpenSubtitles2012 dataset and produce a tab-separated CSV file
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import csv
import gzip
import os
import xml.sax

# Convert a tree of gzipped subtitle XML files into one tab-separated file.
#
# The files are expected under ``<subs_dir>/.../<year>/<movie_id>/<file>``:
# the folder holding a file is used as the movie id and that folder's parent
# as the year.

# Root folder that is walked when the script is run directly.
subs_dir = 'OpenSubtitles2012'
# Output file written when the script is run directly.
output_path = 'subs.csv'
# Column order of the output file.
FIELDNAMES = ['movie_id', 'year', 'utterance_id', 'text']
# Value used when the movie id or year cannot be derived from the path.
UNKNOWN = '???'


class SubtitleFileHandler(xml.sax.ContentHandler):
    """SAX handler that writes one output row per non-blank text chunk.

    Args:
        writer: object with a ``writerow(dict)`` method (a ``csv.DictWriter``
            configured with ``FIELDNAMES``).
        movie_id: value stored in the ``movie_id`` column of every row.
        year: value stored in the ``year`` column of every row.
    """

    def __init__(self, writer, movie_id: str = UNKNOWN, year: str = UNKNOWN):
        super().__init__()
        self.writer = writer
        self.movie_id = movie_id
        self.year = year
        # Id of the most recently opened <s> element; '0' until one is seen.
        self.current_id = '0'

    def startElement(self, name, attrs):
        """Remember the ``id`` attribute of each ``<s>`` element."""
        if name == 's':
            self.current_id = attrs['id']

    def characters(self, content):
        """Write a row for *content* unless it is only whitespace."""
        text = content.strip()
        if text:
            self.writer.writerow({
                'movie_id': self.movie_id,
                'year': self.year,
                'utterance_id': self.current_id,
                'text': text,
            })


def _movie_and_year(directory: str):
    """Return ``(movie_id, year)`` derived from the last two path components."""
    directory = os.path.normpath(directory)
    movie_id = os.path.basename(directory) or UNKNOWN
    year = os.path.basename(os.path.dirname(directory)) or UNKNOWN
    return movie_id, year


def process_subtitle_file(path: str, writer, movie_id: str, year: str) -> None:
    """Parse one gzipped XML subtitle file and append its rows to *writer*.

    Parsing errors are reported on stdout and the file is abandoned; rows
    written before the error are kept.
    """
    with gzip.open(path) as gz_file:
        parser = xml.sax.make_parser()
        parser.setContentHandler(SubtitleFileHandler(writer, movie_id, year))
        try:
            parser.parse(gz_file)
        except Exception as e:
            # Deliberate best-effort: some files are broken, and they are so
            # few compared to the total that they are reported and skipped.
            print(e)
            print(f'the error was in file {path}')


def process_subtitles(root_dir: str, out_path: str) -> None:
    """Walk *root_dir* and write every subtitle text chunk to *out_path*.

    Only the first file (in sorted name order) of each directory is
    processed, because some movies have more than one subtitle file.
    ``.DS_Store`` entries are ignored and never count as that first file.
    """
    # newline='' is required by the csv module; UTF-8 keeps non-ASCII
    # subtitle text independent of the platform's default encoding.
    with open(out_path, 'w', newline='', encoding='utf-8') as tsvfile:
        tsv_writer = csv.DictWriter(
            tsvfile, delimiter='\t', fieldnames=FIELDNAMES)
        tsv_writer.writeheader()
        # Traverse the root directory; `files` are the plain files in `root`.
        for root, _dirs, files in os.walk(root_dir):
            candidates = sorted(f for f in files if '.DS_Store' not in f)
            movie_id, year = _movie_and_year(root)
            for index, file_name in enumerate(candidates):
                current_file = os.path.join(root, file_name)
                if index > 0:
                    print(f'skipping {current_file} because another file for the same movie was examined')
                    continue
                print(current_file)
                process_subtitle_file(current_file, tsv_writer, movie_id, year)


def main() -> None:
    """Convert the dataset under ``subs_dir`` into ``output_path``."""
    process_subtitles(subs_dir, output_path)


if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment