jacopofar · June 11, 2017 15:54
diff --git a/subtitle_process.py b/subtitle_process.py
 import os
 import gzip
 import xml.sax
 import csv

 # Root folder of the unpacked corpus that the directory walk further down scans.
 subs_dir = 'OpenSubtitles2012'
 # Module-level state describing the file being processed. The '???'
 # placeholders stay in place until a file's path has been examined.
 current_file = ''
 current_movie_id = '???'
 current_year = '???'


 # newline='' is what the csv module asks for (the writer emits its own line
 # endings), and an explicit UTF-8 encoding keeps non-ASCII subtitle text from
 # depending on the platform's default codec.
 with open('subs.csv', "w", newline='', encoding='utf-8') as tsvfile:
    # Despite the .csv file name, the output is tab-separated.
    tsv_writer = csv.DictWriter(
        tsvfile,
        delimiter='\t',
        fieldnames=['movie_id', 'year', 'utterance_id', 'text'],
    )
    tsv_writer.writeheader()
    class SubtitleFileHandler(xml.sax.ContentHandler):
        """SAX handler that turns non-blank character data into TSV rows.

        Each row is written through the surrounding ``tsv_writer`` and is
        tagged with the module-level ``current_movie_id`` / ``current_year``
        plus the ``id`` attribute of the most recently opened ``<s>`` element.
        """

        def __init__(self):
            super().__init__()
            # Value reported for text seen before any <s> element was opened.
            self.current_id = '0'

        def startElement(self, name, attrs):
            """Remember the ``id`` of every ``<s>`` element that opens."""
            if name != 's':
                return
            self.current_id = attrs['id']

        def endElement(self, name):
            """Closing tags need no handling."""
            return None

        def characters(self, content):
            """Write one row for *content* unless it is only whitespace."""
            text = content.strip()
            if not text:
                return
            row = {
                'movie_id': current_movie_id,
                'year': current_year,
                'utterance_id': self.current_id,
                'text': text,
            }
            tsv_writer.writerow(row)


    # Walk the corpus tree: os.walk yields every directory (root) together with its sub-directories (dirs) and files (files)

    for root, dirs, files in os.walk(subs_dir):
        # Drop macOS metadata first so it can never occupy the "first file" slot.
        subtitle_files = [name for name in files if '.DS_Store' not in name]
        for index, file_name in enumerate(subtitle_files):
            current_file = os.path.join(root, file_name)
            # Some movies have more than one subtitles file; keep only the first one.
            if index > 0:
                print(f'skipping {current_file} because another file for the same movie was examined')
                continue
            print(current_file)
            # The path ends in <year>/<movie_id>/<file>. Taking the components
            # with os.path works with any path separator and does not fail for
            # files that sit directly under the root (the year is then '').
            current_movie_id = os.path.basename(root)
            current_year = os.path.basename(os.path.dirname(root))

            with gzip.open(current_file) as gz_file:
                parser = xml.sax.make_parser()
                parser.setContentHandler(SubtitleFileHandler())
                try:
                    parser.parse(gz_file)
                except Exception as e:
                    # Some files seem to have errors. There are so few compared
                    # to the total that they are just reported and skipped.
                    print(e)
                    print(f'the error was in file {current_file}')
	import os
	import gzip
	import xml.sax
	import csv

	# Root folder of the unpacked corpus scanned by the directory walk below.
	subs_dir = 'OpenSubtitles2012'
	# Module-level state describing the file being processed. The handler reads
	# the movie id and year from here for every row it writes.
	current_file = ''
	current_movie_id = '???'
	current_year = '???'


	# newline='' is what the csv module asks for (the writer emits its own line
	# endings), and an explicit UTF-8 encoding keeps non-ASCII subtitle text from
	# depending on the platform's default codec.
	with open('subs.csv', "w", newline='', encoding='utf-8') as tsvfile:
	    # Despite the .csv file name, the output is tab-separated.
	    tsv_writer = csv.DictWriter(
	        tsvfile,
	        delimiter='\t',
	        fieldnames=['movie_id', 'year', 'utterance_id', 'text'],
	    )
	    tsv_writer.writeheader()

	    class SubtitleFileHandler(xml.sax.ContentHandler):
	        """SAX handler that turns non-blank character data into TSV rows."""

	        def __init__(self):
	            super().__init__()
	            # Value reported for text seen before any <s> element was opened.
	            self.current_id = '0'

	        def characters(self, content):
	            """Write one row for *content* unless it is only whitespace."""
	            text = content.strip()
	            if text:
	                tsv_writer.writerow({
	                    'movie_id': current_movie_id,
	                    'year': current_year,
	                    'utterance_id': self.current_id,
	                    'text': text,
	                })

	        def endElement(self, name):
	            """Closing tags need no handling."""
	            pass

	        def startElement(self, name, attrs):
	            """Remember the ``id`` of every ``<s>`` element that opens."""
	            if name == 's':
	                self.current_id = attrs['id']

	    # Walk the corpus tree: os.walk yields every directory (root) together
	    # with its sub-directories (dirs) and files (files).
	    for root, dirs, files in os.walk(subs_dir):
	        # Drop macOS metadata first so it can never occupy the "first file" slot.
	        subtitle_files = [name for name in files if '.DS_Store' not in name]
	        for index, file_name in enumerate(subtitle_files):
	            current_file = os.path.join(root, file_name)
	            # Some movies have more than one subtitles file; keep only the first one.
	            if index > 0:
	                print(f'skipping {current_file} because another file for the same movie was examined')
	                continue
	            print(current_file)
	            # The path ends in <year>/<movie_id>/<file>. Taking the components
	            # with os.path works with any path separator and does not fail for
	            # files that sit directly under the root (the year is then '').
	            current_movie_id = os.path.basename(root)
	            current_year = os.path.basename(os.path.dirname(root))

	            with gzip.open(current_file) as gz_file:
	                parser = xml.sax.make_parser()
	                parser.setContentHandler(SubtitleFileHandler())
	                try:
	                    parser.parse(gz_file)
	                except Exception as e:
	                    # Some files seem to have errors. There are so few compared
	                    # to the total that they are just reported and skipped.
	                    print(e)
	                    print(f'the error was in file {current_file}')