arne-cl · December 11, 2013 16:15 · arne-cl · Oct 31, 2014
diff --git a/tigerxml2txt.py b/tigerxml2txt.py
 #!/usr/bin/env python
 # -*- coding: utf-8 -*-
 # Author: Arne Neumann
 #
 # Purpose: extracts sentences from a Tiger XML input file and writes
 #          them to an output file (one word per line with an empty line
 #          between sentences).

 import sys
 import codecs
 from lxml import etree

 if __name__ == '__main__':
    if len(sys.argv) != 3:
        print "Usage: {0} tiger_input.xml plain_output.txt".format(sys.argv[0])
        sys.exit(1)
    else:
        input_file_path = sys.argv[1]
        output_file_path = sys.argv[2]
        tree = etree.parse(input_file_path)
        with codecs.open(output_file_path, 'w', 'utf8') as output_file:
            for sent in tree.iterfind('//s'):
                for token in sent.iterfind('./graph/terminals/t'):
                    output_file.write(token.attrib['word']+'\n')
                output_file.write('\n')
	#!/usr/bin/env python
	# -- coding: utf-8 --
	# Author: Arne Neumann
	#
	# Purpose: extracts sentences from a Tiger XML input file and writes
	# them to an output file (one word per line with an empty line
	# between sentences).

	import sys
	import codecs
	from lxml import etree

	if __name__ == '__main__':
	if len(sys.argv) != 3:
	print "Usage: {0} tiger_input.xml plain_output.txt".format(sys.argv[0])
	sys.exit(1)
	else:
	input_file_path = sys.argv[1]
	output_file_path = sys.argv[2]
	tree = etree.parse(input_file_path)
	with codecs.open(output_file_path, 'w', 'utf8') as output_file:
	for sent in tree.iterfind('//s'):
	for token in sent.iterfind('./graph/terminals/t'):
	output_file.write(token.attrib['word']+'\n')
	output_file.write('\n')