Last active
December 20, 2015 08:49
-
-
Save andreasvc/6103307 to your computer and use it in GitHub Desktop.
Match lines in one file with those of another, and produce line numbers.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" Match lines in one file with those of another, | |
and produce line numbers. """ | |
import io | |
import sys | |
# Help text printed by the command line interface; the name of the script is
# taken from sys.argv[0] at the moment this module is loaded.
USAGE = '\n'.join([
    'Match lines in one file with those of another, and get line numbers.',
    'usage: python %s sents text output' % sys.argv[0],
    'where sents and text are files with one sentence per line.',
    'The result will be of the form "1|line", written to file "output".',
    'Everything is assumed to be encoded with UTF-8.',
])
def mangle(line):
    """ Normalize a line for matching.

    Space characters are removed, the text is lowercased, and every
    character that cannot be encoded as ASCII is dropped. The result is
    whatever ``encode`` returns (an ASCII-encoded byte string). """
    without_spaces = ''.join(line.split(' '))
    lowered = without_spaces.lower()
    return lowered.encode('ascii', 'ignore')
def findsentnums(sentsfile, bookfile, outfile):
    """ Go through lines of book and report line numbers of lines in sents.

    Lines are compared after normalization with ``mangle``. For every line
    of ``sentsfile`` with a counterpart in ``bookfile``, the output gets
    ``<line number>|<line as found in the book>``; lines of ``sentsfile``
    without a counterpart are written as ``XXX|<line>``. Matched lines come
    first, ordered by line number, followed by the unmatched lines.

    :param sentsfile: path of a UTF-8 file with the lines to look up;
        lines that are empty or whitespace-only are skipped.
    :param bookfile: path of the UTF-8 file to search; its lines are
        numbered starting from 1.
    :param outfile: path of the UTF-8 file the result is written to.
    """
    # Read both inputs inside ``with`` blocks so the files are closed again.
    with io.open(sentsfile, encoding='utf8') as inp:
        sentlines = inp.read().split('\n')
    with io.open(bookfile, encoding='utf8') as inp:
        booklines = inp.read().split('\n')

    sents = {mangle(line): ('XXX', line)
            for line in sentlines if line.strip()}
    # When several book lines normalize to the same key, the last one wins.
    book = {mangle(line): (n + 1, line)
            for n, line in enumerate(booklines)}

    matches = [book[mangled] for mangled in set(sents) & set(book)]
    unmatched = [sents[mangled] for mangled in set(sents) - set(book)]

    # Sort the two groups separately: matched entries start with an int and
    # unmatched ones with the string 'XXX', and comparing those raises
    # TypeError on Python 3 when both groups are sorted as a single list.
    ordered = sorted(matches) + sorted(unmatched)
    with io.open(outfile, 'w', encoding='utf8') as out:
        out.writelines('%s|%s\n' % nl for nl in ordered)
def main():
    """ Command line interface.

    Expects exactly three command line arguments (sents, text, output) and
    passes them on to findsentnums(). With any other number of arguments
    the usage message is printed instead. """
    args = sys.argv[1:]
    # Check the argument count up front instead of catching TypeError around
    # the call: that would also swallow a TypeError raised while the files
    # are being processed and misreport it as a usage error.
    if len(args) != 3:
        print(USAGE)
        return
    findsentnums(*args)


if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment