Last active
August 29, 2015 14:11
-
-
Save sirex/c38cdd7f4b29ff817853 to your computer and use it in GitHub Desktop.
lmdb proper name parser
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3
"""
LMDB parser and proper name filter.

This script uses LMDB database [1] to extract all proper names.

[1] http://donelaitis.vdu.lt/~vytas/lmdb/

Examples
--------

    $ wget http://donelaitis.vdu.lt/\~vytas/lmdb/lmdb.txt -qO- | ./propernames.py

"""
import argparse
import collections
import contextlib
import io
import sys
# Index of the part-of-speech code within a whitespace-split entry line.
# Every field from this index onward is converted to an integer by the parser.
POS = 3

# Part-of-speech code that an entry's ``pos`` is compared against to select
# nouns.
NOUN = 1

# Index of the field that is compared against NOUN_PROPERNESS_NAME for noun
# entries (presumably the LMDB "properness" attribute -- confirm against the
# LMDB format description).
NOUN_PROPERNESS = 4

# Value of the NOUN_PROPERNESS field that marks a noun as a proper name.
NOUN_PROPERNESS_NAME = 2

# One parsed entry line: the first four fields by name, plus ``fields``, the
# complete list of fields of the line (numeric fields already converted).
Lexeme = collections.namedtuple('Lexeme', 'lexeme, source, lemma, pos, fields')
@contextlib.contextmanager
def stream_lines(filename, encoding):
    """Open a text stream of input lines decoded with *encoding*.

    Parameters
    ----------
    filename : str
        Path of the file to read, or ``'-'`` to read standard input.
    encoding : str
        Name of the text encoding used to decode the input.

    Yields
    ------
    A text stream that produces decoded lines when iterated.
    """
    if filename == '-':
        # Wrap the raw byte stream of stdin so the input is decoded with the
        # requested encoding rather than the one sys.stdin was opened with.
        wrapper = io.TextIOWrapper(sys.stdin.buffer, encoding=encoding)
        try:
            yield wrapper
        finally:
            # Detach rather than leave the wrapper to be finalized: closing
            # or garbage-collecting it would also close sys.stdin.buffer.
            wrapper.detach()
    else:
        with open(filename, encoding=encoding) as stream:
            yield stream
def parse_lexeme(line):
    """Parse one whitespace-separated entry line into a ``Lexeme``.

    The first ``POS`` fields are kept as strings; the field at index ``POS``
    and every field after it are converted to ``int``.

    Parameters
    ----------
    line : str
        A single entry line.

    Returns
    -------
    Lexeme
        The lexeme, source, lemma and part-of-speech code, together with the
        full list of fields (numeric ones already converted).

    Raises
    ------
    ValueError
        If the line has fewer than ``POS + 1`` fields, or a field that should
        be numeric is not an integer.
    """
    tokens = line.split()
    if len(tokens) <= POS:
        # Fail with a message that names the offending line instead of the
        # generic tuple-unpacking error.
        raise ValueError(
            'expected at least {} fields, got {}: {!r}'.format(
                POS + 1, len(tokens), line))
    fields = tokens[:POS] + [int(token) for token in tokens[POS:]]
    lexeme, source, lemma, pos = fields[:POS + 1]
    return Lexeme(lexeme, source, lemma, pos, fields)
def query_proper_names(lines):
    """Yield the parsed entries of *lines* that are proper names.

    An entry is treated as a proper name when its part-of-speech code equals
    ``NOUN`` and its ``NOUN_PROPERNESS`` field equals
    ``NOUN_PROPERNESS_NAME``.

    Parameters
    ----------
    lines : iterable of str
        Entry lines; blank or whitespace-only lines are skipped.

    Yields
    ------
    Lexeme
        Each matching entry, in input order.
    """
    for line in lines:
        if not line.strip():
            # A blank line (e.g. trailing newline at end of input) holds no
            # entry and must not abort the whole run.
            continue
        lexeme = parse_lexeme(line)
        # The properness field is only consulted for nouns.
        if (lexeme.pos == NOUN and
                lexeme.fields[NOUN_PROPERNESS] == NOUN_PROPERNESS_NAME):
            yield lexeme
def main():
    """Command-line entry point: print the lexeme of every proper name.

    Reads entry lines from the file given as the optional positional
    argument (standard input when omitted or ``-``), decodes them with the
    encoding given by ``-e/--encoding``, and prints one lexeme per line.
    """
    parser = argparse.ArgumentParser(
        description=__doc__,
        # Keep the module docstring's line breaks in the --help output.
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    parser.add_argument(
        'stream', nargs='?', default='-',
        help="input file, or '-' for standard input (default: '-')",
    )
    parser.add_argument(
        '-e', '--encoding', default='ISO-8859-13',
        help='text encoding of the input (default: ISO-8859-13)',
    )
    args = parser.parse_args()

    with stream_lines(args.stream, args.encoding) as lines:
        for lexeme in query_proper_names(lines):
            print(lexeme.lexeme)
# Run the command-line interface only when executed as a script.
if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment