Skip to content

Instantly share code, notes, and snippets.

@sirex
Last active August 29, 2015 14:11
Show Gist options
  • Save sirex/c38cdd7f4b29ff817853 to your computer and use it in GitHub Desktop.
Save sirex/c38cdd7f4b29ff817853 to your computer and use it in GitHub Desktop.
lmdb proper name parser
#!/usr/bin/env python3
"""
LMDB parser and proper name filter.
This script uses LMDB database [1] to extract all proper names.
[1] http://donelaitis.vdu.lt/~vytas/lmdb/
Examples
--------
$ wget http://donelaitis.vdu.lt/\~vytas/lmdb/lmdb.txt -qO- | ./propernames.py
"""
import io
import sys
import argparse
import contextlib
import collections
# Index of the part-of-speech code in a whitespace-split line.  parse_lexeme()
# converts this field and every field after it to int.
POS = 3

# Part-of-speech code that query_proper_names() treats as a noun.
# NOTE(review): value presumably comes from the LMDB format description --
# confirm against the upstream documentation.
NOUN = 1

# Index (in the same split line) of the field inspected for noun entries.
NOUN_PROPERNESS = 4

# Value of the NOUN_PROPERNESS field that marks a noun as a proper name.
NOUN_PROPERNESS_NAME = 2

# One parsed line: the first four fields by name, plus the complete field
# list (three strings followed by ints) under ``fields``.
Lexeme = collections.namedtuple(
    'Lexeme',
    ['lexeme', 'source', 'lemma', 'pos', 'fields'],
)
@contextlib.contextmanager
def stream_lines(filename, encoding):
    """Provide a decoded text stream for use inside a ``with`` block.

    Args:
        filename: Path of the file to read, or ``'-'`` to read from
            standard input.
        encoding: Name of the text encoding used to decode the input.

    Yields:
        A text stream that can be iterated line by line.  For a real file
        the stream is closed when the ``with`` block exits.  For ``'-'`` the
        yielded wrapper is detached on exit, so ``sys.stdin.buffer`` itself
        stays open.
    """
    if filename == '-':
        wrapper = io.TextIOWrapper(sys.stdin.buffer, encoding=encoding)
        try:
            yield wrapper
        finally:
            # Standard input is not ours to close.  Without detaching, the
            # wrapper would close sys.stdin.buffer as soon as it is
            # finalized (e.g. garbage-collected).
            try:
                wrapper.detach()
            except ValueError:
                # The caller already closed or detached the wrapper.
                pass
    else:
        with open(filename, encoding=encoding) as stream:
            yield stream
def parse_lexeme(line):
    """Turn one whitespace-separated line into a ``Lexeme`` record.

    The first ``POS`` tokens are kept as strings; every token from index
    ``POS`` onward is converted with ``int``.

    Args:
        line: A single line of text.

    Returns:
        A ``Lexeme`` whose ``lexeme``, ``source``, ``lemma`` and ``pos``
        attributes are the first four fields and whose ``fields`` attribute
        is the complete list of fields (strings followed by ints).

    Raises:
        ValueError: If a token at index ``POS`` or later is not an integer,
            or if the line has fewer than ``POS + 1`` tokens.
    """
    tokens = line.split()
    text_part = tokens[:POS]
    numeric_part = [int(token) for token in tokens[POS:]]
    fields = text_part + numeric_part
    lexeme, source, lemma, pos = fields[:POS + 1]
    return Lexeme(lexeme, source, lemma, pos, fields)
def query_proper_names(lines):
    """Yield the lexemes in *lines* that are proper-name nouns.

    A line qualifies when its part-of-speech field equals ``NOUN`` and its
    ``NOUN_PROPERNESS`` field equals ``NOUN_PROPERNESS_NAME``.

    Args:
        lines: Iterable of text lines, each parseable by ``parse_lexeme``.
            Blank (whitespace-only) lines are ignored.

    Yields:
        The ``Lexeme`` parsed from each qualifying line, in input order.

    Raises:
        ValueError: Propagated from ``parse_lexeme`` for a non-blank line
            that cannot be parsed.
    """
    for line in lines:
        # A whitespace-only line splits into zero fields, which would make
        # parse_lexeme() fail while unpacking; such lines carry no entry.
        if not line.strip():
            continue

        lexeme = parse_lexeme(line)
        if lexeme.pos != NOUN:
            continue

        # A noun row too short to contain the properness field cannot be
        # flagged as a proper name, so treat it as a non-match instead of
        # raising IndexError.
        fields = lexeme.fields
        if (len(fields) > NOUN_PROPERNESS and
                fields[NOUN_PROPERNESS] == NOUN_PROPERNESS_NAME):
            yield lexeme
def main():
    """Command-line entry point: print each proper name on its own line.

    Accepts an optional positional ``stream`` argument (default ``'-'``)
    and an ``-e``/``--encoding`` option (default ``'ISO-8859-13'``), opens
    the input through ``stream_lines`` and prints the ``lexeme`` attribute
    of every entry produced by ``query_proper_names``.
    """
    cli = argparse.ArgumentParser(
        formatter_class=argparse.RawDescriptionHelpFormatter,
        description=__doc__,
    )
    cli.add_argument('stream', nargs='?', default='-')
    cli.add_argument('-e', '--encoding', default='ISO-8859-13')
    options = cli.parse_args()

    with stream_lines(options.stream, options.encoding) as source:
        # Lazily project each matching entry to its surface form so output
        # is produced as the input is read.
        names = (entry.lexeme for entry in query_proper_names(source))
        for name in names:
            print(name)
# Run the command-line interface only when this file is executed as a
# script, so importing the module has no side effects.
if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment