Last active
January 20, 2018 11:17
-
-
Save luerhard/a5e2ade44e75e9a0a06216d682f723c0 to your computer and use it in GitHub Desktop.
A SqliteCorpusReader for nltk. To use it, instantiate a SqliteCorpusReader instance with the keyword arguments dbpath, table and field. To access other columns directly, add methods like SqliteCorpusReader.timestamps, .articles or .folder. It recognizes the DB structure automatically.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import sqlite3 as sq | |
from nltk.data import LazyLoader | |
from nltk.util import AbstractLazySequence, LazyMap, LazyConcatenation | |
from nltk.tokenize import WordPunctTokenizer, sent_tokenize | |
from sqlalchemy.ext.automap import automap_base | |
from sqlalchemy.orm import Session | |
from sqlalchemy import create_engine | |
from sqlalchemy.inspection import inspect | |
from sqlalchemy import func | |
class SqliteAbstractLazySequence(AbstractLazySequence):
    """Base class for lazy sequences backed by one table of a SQLite database.

    The schema is reflected through SQLAlchemy's automap extension, so no
    model classes are declared by hand. This class only opens the database,
    resolves the mapped table and its first primary-key column, and
    implements ``__len__``; subclasses supply ``iterate_from``.
    """

    def __init__(self, dbpath='path/to/sqlitedb.db', table='name_of_table',
                 field='name_of_content_column'):
        """Open the database and look up the mapped class for ``table``.

        Args:
            dbpath: Path of the SQLite database file; appended to
                ``sqlite:///`` to build the engine URL.
            table: Name of the reflected table to read from.
            field: Name of the column that holds the text content.

        Raises:
            AttributeError: If no mapped class named ``table`` exists on the
                automap base.
        """
        self.Base = automap_base()
        self.engine = create_engine('sqlite:///' + dbpath)
        # NOTE(review): newer SQLAlchemy releases spell this
        # ``autoload_with=self.engine``; confirm the targeted version before
        # changing the call.
        self.Base.prepare(self.engine, reflect=True)
        self.session = Session(self.engine)
        # Plain attribute lookup instead of eval(): same result, but no
        # arbitrary code execution if ``table`` comes from user input.
        self.table = getattr(self.Base.classes, table)
        self.field = field
        # Only the first primary-key column is used for counting.
        self.key = inspect(self.table).primary_key[0].name

    def __len__(self):
        """Return the number of rows, counted over the primary-key column."""
        key_column = getattr(self.table, self.key)
        return self.session.query(func.count(key_column)).scalar()
class TextSequence(SqliteAbstractLazySequence):
    """Lazy sequence over the values of the content column ``self.field``."""

    def iterate_from(self, start=0):
        """Return an iterator over content values, starting at row ``start``.

        Rows are ordered by the primary key and the first ``start`` rows are
        skipped. Unlike a ``primary_key > start`` filter, this stays
        consistent with ``__len__`` and positional indexing even when the
        keys have gaps, do not begin at 1, or are not integers.

        Args:
            start: Zero-based index of the first row to yield.

        Returns:
            An iterator yielding the ``self.field`` attribute of each row.
        """
        key_column = getattr(self.table, self.key)
        rows = self.session.query(self.table).order_by(key_column).offset(start)
        # getattr replaces eval("d." + field): same lookup, no code execution.
        return (getattr(row, self.field) for row in rows)
class PropertySequence(SqliteAbstractLazySequence):
    """Lazy sequence over one chosen column, or over whole mapped rows."""

    def __init__(self, *args, **kwargs):
        """Create the sequence.

        Args:
            *args: Forwarded to ``SqliteAbstractLazySequence.__init__``.
            **kwargs: Forwarded likewise, except for ``column``.
            column: Optional keyword naming the column to return. When it is
                omitted or falsy, ``iterate_from`` yields the mapped row
                objects themselves.
        """
        # pop() with a default so that leaving ``column`` out means
        # "whole rows" instead of raising KeyError.
        self.column = kwargs.pop('column', None)
        super().__init__(*args, **kwargs)

    def iterate_from(self, start=0):
        """Return an iterator over the table, starting at row ``start``.

        Rows are ordered by the primary key and the first ``start`` rows are
        skipped, so positions agree with ``__len__`` even when the keys have
        gaps or are not integers.

        Args:
            start: Zero-based index of the first row to yield.

        Returns:
            An iterator over the values of ``self.column`` if a column is
            set; otherwise an iterator over the SQLAlchemy row objects with
            all columns.
        """
        key_column = getattr(self.table, self.key)
        rows = self.session.query(self.table).order_by(key_column).offset(start)
        if self.column:
            # getattr replaces eval("d." + column): no code execution.
            attr_name = str(self.column)
            return (getattr(row, attr_name) for row in rows)
        return iter(rows)
class SqliteCorpusReader(object):
    """Corpus reader that serves texts stored in a SQLite table.

    The content column is exposed through ``text()``, ``words()`` and
    ``sents()``; the remaining accessors return other columns of the same
    table as ``PropertySequence`` objects.
    """

    def __init__(self,
                 word_tokenizer=WordPunctTokenizer(),
                 sent_tokenizer=sent_tokenize, **kwargs):
        """Create the reader.

        Args:
            word_tokenizer: Object whose ``tokenize`` method splits a text
                into words.
            sent_tokenizer: Callable that splits a text into sentences.
            **kwargs: Forwarded to ``TextSequence`` (and later to
                ``PropertySequence``).
        """
        self._seq = TextSequence(**kwargs)
        # Keep a private copy so later lookups never alter these arguments.
        self._kwargs = dict(kwargs)
        self._word_tokenize = word_tokenizer.tokenize
        self._sent_tokenize = sent_tokenizer

    def text(self):
        """Return the lazy sequence of content values."""
        return self._seq

    def _property(self, column=None):
        """Return a new ``PropertySequence`` for ``column``.

        ``column`` is passed explicitly rather than being written into
        ``self._kwargs``, so the stored constructor arguments stay unchanged
        between calls.
        """
        return PropertySequence(column=column, **self._kwargs)

    def words(self):
        """Return the word tokens of all texts as one lazy concatenation."""
        return LazyConcatenation(LazyMap(self._word_tokenize, self.text()))

    def sents(self):
        """Return the sentences of all texts as one lazy concatenation."""
        return LazyConcatenation(LazyMap(self._sent_tokenize, self.text()))

    def articles(self):
        """Return the values of the ``content`` column."""
        return self._property(column='content')

    def timestamps(self):
        """Return the values of the ``published`` column."""
        return self._property(column='published')

    def folder(self):
        """Return the values of the ``folder`` column."""
        return self._property(column='folder')

    def objects(self):
        """Return the mapped row objects with all columns."""
        return self._property(column=None)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment