Last active
February 18, 2021 10:41
-
-
Save andreasvc/2b14a083da41850881ba2c0631b4ee75 to your computer and use it in GitHub Desktop.
Find the longest sequence of tokens in a text without any taboo n-grams
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Given a set of taboo n-grams and a text, find the longest sequence of tokens\n", | |
"without any of the taboo n-grams. An n-gram is represented as a string of\n", | |
"space-separated tokens.\n", | |
"\n", | |
"Usage: longestnontaboo.py [--len] <taboofile> <filenames...>\n", | |
"\n", | |
"Specify taboo n-grams with one n-gram per line in taboofile;\n", | |
"multiple filenames can be specified.\n", | |
"By default the output is the longest non-taboo sequence.\n", | |
"With --len, only the length of this sequence is reported.\n" | |
] | |
} | |
], | |
"source": [ | |
"import longestnontaboo\n", | |
"print(longestnontaboo.__doc__)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"austen-emma.txt: jane fairfax , standing with her back to them , intent on her pianoforte . busy as he was , however , the young man was yet able to shew a most happy countenance on seeing emma again . `` this is a pleasure , '' said he , in rather a low voice , `` coming at least ten minutes earlier than i had calculated . you find me trying to be useful ; tell me if you think i shall succeed . '' `` what ! '' said mrs. weston , `` have not you finished it yet ? you would not earn a very good livelihood as a working silversmith at this rate . '' `` i have not been working uninterruptedly , '' he replied , `` i have been assisting miss fairfax in trying to make her instrument stand steadily , it was not quite firm ; an unevenness in the floor , i believe . you see we have been wedging one leg with paper . this was very kind of you to be persuaded to come . i was almost afraid you would be hurrying home . '' he contrived that she should be seated by him ;\n" | |
] | |
} | |
], | |
"source": [ | |
"longestnontaboo.main('taboowords.txt', 'austen-emma.txt')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"taboo = {'and', 'but', 'or', 'yes', 'no'}\n", | |
"tokens = longestnontaboo.preprocess('austen-emma.txt')\n", | |
"ngrams = longestnontaboo.getngrams(tokens, taboo)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAX0AAAD4CAYAAAAAczaOAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjMuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8vihELAAAACXBIWXMAAAsTAAALEwEAmpwYAAAPnklEQVR4nO3df4wc5X3H8fc3NiEVacNPnZBt9dzGamWECugEVImqLahgoKqJRJAjFOzIlf8xUiJZak1TiTYByfmjoUQqSG6wMCgKoPwQKCBRF1hF/YOfgQDGohxghC2DldiQHFFoj3z7xz5HV+bOO2vv7Z7veb+k1c0888zMM1+NPzc7N7uOzESSVIdPjHoAkqThMfQlqSKGviRVxNCXpIoY+pJUkaWjHsDRnHnmmTk+Pt73eu+//z6nnHLK4Ae0yFin5qxVM9apmfmu07PPPvuLzDxrtmULOvTHx8d55pln+l6v3W7TarUGP6BFxjo1Z62asU7NzHedIuLNuZZ5e0eSKmLoS1JFDH1JqoihL0kVMfQlqSKGviRVxNCXpIoY+pJUEUNfkiqyoD+Re7zGtz40kv3u3XbVSPYrSb14pS9JFTH0Jakihr4kVcTQl6SKGPqSVBFDX5IqYuhLUkUMfUmqiKEvSRUx9CWpIo1DPyKWRMRzEfGTMr8yIp6MiMmIuC8iPlnaTy7zk2X5eNc2biztr0TE5QM/GknSUfVzpf9VYE/X/LeAWzPzs8BhYGNp3wgcLu23ln5ExGpgHXAOsAa4PSKWHN/wJUn9aBT6EbEcuAr4bpkP4BLgB6XLTuDqMr22zFOWX1r6rwXuzcwPMvMNYBK4cADHIElqqOmV/r8Cfwf8rsyfAbybmdNlfh+wrEwvA94CKMvfK/0/ap9lHUnSEPT8auWI+GvgYGY+GxGt+R5QRGwCNgGMjY3Rbrf73sbU1BTtdpst50737jwPjmXMozBTJ/VmrZqxTs2Msk5Nvk//c8DfRMSVwKeAPwBuA06NiKXlan45sL/03w+sAPZFxFLgM8Avu9pndK/zkczcDmwHmJiYyFar1fdBtdttWq0WG0b1ffrXtUay337N1Em9WatmrFMzo6xTz9s7mXljZi7PzHE6f4h9LDOvAx4Hrind1gMPlOkHyzxl+WOZmaV9XXm6ZyWwCnhqYEciSerpeP7nrL8H7o2Im4HngDtL+53APRExCRyi84uCzNwdEfcDLwPTwObM/PA49i9J6lNfoZ+ZbaBdpl9nlqdvMvO3wBfnWP8W4JZ+BylJGgw/kStJFTH0Jakihr4kVcTQl6SKGPqSVBFDX5IqYuhLUkUMfUmqiKEvSRUx9CWpIoa+JFXE0Jekihj6klQRQ1+SKmLoS1JFDH1JqoihL0kVMfQlqSKGviRVxNCXpIoY+pJUEUNfkipi6EtSRQx9SaqIoS9JFTH0Jakihr4kVcTQl6SKGPqSVBFDX5IqYuhLUkUMfUmqiKEvSRUx9CWpIoa+JFXE0Jekihj6klQRQ1+SKmLoS1JFeoZ+RHwqIp6KiJ9HxO6I+OfSvjIinoyIyYi4LyI+WdpPLvOTZfl417ZuLO2vRMTl83ZUkqRZNbnS/wC4JDP/DDgPWBMRFwPfAm7NzM8Ch4GNpf9G4HBpv7X0IyJWA+uAc4A1wO0RsWSAxyJJ6qFn6GfHVJk9qbwSuAT4QWnfCVxdpteWecrySyMiSvu9mflBZr4BTAIXDuIgJEnNNLqnHxFLIuJ54CCwC3gNeDczp0uXfcCyMr0MeAugLH8POKO7fZZ1JElDsLRJp8z8EDgvIk4Ffgz86XwNKCI2AZsAxsbGaLfbfW9jamqKdrvNlnOne3eeB8cy5lGYqZN6s1bNWKdmRlmnRqE/IzPfjYjHgT8HTo2IpeVqfjmwv3TbD6wA9kXEUuAzwC+72md0r9O9j+3AdoCJiYlstVp9HRB0QrfVarFh60N9rzsIe69rjWS//Zqpk3qzVs1Yp2ZGWacmT++cVa7wiYjfA/4K2AM8DlxTuq0HHijTD5Z5yvLHMjNL+7rydM9KYBXw1ICOQ5LUQJMr/bOBneVJm08A92fmTyLiZeDeiLgZeA64s/S/E7gnIiaBQ3Se2CEzd0fE/cDLwDSwudw2kiQNSc/Qz8wXgPNnaX+dWZ6+yczfAl+cY1u3ALf0P0xJ0iD4iVxJqoihL0kVMfQlqSKGviRVxNCXpIoY+pJUEUNfkipi6EtSRQx9SaqIoS9JFTH0Jakihr4kVcTQl6SKGPqSVBFDX5IqYuhLUkUMfUmqiKEvSRUx9CWpIoa+JFXE0Jekihj6klQRQ1+SKmLoS1JFDH1JqoihL0kVMfQlqSKGviRVxNCXpIoY+pJUEUNfkipi6EtSRQx9SaqIoS9JFTH0Jakihr4kVcTQl6SKGPqSVBFDX5IqYuhLUkWW9uoQESuAu4ExIIHtmXlbRJwO3AeMA3uBazPzcEQEcBtwJfAbYENm/qxsaz3wj2XTN2fmzsEezsIwvvWhkex377arRrJfSSeOJlf608CWzFwNXAxsjojVwFbg0cxcBTxa5gGuAFaV1ybgDoDyS+Im4CLgQuCmiDhtgMciSeqhZ+hn5oGZK/XM/DWwB1gGrAVmrtR3AleX6bXA3dnxBHBqRJwNXA7sysxDmXkY2AWsGeTBSJKOruftnW4RMQ6cDzwJjGXmgbLobTq3f6DzC+GtrtX2lba52o/cxyY67xAYGxuj3W73M0QApqamaLfbbDl3uu91T2T91mqmTurNWjVjnZoZZZ0ah35EfBr4IfC1zPxV59Z9R2ZmROQgBpSZ24HtABMTE9lqtfreRrvdptVqsWFE99ZHZe91rb76z9RJvVmrZqxTM6OsU6OndyLiJDqB/73M/FFpfqfctqH8PFja9wMrulZfXtrmapckDUnP0C9P49wJ7MnMb3ctehBYX6bXAw90tV8fHRcD75XbQI8Al0XEaeUPuJeVNknSkDS5vfM54MvAixHxfGn7B2AbcH9EbATeBK4tyx6m87jmJJ1HNr8CkJmHIuKbwNOl3zcy89AgDkKS1EzP0M/M/wJijsWXztI/gc1zbGsHsKOfAUqSBsdP5EpSRQx9SaqIoS9JFTH0Jakihr4kVcTQl6SKGPqSVBFDX5IqYuhLUkUMfUmqiKEvSRUx9CWpIoa+JFXE0Jekihj6klQRQ1+SKmLoS1JFDH1JqoihL0kVMfQlqSKGviRVxNCXpIoY+pJUEUNfkipi6EtSRQx9SaqIoS9JFTH0Jakihr4kVcTQl6SKGPqSVBFDX5IqYuhLUkUMfUmqiKEvSRUx9CWpIoa+JFXE0Jekihj6klSRnqEfETsi4mBEvNTVdnpE7IqIV8vP00p7RMR3ImIyIl6IiAu61llf+r8aEevn53AkSUfT5Er/LmDNEW1bgUczcxXwaJkHuAJYVV6bgDug80sCuAm4CLgQuGnmF4UkaXh6hn5m/hQ4dETzWmBnmd4JXN3Vfnd2PAGcGhFnA5cDuzLzUGYeBnbx8V8kkqR5tvQY1xvLzANl+m1grEwvA97q6revtM3V/jERsYnOuwTGxsZot9t9D25qaop2u82Wc6f7XvdE1m+tZuqk3qxVM9apmVHW6VhD/yOZmRGRgxhM2d52YDvAxMREtlqtvrfRbrdptVps2PrQoIZ1Qth7Xauv/jN1Um/Wqhnr1Mwo63SsT++8U27bUH4eLO37gRVd/ZaXtrnaJUlDdKyh/yAw8wTOeuCBrvbry1M8FwPvldtAjwCXRcRp5Q+4l5U2SdIQ9by9ExHfB1rAmRGxj85TONuA+yNiI/AmcG3p/jBwJTAJ/Ab4CkBmHoqIbwJPl37fyMwj/zgsSZpnPUM/M780x6JLZ+mbwOY5trMD2NHX6CRJA+UnciWpIoa+JFXE0Jekihz3c/paOMb7/FzClnOnB/ZZhr3brhrIdiTNL6/0Jakihr4kVcTQl6SKGPqSVBFDX5IqYuhLUkUMfUmqiKEvSRUx9CWpIoa+JFXE0Jekihj6klQRQ1+SKmLoS1JFDH1JqoihL0kVMfQlqSKGviRVxNCXpIoY+pJUEUNfkipi6EtSRQx9SarI0lEPQIvD+NaHRrLfvduuGsl+pROVV/qSVBFDX5IqYuhLUkUMfUmqiKEvSRUx9CWpIoa+JFXE0JekivjhLJ3QhvWhsC3nTrPhiH35wTCdiLzSl6SKGPqSVJGh396JiDXAbcAS4LuZuW3YY5AGwe8b0oloqFf6EbEE+DfgCmA18KWIWD3MMUhSzYZ9pX8hMJmZrwNExL3AWuDlIY9DOmGN6h1GE7P9wXsQfHczOJGZw9tZxDXAmsz82zL/ZeCizLyhq88mYFOZ/RPglWPY1ZnAL45zuDWwTs1Zq2asUzPzXac/zMyzZluw4B7ZzMztwPbj2UZEPJOZEwMa0qJlnZqzVs1Yp2ZGWadhP72zH1jRNb+8tEmShmDYof80sCoiVkbEJ4F1wINDHoMkVWuot3cyczoibgAeofPI5o7M3D0Puzqu20MVsU7NWatmrFMzI6vTUP+QK0kaLT+RK0kVMfQlqSKLLvQjYk1EvBIRkxGxddTjWUgiYm9EvBgRz0fEM6Xt9IjYFRGvlp+njXqcwxYROyLiYES81NU2a12i4zvl/HohIi4Y3ciHa446/VNE7C/n1PMRcWXXshtLnV6JiMtHM+rhi4gVEfF4RLwcEbsj4qulfUGcU4sq9P2ah0b+MjPP63pGeCvwaGauAh4t87W5C1hzRNtcdbkCWFVem4A7hjTGheAuPl4ngFvLOXVeZj4MUP7drQPOKevcXv591mAa2JKZq4GLgc2lHgvinFpUoU/X1zxk5v8AM1/zoLmtBXaW6Z3A1aMbymhk5k+BQ0c0z1WXtcDd2fEEcGpEnD2UgY7YHHWay1rg3sz8IDPfACbp/Ptc9DLzQGb+rEz/GtgDLGOBnFOLLfSXAW91ze8rbepI4D8i4tnydRcAY5l5oEy/DYyNZmgLzlx18Rz7uBvKbYkdXbcHrRMQEePA+cCTLJBzarGFvo7u85l5AZ23k5sj4i+6F2bn+V2f4T2CdTmqO4A/Bs4DDgD/MtLRLCAR8Wngh8DXMvNX3ctGeU4tttD3ax6OIjP3l58HgR/Tebv9zsxbyfLz4OhGuKDMVRfPsS6Z+U5mfpiZvwP+nf+/hVN1nSLiJDqB/73M/FFpXhDn1GILfb/mYQ4RcUpE/P7MNHAZ8BKd+qwv3dYDD4xmhAvOXHV5ELi+PHFxMfBe11v26hxx7/kLdM4p6NRpXUScHBEr6fyR8qlhj28UIiKAO4E9mfntrkUL45zKzEX1Aq4E/ht4Dfj6qMezUF7AHwE/L6/dM7UBzqDzJMGrwH8Cp496rCOozffp3Jr4Xzr3UzfOVRcg6Dwh9hrwIjAx6vGPuE73lDq8QCe8zu7q//VSp1eAK0Y9/iHW6fN0bt28ADxfXlculHPKr2GQpIostts7kqSjMPQlqSKGviRVxNCXpIoY+pJUEUNfkipi6EtSRf4PEn2CCWX0GoMAAAAASUVORK5CYII=\n", | |
"text/plain": [ | |
"<Figure size 432x288 with 1 Axes>" | |
] | |
}, | |
"metadata": { | |
"needs_background": "light" | |
}, | |
"output_type": "display_data" | |
} | |
], | |
"source": [ | |
"# Where are the taboo matches?\n", | |
"indices = longestnontaboo.taboo_matches(ngrams, taboo)\n", | |
"# What is the distance between consecutive matches? (x-axis)\n", | |
"# how common is each distance? (y-axis)\n", | |
"(indices[1:] - indices[:-1].values).hist();" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 5, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"jane fairfax , standing with her back to them , intent on her pianoforte . busy as he was , however , the young man was yet able to shew a most happy countenance on seeing emma again . `` this is a pleasure , '' said he , in rather a low voice , `` coming at least ten minutes earlier than i had calculated . you find me trying to be useful ; tell me if you think i shall succeed . '' `` what ! '' said mrs. weston , `` have not you finished it yet ? you would not earn a very good livelihood as a working silversmith at this rate . '' `` i have not been working uninterruptedly , '' he replied , `` i have been assisting miss fairfax in trying to make her instrument stand steadily , it was not quite firm ; an unevenness in the floor , i believe . you see we have been wedging one leg with paper . this was very kind of you to be persuaded to come . i was almost afraid you would be hurrying home . '' he contrived that she should be seated by him ;\n" | |
] | |
} | |
], | |
"source": [ | |
"# Show the longest non-taboo sequence\n", | |
"result = longestnontaboo.longest_non_taboo_sequence(ngrams, taboo)\n", | |
"print(' '.join(result))" | |
] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.7.3" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 4 | |
} |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""Given a set of taboo n-grams and a text, find the longest sequence of tokens | |
without any of the taboo n-grams. An n-gram is represented as a string of | |
space-separated tokens. | |
Usage: longestnontaboo.py [--len] <taboofile> <filenames...> | |
Specify taboo n-grams with one n-gram per line in taboofile; | |
multiple filenames can be specified. | |
By default the output is the longest non-taboo sequence. | |
With --len, only the length of this sequence is reported.""" | |
import sys | |
import nltk | |
import pandas | |
def preprocess(fname, lowercase=True): | |
"""Tokenize and lowercase text file.""" | |
with open(fname, encoding='utf8') as inp: | |
text = inp.read() | |
if lowercase: | |
text = text.lower() | |
tokens = nltk.word_tokenize(text) | |
return pandas.Series(tokens, name=fname) | |
def getngrams(tokens, taboo): | |
"""return list `ngrams` s.t. ngrams[n - 1] contains n-grams of tokens, | |
up to a maximum n determined by n-gram strings in `taboo`.""" | |
result = [tokens] | |
if not taboo: | |
return result | |
maxn = max(len(a.split()) for a in taboo) | |
# padding for the last n-grams of the text | |
padding = pandas.Series([''] * maxn) | |
padded = tokens.append(padding, ignore_index=True) | |
for n in range(2, maxn + 1): | |
subngrams = [padded[m:len(tokens) + m].values for m in range(1, n)] | |
ngrams = tokens.str.cat(subngrams, sep=' ') | |
result.append(ngrams) | |
return result | |
def taboo_matches(ngrams, taboo): | |
"""return all indices of tokens with taboo n-grams. | |
:param ngrams: a list of Series objects, must include n-grams up to | |
the longest n-gram in taboo. | |
:params taboo: a set of strings.""" | |
tokens = ngrams[0] | |
istaboo = tokens.isin(taboo) | |
for x in ngrams[1:]: | |
istaboo |= x.isin(taboo) | |
if not istaboo.any(): | |
return tokens | |
indices = tokens[istaboo].index | |
return pandas.Series(indices) | |
def longest_non_taboo_sequence(ngrams, taboo): | |
"""return longest sequence of tokens without any taboo n-grams. | |
:param ngrams: a list of Series objects, must include n-grams up to | |
the longest n-gram in taboo. | |
:params taboo: a set of strings.""" | |
indices = taboo_matches(ngrams, taboo) | |
tokens = ngrams[0] | |
if len(indices) == 0: | |
return tokens | |
# pad the begin and end such since the first or last tokens may | |
# be the longest non-taboo sequence | |
indices = pandas.Series([-1] + list(indices) + [len(tokens)]) | |
lengths = indices[1:].values - indices[:-1].values | |
longest = lengths.argmax() | |
start = indices[longest] | |
end = indices[longest + 1] | |
if start == -1: | |
start = 0 | |
else: | |
start += max(n for a in taboo | |
for n, x in enumerate(ngrams, 1) if x[start] == a) | |
return tokens[start:end] | |
def test_getngrams(): | |
tokens = pandas.Series(['a', 'b', 'c']) | |
assert getngrams(tokens, set()) == [tokens] | |
assert getngrams(tokens, {'a'}) == [tokens] | |
assert len(getngrams(tokens, {'a', 'a b'})) == 2 | |
assert len(getngrams(tokens, {'a', 'a b', 'a b c'})) == 3 | |
assert (getngrams(tokens, {'a', 'a b'})[1] == pandas.Series( | |
['a b', 'b c', 'c '])).all() | |
assert (getngrams(tokens, {'a', 'a b', 'a b c'})[2] == pandas.Series( | |
['a b c', 'b c ', 'c '])).all() | |
def test_longest_non_taboo_sequence(): | |
text = 'the cat is on the mat . the cat is sleeping .' | |
tokens = pandas.Series(text.split()) | |
ngrams = getngrams(tokens, {'the cat'}) | |
assert len(longest_non_taboo_sequence(ngrams, {'the cat'})) == 5 | |
assert (longest_non_taboo_sequence(ngrams, {'the cat'}) | |
== 'is on the mat .'.split()).all() | |
assert (longest_non_taboo_sequence(ngrams, {'the cat', 'the'}) | |
== 'is sleeping .'.split()).all() | |
assert (longest_non_taboo_sequence(ngrams, {'mat'}) | |
== '. the cat is sleeping .'.split()).all() | |
assert (longest_non_taboo_sequence(ngrams, {'sleeping'}) | |
== 'the cat is on the mat . the cat is'.split()).all() | |
def main(*argv): | |
"""CLI.""" | |
if len(argv) < 2: | |
print(__doc__) | |
return | |
reportlen = '--len' in sys.argv | |
if reportlen: | |
argv.remove('--len') | |
taboofile = argv[0] | |
textfnames = argv[1:] | |
with open(taboofile, encoding='utf8') as inp: | |
taboo = set(inp.read().splitlines()) | |
for fname in textfnames: | |
tokens = preprocess(fname) | |
ngrams = getngrams(tokens, taboo) | |
result = longest_non_taboo_sequence(ngrams, taboo) | |
if reportlen: | |
print('%s: %d' % (fname, len(result))) | |
else: | |
print('%s: %s' % (fname, ' '.join(result))) | |
if __name__ == '__main__': | |
main(*sys.argv[1:]) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment