Skip to content

Instantly share code, notes, and snippets.

@sirex
Created May 14, 2015 12:37
Show Gist options
  • Save sirex/7a6972c3e854f5107e82 to your computer and use it in GitHub Desktop.
Save sirex/7a6972c3e854f5107e82 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python3
import argparse
import re
import sys
import time
# Matches a maximal run of letters: a-z plus the accented letters
# ąčęėįšųūž, compared case-insensitively.
token_re = re.compile(r"[a-ząčęėįšųūž]+", flags=re.IGNORECASE)


def tokenize(s):
    """Return the word tokens of *s* as a list, in order of appearance.

    Everything that is not matched by ``token_re`` (digits, punctuation,
    whitespace, markup characters) acts as a separator and is dropped.
    The original letter case of each token is kept.
    """
    return [match.group() for match in token_re.finditer(s)]
def ngramize(s, min_n=2, max_n=None):
    """Yield the n-grams of sequence *s* as tuples.

    For every start position, n-grams are produced with sizes running
    from ``min_n`` to ``max_n`` inclusive. When ``max_n`` is omitted only
    n-grams of size ``min_n`` are produced. If ``min_n`` is greater than
    ``max_n`` the sizes are walked in descending order instead.

    Sizes are clipped to the number of items left after the start
    position, so no n-gram ever runs past the end of *s*.
    """
    total = len(s)
    if max_n is None:
        max_n = min_n
    direction = 1 if min_n <= max_n else -1
    shortest = min(min_n, max_n)
    # Only start positions that still leave room for the shortest n-gram.
    for start in range(total - shortest + 1):
        remaining = total - start
        first_size = min(min_n, remaining)
        last_size = min(max_n, remaining)
        for size in range(first_size, last_size + direction, direction):
            yield tuple(s[start:start + size])
def get_phrases(phrases):
    """Yield each entry of *phrases* in normalized form.

    Every entry is stripped, split into word tokens with ``tokenize`` and
    re-joined with single spaces, so it can be compared directly with the
    space-joined n-grams built from a document. An entry without any word
    token (for example a blank line) yields an empty string.
    """
    for line in phrases:
        words = tokenize(line.strip())
        yield ' '.join(words)
def find(document, phrases, min_n, max_n):
    """Return the phrases that occur in *document*.

    The document text is tokenized into words, every word n-gram with a
    size between ``min_n`` and ``max_n`` is joined with single spaces, and
    the resulting set is intersected with *phrases*.

    *phrases* must be a set of space-joined phrases (the ``&`` operator is
    used). Matching is exact, so letter case matters.
    """
    words = tokenize(document)
    candidates = {' '.join(gram) for gram in ngramize(words, min_n, max_n)}
    return candidates & phrases
def _ngram_range(spec):
    """Convert an ``--ngrams`` value such as ``'2'`` or ``'1-5'`` to ``(min_n, max_n)``.

    A single number ``N`` means the range ``N-N``. A descending range such
    as ``'5-1'`` is accepted as given. Raises ``argparse.ArgumentTypeError``
    for anything else, so argparse reports a usage error instead of a
    traceback.
    """
    parts = spec.split('-') if '-' in spec else (spec, spec)
    try:
        min_n, max_n = map(int, parts)
    except ValueError:
        # Non-numeric or empty parts ('abc', '-3', '2-') and too many
        # parts ('1-2-3') all end up here.
        raise argparse.ArgumentTypeError(
            'expected N or N-M with integer sizes, got %r' % spec) from None
    if min_n < 1 or max_n < 1:
        # A size of zero would make the search produce empty n-grams.
        raise argparse.ArgumentTypeError(
            'n-gram sizes must be at least 1, got %r' % spec)
    return min_n, max_n


def main(argv=None):
    """
    This script finds phrases from given phrases file in a document.
    In phrases file each phrase is separated by newline and words are separated
    by spaces.
    Document can contain any text or even HTML. This text will be normalized
    and tokenized by words.
    Script will search for all occurrences of phrases in given document text
    and lists all found phrases.
    Usage:
    $ ./findintersect.py <phrases file> <document file>
    $ wget http://google.com/ -qO- | ./findintersect.py <phrases file> -
    """
    # NOTE: the docstring above doubles as the --help description, so
    # developer notes live in comments.
    #
    # argv: list of command-line arguments without the program name.
    # None (the default) makes argparse read sys.argv[1:]; an explicit
    # empty list is honoured as "no arguments".
    parser = argparse.ArgumentParser(
        description=main.__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    parser.add_argument('phrases', type=argparse.FileType('r', encoding='utf-8'))
    parser.add_argument('document', type=argparse.FileType('r', encoding='utf-8'))
    # The string default is run through ``type`` as well, so args.ngrams
    # is always a validated (min_n, max_n) tuple.
    parser.add_argument('--ngrams', default='2', type=_ngram_range,
                        help='Number of ngrams, example: 2, 1-5')
    args = parser.parse_args(argv)
    min_n, max_n = args.ngrams

    with args.document as document, args.phrases as phrases:
        # Read and prepare data
        text = document.read()
        wanted = set(get_phrases(phrases))

    # Do the work; only the search itself is timed, using a monotonic
    # clock that is not affected by system clock adjustments.
    start = time.perf_counter()
    result = find(text, wanted, min_n, max_n)
    elapsed = time.perf_counter() - start

    # Print result
    print('\n'.join(sorted(result)))
    print('\nTime: %f s.' % elapsed)


if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment