#!/usr/bin/env python3

import argparse
import re
import sys
import time

# Match runs of ASCII and Lithuanian letters (case-insensitive).
token_re = re.compile(r"[a-ząčęėįšųūž]+", re.IGNORECASE)


def tokenize(s):
    return token_re.findall(s)


def ngramize(s, min_n=2, max_n=None):
    """Yield n-grams of the token list ``s`` with sizes between min_n and max_n."""
    n = len(s)
    max_n = min_n if max_n is None else max_n
    step = -1 if min_n > max_n else 1
    for i in range(n - min(min_n, max_n) + 1):
        # Clamp the n-gram sizes so they never run past the end of ``s``.
        _min_n = min_n if i + min_n < n else n - i
        _max_n = max_n if i + max_n < n else n - i
        for j in range(_min_n, _max_n + step, step):
            yield tuple(s[i:i+j])
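# Illustrative doctest-style sketch of how ngramize behaves on a tiny token
# list, with min_n=1 and max_n=2 chosen just for the demo:
#
#     >>> list(ngramize(['a', 'b', 'c'], 1, 2))
#     [('a',), ('a', 'b'), ('b',), ('b', 'c'), ('c',)]
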

def get_phrases(phrases):
    # Normalize each phrase line to a space-joined string of its tokens.
    for phrase in phrases:
        yield ' '.join(tokenize(phrase.strip()))


def find(document, phrases, min_n, max_n):
    # Intersect the set of document n-grams with the set of known phrases.
    tokens = tokenize(document)
    ngrams = set(map(' '.join, ngramize(tokens, min_n, max_n)))
    return ngrams & phrases
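# Illustrative sketch of the intersection step: phrases are normalized to
# space-joined token strings, so a phrase set {'labas rytas'} matches a
# lower-case document containing those two words next to each other (note
# that matching is case-sensitive, since tokens are not lower-cased):
#
#     >>> find('labas rytas pasauli', {'labas rytas'}, 2, 2)
#     {'labas rytas'}
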

def main(argv=None):
    """
    This script finds the phrases listed in a phrases file in a document.

    In the phrases file each phrase is on its own line and words are
    separated by spaces.

    The document can contain any text, even HTML. The text is normalized
    and tokenized into words.

    The script searches the document text for all occurrences of the
    phrases and lists every phrase it finds.

    Usage:

        $ ./findintersect.py <phrases file> <document file>
        $ wget http://google.com/ -qO- | ./findintersect.py <phrases file> -

    """
    parser = argparse.ArgumentParser(
        description=main.__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    parser.add_argument('phrases', type=argparse.FileType('r', encoding='utf-8'))
    parser.add_argument('document', type=argparse.FileType('r', encoding='utf-8'))
    parser.add_argument('--ngrams', default='2', help='Number of ngrams, example: 2, 1-5')
    args = parser.parse_args(argv or sys.argv[1:])

    # The n-gram range is given either as a single number or as "min-max".
    min_n, max_n = map(int, args.ngrams.split('-') if '-' in args.ngrams
                       else (args.ngrams, args.ngrams))

    with args.document as document, args.phrases as phrases:
        # Read and prepare data
        params = document.read(), set(get_phrases(phrases)), min_n, max_n

        # Do the work
        start = time.time()
        result = find(*params)
        stop = time.time() - start

        # Print result
        print('\n'.join(sorted(result)))
        print('\nTime: %f s.' % stop)


if __name__ == '__main__':
    main()
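# Example run (hypothetical file names, a sketch of the expected workflow):
#
#     $ printf 'labas rytas\n' > phrases.txt
#     $ printf 'labas rytas pasauli\n' > document.txt
#     $ ./findintersect.py phrases.txt document.txt --ngrams 2
#     labas rytas
#
#     Time: ... s.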