Skip to content

Instantly share code, notes, and snippets.

@sampsyo
Created May 13, 2015 17:36
Show Gist options
  • Save sampsyo/852f11b840f902b35165 to your computer and use it in GitHub Desktop.
Save sampsyo/852f11b840f902b35165 to your computer and use it in GitHub Desktop.
Find the common n-grams between two documents.
#!/usr/bin/env python3
import sys
def words(fn):
    """Yield the words of the text file `fn`, one at a time.

    Each line is split on whitespace and the characters ``. , ? ! '`` are
    stripped from both ends of every token. Tokens that consist only of
    those characters (for example ``...`` or ``?!``) are skipped instead of
    being yielded as empty strings, so they cannot end up inside an n-gram.

    Args:
        fn: Path of the file to read. It is opened in text mode with the
            platform's default encoding.

    Yields:
        Each non-empty word, in the order it appears in the file.
    """
    with open(fn) as f:
        for line in f:
            for token in line.split():
                word = token.strip(".,?!'")
                # A punctuation-only token strips down to ""; drop it so it
                # is not treated as a word.
                if word:
                    yield word
def ngrams(l, n):
    """Return an iterator over the runs of `n` adjacent items in `l`.

    `l` must support slicing (a list or a string, for instance). Every run
    is produced as a tuple. Nothing is produced when `l` holds fewer than
    `n` items or when `n` is not positive.
    """
    shifted = []
    for offset in range(n):
        # The sequence with its first `offset` items dropped.
        shifted.append(l[offset:])
    # zip stops at the shortest slice, which discards incomplete tail runs.
    return zip(*shifted)
def common_ngrams(l1, l2, n):
    """Return the set of n-grams that occur in both `l1` and `l2`.

    Both sequences are broken into n-grams with `ngrams`; duplicates within
    a sequence collapse because each side is turned into a set first.
    """
    grams_a, grams_b = (set(ngrams(seq, n)) for seq in (l1, l2))
    return grams_a & grams_b
def plagcheck(doc1, doc2, *, min_n=5, max_n=10):
    """Print every word n-gram that two text files have in common.

    For each length n in ``range(min_n, max_n)`` the n-grams shared by the
    two documents are printed one per line, with the words joined by single
    spaces. Lengths are handled independently, so a long shared passage is
    reported once for every length it covers.

    Args:
        doc1: Path of the first document.
        doc2: Path of the second document.
        min_n: Smallest n-gram length to compare (inclusive).
        max_n: End of the length range (exclusive, as with `range`).
    """
    words1 = list(words(doc1))
    words2 = list(words(doc2))
    for n in range(min_n, max_n):
        # Set iteration order is not stable between runs; sorting makes the
        # report reproducible.
        for phrase in sorted(common_ngrams(words1, words2, n)):
            print(' '.join(phrase))
if __name__ == '__main__':
    # Exactly two document paths are expected. Report a usage error instead
    # of letting a wrong argument count surface as a TypeError traceback.
    if len(sys.argv) != 3:
        sys.exit('usage: {} DOC1 DOC2'.format(sys.argv[0]))
    plagcheck(sys.argv[1], sys.argv[2])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment