Last active
December 24, 2015 21:49
-
-
Save ramhiser/6868727 to your computer and use it in GitHub Desktop.
Brief analysis of the collocations of the Monty Python and the Holy Grail script.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Brief analysis of the bigram collocations in the Monty Python and the
# Holy Grail script, scored with four different association measures.
import nltk
from nltk.collocations import *
from nltk.book import *
import re

# Provides the association measures used below to score bigrams.
bigram_measures = nltk.collocations.BigramAssocMeasures()

# Monty Python and the Holy Grail
# (text6 is expected to come from the nltk.book star import above.)
# Reduces tokens to words: a word is a run of lowercase letters with an
# optional leading capital. ALL CAPS tokens, which are the speakers in the
# movie, contain no lowercase letter and therefore contribute nothing.
# The matches of all tokens are collected into one flat list of words.
_WORD_RE = re.compile(r'[A-Z]?[a-z]+')
montypython_words = [
    word
    for token in text6.tokens
    for word in _WORD_RE.findall(token)
]

# Finds all bigrams in the Monty Python script.
# Ignores all bigrams which occur fewer than three times in the corpus.
finder = BigramCollocationFinder.from_words(montypython_words)
finder.apply_freq_filter(3)

# NOTE(review): the results of the calls below are not bound to a name, so
# they are only visible when the statements are evaluated interactively;
# run as a script they are computed and discarded.

# Finds top 10 bigrams using the Student's t test
# Also, calculates the Student's t test for all bigrams
finder.nbest(bigram_measures.student_t, 10)
finder.score_ngrams(bigram_measures.student_t)

# Finds top 10 bigrams using Pointwise Mutual Information (PMI)
# Also, calculates the PMI for all bigrams
finder.nbest(bigram_measures.pmi, 10)
finder.score_ngrams(bigram_measures.pmi)

# Finds top 10 bigrams using the likelihood ratio
# Also, calculates the likelihood ratio for all bigrams
finder.nbest(bigram_measures.likelihood_ratio, 10)
finder.score_ngrams(bigram_measures.likelihood_ratio)

# Finds top 10 bigrams using the Pearson's Chi-squared test
# Also, calculates the Pearson's Chi-squared test for all bigrams
finder.nbest(bigram_measures.chi_sq, 10)
finder.score_ngrams(bigram_measures.chi_sq)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment