Last active
December 3, 2017 01:32
-
-
Save rcdilorenzo/9b410488df9e411dff53469a2fb0b0c0 to your computer and use it in GitHub Desktop.
Text analysis comparison between the Synoptic Gospels Matthew and Luke. Created as a part of a graduate assignment for an M.S. in Data Science from Regis University. (Simply run `python bisynoptic_text_analysis.py` from the current directory with all the files downloaded)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Driver script: builds word/phrase frequency tables for Luke and Matthew and
# prints how each book's frequent phrases show up in the other.
from word_frequency import WordFrequency

# Each WordFrequency analyses its file on construction (Luke first, then
# Matthew, as in the original script).
luke = WordFrequency('Luke', './Luke.txt')
matthew = WordFrequency('Matt', './Matthew.txt')

# Report header followed by one empty separator line.
_banner_lines = (
    '================================',
    '= Frequently Occurring Phrases =',
    '= Between Matthew & Luke (Max  =',
    '= 4 words/phrase, Min freq 15) =',
    '================================',
    '',
)
for _banner_line in _banner_lines:
    print(_banner_line)

# Matthew's frequent phrases looked up in Luke ...
mtl_comparisons = matthew.compare_to(luke)
for comparison in mtl_comparisons:
    print(comparison)

# ... and then Luke's frequent phrases looked up in Matthew.
ltm_comparisons = luke.compare_to(matthew)
for comparison in ltm_comparisons:
    print(comparison)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
================================ | |
= Frequently Occurring Phrases = | |
= Between Matthew & Luke (Max = | |
= 4 words/phrase, Min freq 15) = | |
================================ | |
Matt: 'man'|127 Luke: 'man'|132, 'the son of man'|26, 'son of man is'|5 | |
Matt: 'heaven'|59 Luke: 'heaven'|20 | |
Matt: 'kingdom'|55 Luke: 'kingdom'|43, 'the kingdom of god'|33, 'kingdom of god and'|10, 'kingdom of god is'|7, 'in the kingdom of'|5 | |
Matt: 'day'|39 Luke: 'day'|64, 'on the sabbath day'|8, 'the sabbath day and'|5 | |
Matt: 'house'|36 Luke: 'house'|58 | |
Matt: 'the kingdom of heaven'|33 Luke: Less than 5 | |
Matt: 'the son of man'|32 Luke: 'the son of man'|26 | |
Matt: 'hand'|31 Luke: 'hand'|19 | |
Matt: 'way'|28 Luke: 'way'|24 | |
Matt: 'mother'|27 Luke: 'mother'|22 | |
Matt: 'multitude'|26 Luke: 'multitude'|18 | |
Matt: 'earth'|25 Luke: 'earth'|15 | |
Matt: 'time'|25 Luke: 'time'|23 | |
Matt: 'prophet'|24 Luke: 'prophet'|9 | |
Matt: 'son'|22 Luke: 'son'|101, 'which was the son'|75, 'was the son of'|75, 'the son of man'|26, 'son of man is'|5 | |
Matt: 'city'|22 Luke: 'city'|36 | |
Matt: 'forth'|21 Luke: 'forth'|14 | |
Matt: 'hour'|21 Luke: 'hour'|13 | |
Matt: 'father'|20 Luke: 'father'|25 | |
Matt: 'lord'|20 Luke: 'lord'|15 | |
Matt: 'temple'|20 Luke: 'temple'|18 | |
Matt: 'name'|20 Luke: 'name'|23 | |
Matt: 'brother'|19 Luke: 'brother'|12 | |
Matt: 'word'|19 Luke: 'word'|20, 'the word of god'|5 | |
Matt: 'world'|18 Luke: 'world'|10 | |
Matt: 'sea'|18 Luke: Less than 5 | |
Matt: 'child'|18 Luke: 'child'|17 | |
Matt: 'wife'|18 Luke: 'wife'|19 | |
Matt: 'hast'|16 Luke: 'hast'|14 | |
Matt: 'eye'|16 Luke: 'eye'|10 | |
Matt: 'body'|15 Luke: 'body'|13 | |
Matt: 'field'|15 Luke: 'field'|6 | |
Matt: 'life'|15 Luke: 'life'|15 | |
Matt: 'heart'|15 Luke: 'heart'|12 | |
Matt: 'fruit'|15 Luke: 'fruit'|12 | |
Luke: 'man'|132 Matt: 'man'|127, 'the son of man'|32, 'son of man be'|6, 'son of man shall'|6, 'son of man is'|5 | |
Luke: 'son'|101 Matt: 'the son of man'|32, 'son'|22, 'son of man be'|6, 'son of man shall'|6, 'son of man is'|5 | |
Luke: 'which was the son'|75 Matt: Less than 5 | |
Luke: 'was the son of'|75 Matt: Less than 5 | |
Luke: 'day'|64 Matt: 'day'|39 | |
Luke: 'house'|58 Matt: 'house'|36 | |
Luke: 'kingdom'|43 Matt: 'kingdom'|55, 'the kingdom of heaven'|33, 'kingdom of heaven is'|14, 'in the kingdom of'|7, 'into the kingdom of'|6, 'for the kingdom of'|5, 'enter into the kingdom'|5, 'the kingdom of god'|5, 'kingdom of heaven and'|5 | |
Luke: 'city'|36 Matt: 'city'|22 | |
Luke: 'the kingdom of god'|33 Matt: 'the kingdom of god'|5 | |
Luke: 'the son of man'|26 Matt: 'the son of man'|32 | |
Luke: 'father'|25 Matt: 'father'|20 | |
Luke: 'way'|24 Matt: 'way'|28 | |
Luke: 'name'|23 Matt: 'name'|20, 'in the name of'|6 | |
Luke: 'time'|23 Matt: 'time'|25 | |
Luke: 'mother'|22 Matt: 'mother'|27 | |
Luke: 'heaven'|20 Matt: 'heaven'|59, 'the kingdom of heaven'|33, 'which is in heaven'|14, 'kingdom of heaven is'|14, 'of heaven is like'|8, 'kingdom of heaven and'|5 | |
Luke: 'word'|20 Matt: 'word'|19 | |
Luke: 'hand'|19 Matt: 'hand'|31 | |
Luke: 'wife'|19 Matt: 'wife'|18 | |
Luke: 'peace'|18 Matt: 'peace'|6 | |
Luke: 'sabbath'|18 Matt: 'sabbath'|9 | |
Luke: 'temple'|18 Matt: 'temple'|20, 'in the temple and'|5 | |
Luke: 'multitude'|18 Matt: 'multitude'|26 | |
Luke: 'power'|18 Matt: 'power'|8 | |
Luke: 'woman'|17 Matt: 'woman'|10 | |
Luke: 'servant'|17 Matt: 'servant'|9 | |
Luke: 'child'|17 Matt: 'child'|18, 'the young child and'|5 | |
Luke: 'meat'|16 Matt: 'meat'|10 | |
Luke: 'country'|16 Matt: 'country'|8 | |
Luke: 'place'|16 Matt: 'place'|13 | |
Luke: 'spirit'|16 Matt: 'spirit'|5 | |
Luke: 'earth'|15 Matt: 'earth'|25 | |
Luke: 'lord'|15 Matt: 'lord'|20, 'angel of the lord'|5 | |
Luke: 'life'|15 Matt: 'life'|15 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
try:
    from termcolor import colored
except ImportError:
    # Colour is purely cosmetic: fall back to plain text when termcolor is
    # not installed instead of failing the whole report.
    def colored(text, color=None, on_color=None, attrs=None):
        return text

# Minimum fraction of this phrase's words that must also appear in another
# phrase for that phrase to be listed as a match.
PHRASE_MATCH_THRESHOLD = 0.4
# Column width the left-hand ("own collection") part is padded to in __str__.
PADDING = 45


class PhraseComparison:
    """Relate one phrase of a collection to the phrases of another collection.

    Attributes set by the constructor:
        collection_name: label of the collection the phrase comes from.
        phrase: the phrase itself (whitespace-separated words).
        count: how often the phrase occurs in its own collection.
        other_collection_name: label of the collection compared against.
        threshold: number shown in the "Less than N" message when nothing
            in the other collection matches.
        phrase_matches: list of ``{'phrase': ..., 'count': ...}`` dicts, one
            per entry of ``other_results`` that shares enough words with
            ``phrase`` (see ``PHRASE_MATCH_THRESHOLD``), in input order.
    """

    def __init__(self, collection_name, phrase, count,
                 other_collection_name, other_results, threshold):
        """Store the inputs and compute ``phrase_matches`` immediately.

        ``other_results`` is an iterable of ``(phrase, count)`` pairs.
        """
        self.collection_name = collection_name
        self.phrase = phrase
        self.count = count
        self.other_collection_name = other_collection_name
        self.threshold = threshold
        self.__compare(other_results)

    def __compare(self, other_results):
        """Populate ``self.phrase_matches`` from ``other_results``."""
        words = self.phrase.split()
        phrase_matches = []
        # An empty phrase has no words to share; skipping the loop avoids a
        # division by zero in the ratio below.
        if words:
            for (other_phrase, other_count) in other_results:
                # Split the other phrase once per candidate, not once per word.
                other_words = set(other_phrase.split())
                matches = sum(1 for word in words if word in other_words)
                if (matches / len(words)) >= PHRASE_MATCH_THRESHOLD:
                    phrase_matches.append(dict(
                        phrase=other_phrase,
                        count=other_count
                    ))
        self.phrase_matches = phrase_matches

    def __str__(self):
        """Return a one-line report: own phrase and count, then the matches."""
        # Build a real list: map() returns an iterator on Python 3, and
        # calling len() on it raises TypeError.
        matches = [self.__str_match_phrase(match)
                   for match in self.phrase_matches]
        if matches:
            matches_str = ', '.join(matches)
        else:
            matches_str = colored('Less than ' + str(self.threshold), 'red')
        count_str = colored(str(self.count), 'blue', attrs=['bold'])
        left = self.collection_name + ": '" + self.phrase + "'|" + count_str
        right = self.other_collection_name + ": " + matches_str
        return left.ljust(PADDING) + " " + right

    def __str_match_phrase(self, result):
        """Format one match; green if at least as frequent as own phrase."""
        color = 'green' if result['count'] >= self.count else 'red'
        return ("'" + result['phrase'] + "'|"
                + colored(str(result['count']), color))
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import nltk | |
import re | |
import operator | |
from nltk.util import ngrams | |
from nltk.tokenize import WordPunctTokenizer | |
from phrase_comparison import PhraseComparison | |
# Minimum number of occurrences for a word/phrase to be kept in the results;
# also passed to PhraseComparison for its "Less than N" message.
THRESHOLD = 5


class WordFrequency:
    """Frequency table of nouns and noun-bearing n-grams for one text file.

    The file is analysed on construction. ``results`` is a list of
    ``(phrase, count)`` pairs sorted by descending count, containing only
    entries that occur at least ``THRESHOLD`` times. An entry is either a
    single token tagged "NN" by ``nltk.pos_tag`` or a ``gram_count``-word
    phrase in which at least one word is tagged "NN".
    """

    # Archaic function words filtered out before counting.
    kjv_stopwords = ["thy", "thou", "art", "hath", "ye", "thee", "thine",
                     "shalt", "unto"]

    def __init__(self, name, filename, gram_count=4, tokenizer=None):
        """Store the settings and run the analysis.

        Args:
            name: label used when reporting on this text.
            filename: path of the text file to analyse.
            gram_count: number of words per n-gram phrase.
            tokenizer: object with a ``tokenize(text)`` method; a fresh
                ``WordPunctTokenizer`` is created when omitted, so no
                instance is shared through the default argument.
        """
        self.name = name
        self.filename = filename
        self.gram_count = gram_count
        if tokenizer is None:
            tokenizer = WordPunctTokenizer()
        self.tokenizer = tokenizer
        self.run()

    def run(self):
        """(Re)read the file and rebuild ``self.results``."""
        self.results = self.__top_words(self.__compact_content(self.filename))

    def compare_to(self, other, min_frequency=15):
        """Compare this text's frequent phrases with another WordFrequency.

        Returns one ``PhraseComparison`` per entry of ``self.results`` whose
        count is at least ``min_frequency``, in result order.
        """
        return [self.__compare(phrase, count, other)
                for (phrase, count) in self.results
                if count >= min_frequency]

    def print_results(self):
        """Print the name and the raw results, re-running if they are empty."""
        if not self.results:
            self.run()
        print(self.name)
        print(self.results)

    def __compare(self, phrase, count, other):
        """Build the comparison of one of our phrases against ``other``."""
        return PhraseComparison(
            self.name, phrase, count, other.name, other.results, THRESHOLD
        )

    def __should_append_word(self, word):
        """Return True for multi-character, letters/whitespace-only text
        that is not a stopword (case-insensitive)."""
        lower = word.lower()
        return bool(len(lower) > 1 and
                    re.match(r'^[a-zA-Z\s]+$', lower) and
                    lower not in self.kjv_stopwords)

    def __should_append(self, word, pos):
        """Return True if ``word`` is countable: valid text tagged as a noun
        ("NN") or marked as an n-gram phrase ("GRAM")."""
        return self.__should_append_word(word) and pos in ("NN", "GRAM")

    def __append_word(self, word, acc):
        """Increment the count of the lower-cased ``word`` in dict ``acc``."""
        lower = word.lower()
        acc[lower] = acc.get(lower, 0) + 1

    def __contains_noun(self, grams):
        """Return True if any word of the n-gram is tagged "NN"."""
        return any("NN" == pos for (_word, pos) in nltk.pos_tag(grams))

    def __combined_grams(self, tokens):
        """Return ``(phrase, "GRAM")`` pairs for every ``gram_count``-word
        window over ``tokens`` that contains a noun."""
        list_of_subgrams = ngrams(tokens, self.gram_count)
        return [(' '.join(subgrams), "GRAM")
                for subgrams in list_of_subgrams
                if self.__contains_noun(subgrams)]

    def __word_freq(self, content):
        """Count nouns and noun-bearing n-grams in ``content``.

        Returns a dict mapping lower-cased word/phrase to occurrence count.
        """
        tokens = [token for token in self.tokenizer.tokenize(content)
                  if self.__should_append_word(token)]
        grams = self.__combined_grams(tokens)
        freq_count = dict()
        # Single tokens carry their POS tag; phrases carry the "GRAM" marker.
        for (word, pos) in nltk.pos_tag(tokens) + grams:
            if self.__should_append(word, pos):
                self.__append_word(word, freq_count)
        return freq_count

    def __top_words(self, content):
        """Return ``(phrase, count)`` pairs with count >= THRESHOLD, most
        frequent first."""
        freq = {word: count
                for word, count in self.__word_freq(content).items()
                if count >= THRESHOLD}
        return sorted(freq.items(), key=operator.itemgetter(1), reverse=True)

    def __compact_content(self, name):
        """Read file ``name`` and return its stripped lines, each preceded
        by a single space, joined into one string."""
        # The context manager closes the file; the original left it open.
        with open(name) as f:
            return "".join(" " + line.strip() for line in f)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment