Skip to content

Instantly share code, notes, and snippets.

@rcdilorenzo
Last active December 3, 2017 01:32
Show Gist options
  • Save rcdilorenzo/9b410488df9e411dff53469a2fb0b0c0 to your computer and use it in GitHub Desktop.
Save rcdilorenzo/9b410488df9e411dff53469a2fb0b0c0 to your computer and use it in GitHub Desktop.
Text analysis comparison between the Synoptic Gospels of Matthew and Luke, created as part of a graduate assignment for an M.S. in Data Science from Regis University. To run it, download all the files into one directory and run `python bisynoptic_text_analysis.py` from that directory.
from word_frequency import WordFrequency
# Build the frequency tables for both gospels (each constructor reads and
# analyses its file immediately).
luke = WordFrequency('Luke', './Luke.txt')
matthew = WordFrequency('Matt', './Matthew.txt')

# Report banner, followed by one blank line.
print('\n'.join([
    '================================',
    '= Frequently Occurring Phrases =',
    '= Between Matthew & Luke (Max =',
    '= 4 words/phrase, Min freq 15) =',
    '================================',
    '',
]))

# Matthew's frequent phrases, each shown next to the similar phrases in Luke.
mtl_comparisons = matthew.compare_to(luke)
for comparison in mtl_comparisons:
    print(comparison)

# The same report in the other direction: Luke's phrases against Matthew.
ltm_comparisons = luke.compare_to(matthew)
for comparison in ltm_comparisons:
    print(comparison)
================================
= Frequently Occurring Phrases =
= Between Matthew & Luke (Max =
= 4 words/phrase, Min freq 15) =
================================
Matt: 'man'|127 Luke: 'man'|132, 'the son of man'|26, 'son of man is'|5
Matt: 'heaven'|59 Luke: 'heaven'|20
Matt: 'kingdom'|55 Luke: 'kingdom'|43, 'the kingdom of god'|33, 'kingdom of god and'|10, 'kingdom of god is'|7, 'in the kingdom of'|5
Matt: 'day'|39 Luke: 'day'|64, 'on the sabbath day'|8, 'the sabbath day and'|5
Matt: 'house'|36 Luke: 'house'|58
Matt: 'the kingdom of heaven'|33 Luke: Less than 5
Matt: 'the son of man'|32 Luke: 'the son of man'|26
Matt: 'hand'|31 Luke: 'hand'|19
Matt: 'way'|28 Luke: 'way'|24
Matt: 'mother'|27 Luke: 'mother'|22
Matt: 'multitude'|26 Luke: 'multitude'|18
Matt: 'earth'|25 Luke: 'earth'|15
Matt: 'time'|25 Luke: 'time'|23
Matt: 'prophet'|24 Luke: 'prophet'|9
Matt: 'son'|22 Luke: 'son'|101, 'which was the son'|75, 'was the son of'|75, 'the son of man'|26, 'son of man is'|5
Matt: 'city'|22 Luke: 'city'|36
Matt: 'forth'|21 Luke: 'forth'|14
Matt: 'hour'|21 Luke: 'hour'|13
Matt: 'father'|20 Luke: 'father'|25
Matt: 'lord'|20 Luke: 'lord'|15
Matt: 'temple'|20 Luke: 'temple'|18
Matt: 'name'|20 Luke: 'name'|23
Matt: 'brother'|19 Luke: 'brother'|12
Matt: 'word'|19 Luke: 'word'|20, 'the word of god'|5
Matt: 'world'|18 Luke: 'world'|10
Matt: 'sea'|18 Luke: Less than 5
Matt: 'child'|18 Luke: 'child'|17
Matt: 'wife'|18 Luke: 'wife'|19
Matt: 'hast'|16 Luke: 'hast'|14
Matt: 'eye'|16 Luke: 'eye'|10
Matt: 'body'|15 Luke: 'body'|13
Matt: 'field'|15 Luke: 'field'|6
Matt: 'life'|15 Luke: 'life'|15
Matt: 'heart'|15 Luke: 'heart'|12
Matt: 'fruit'|15 Luke: 'fruit'|12
Luke: 'man'|132 Matt: 'man'|127, 'the son of man'|32, 'son of man be'|6, 'son of man shall'|6, 'son of man is'|5
Luke: 'son'|101 Matt: 'the son of man'|32, 'son'|22, 'son of man be'|6, 'son of man shall'|6, 'son of man is'|5
Luke: 'which was the son'|75 Matt: Less than 5
Luke: 'was the son of'|75 Matt: Less than 5
Luke: 'day'|64 Matt: 'day'|39
Luke: 'house'|58 Matt: 'house'|36
Luke: 'kingdom'|43 Matt: 'kingdom'|55, 'the kingdom of heaven'|33, 'kingdom of heaven is'|14, 'in the kingdom of'|7, 'into the kingdom of'|6, 'for the kingdom of'|5, 'enter into the kingdom'|5, 'the kingdom of god'|5, 'kingdom of heaven and'|5
Luke: 'city'|36 Matt: 'city'|22
Luke: 'the kingdom of god'|33 Matt: 'the kingdom of god'|5
Luke: 'the son of man'|26 Matt: 'the son of man'|32
Luke: 'father'|25 Matt: 'father'|20
Luke: 'way'|24 Matt: 'way'|28
Luke: 'name'|23 Matt: 'name'|20, 'in the name of'|6
Luke: 'time'|23 Matt: 'time'|25
Luke: 'mother'|22 Matt: 'mother'|27
Luke: 'heaven'|20 Matt: 'heaven'|59, 'the kingdom of heaven'|33, 'which is in heaven'|14, 'kingdom of heaven is'|14, 'of heaven is like'|8, 'kingdom of heaven and'|5
Luke: 'word'|20 Matt: 'word'|19
Luke: 'hand'|19 Matt: 'hand'|31
Luke: 'wife'|19 Matt: 'wife'|18
Luke: 'peace'|18 Matt: 'peace'|6
Luke: 'sabbath'|18 Matt: 'sabbath'|9
Luke: 'temple'|18 Matt: 'temple'|20, 'in the temple and'|5
Luke: 'multitude'|18 Matt: 'multitude'|26
Luke: 'power'|18 Matt: 'power'|8
Luke: 'woman'|17 Matt: 'woman'|10
Luke: 'servant'|17 Matt: 'servant'|9
Luke: 'child'|17 Matt: 'child'|18, 'the young child and'|5
Luke: 'meat'|16 Matt: 'meat'|10
Luke: 'country'|16 Matt: 'country'|8
Luke: 'place'|16 Matt: 'place'|13
Luke: 'spirit'|16 Matt: 'spirit'|5
Luke: 'earth'|15 Matt: 'earth'|25
Luke: 'lord'|15 Matt: 'lord'|20, 'angel of the lord'|5
Luke: 'life'|15 Matt: 'life'|15
from termcolor import colored
# Minimum fraction of a phrase's words that must occur in another phrase for
# the two to be reported as a match.
PHRASE_MATCH_THRESHOLD = 0.4
# Width the left-hand column is padded to when a comparison is rendered.
PADDING = 45


class PhraseComparison:
    """Pair one phrase of a collection with the similar phrases of another.

    On construction the phrase is compared against every entry of
    ``other_results``; entries sharing at least ``PHRASE_MATCH_THRESHOLD`` of
    this phrase's words are stored in ``phrase_matches`` as dicts with the
    keys ``phrase`` and ``count``. ``str()`` renders the comparison as one
    colored report line.
    """

    def __init__(self, collection_name, phrase, count,
                 other_collection_name, other_results, threshold):
        """Store the phrase data and compute the matches immediately.

        Args:
            collection_name: Label printed in front of ``phrase``.
            phrase: Space-separated phrase being compared.
            count: Number of occurrences of ``phrase`` in its collection.
            other_collection_name: Label printed in front of the matches.
            other_results: Iterable of ``(phrase, count)`` pairs from the
                other collection.
            threshold: Value shown in the "Less than ..." message when no
                match is found; it is not used for the matching itself.
        """
        self.collection_name = collection_name
        self.phrase = phrase
        self.count = count
        self.other_collection_name = other_collection_name
        self.threshold = threshold
        self.__compare(other_results)

    def __compare(self, other_results):
        """Fill ``self.phrase_matches`` with the sufficiently similar phrases."""
        words = self.phrase.split()
        phrase_matches = []
        # A phrase without words cannot match anything; skipping the loop
        # also avoids dividing by zero below.
        if words:
            for (other_phrase, other_count) in other_results:
                # Split once per candidate instead of once per word.
                other_words = set(other_phrase.split())
                # Repeated words in this phrase each count, as before.
                matches = sum(1 for word in words if word in other_words)
                if (matches / len(words)) >= PHRASE_MATCH_THRESHOLD:
                    phrase_matches.append(dict(
                        phrase=other_phrase,
                        count=other_count
                    ))
        self.phrase_matches = phrase_matches

    def __str__(self):
        """Return ``"<name>: '<phrase>'|<count>"`` padded, then the matches."""
        # Build a real list: on Python 3 map() returns a lazy iterator, which
        # has no len() and made the original emptiness check raise TypeError.
        matches = [self.__str_match_phrase(match)
                   for match in self.phrase_matches]
        less_than_msg = colored('Less than ' + str(self.threshold), 'red')
        matches_str = ', '.join(matches) if matches else less_than_msg
        count_str = colored(str(self.count), 'blue', attrs=['bold'])
        left = self.collection_name + ": '" + self.phrase + "'|" + count_str
        right = self.other_collection_name + ": " + matches_str
        return left.ljust(PADDING) + " " + right

    def __str_match_phrase(self, result):
        """Render one match; green when its count is >= this phrase's count."""
        color = 'green' if result['count'] >= self.count else 'red'
        return "'" + result['phrase'] + "'|" + colored(str(result['count']), color)
import nltk
import re
import operator
from nltk.util import ngrams
from nltk.tokenize import WordPunctTokenizer
from phrase_comparison import PhraseComparison
# Minimum number of occurrences for a word or phrase to be kept in the
# results; also passed to PhraseComparison for its "Less than ..." message.
THRESHOLD = 5


class WordFrequency:
    """Count the frequent nouns and noun-bearing n-grams of a text file.

    The file is read and analysed on construction. ``results`` is a list of
    ``(phrase, count)`` pairs sorted by descending count, limited to entries
    that occur at least ``THRESHOLD`` times.
    """

    # Archaic words that are dropped before counting.
    kjv_stopwords = ["thy", "thou", "art", "hath", "ye", "thee", "thine", "shalt", "unto"]

    def __init__(self, name, filename, gram_count=4, tokenizer=None):
        """Store the settings and run the analysis.

        Args:
            name: Label used when printing and comparing results.
            filename: Path of the text file to analyse.
            gram_count: Number of tokens per n-gram.
            tokenizer: Object with a ``tokenize(text)`` method. Defaults to a
                new ``WordPunctTokenizer`` per instance (created here rather
                than in the signature, so it is not built at import time and
                shared between instances).
        """
        self.name = name
        self.filename = filename
        self.gram_count = gram_count
        self.tokenizer = WordPunctTokenizer() if tokenizer is None else tokenizer
        self.run()

    def run(self):
        """Read ``self.filename`` and (re)compute ``self.results``."""
        self.results = self.__top_words(self.__compact_content(self.filename))

    def compare_to(self, other, min_frequency=15):
        """Return a PhraseComparison against ``other`` for each frequent phrase.

        Only phrases of this instance with a count of at least
        ``min_frequency`` are compared.
        """
        return [self.__compare(phrase, count, other)
                for (phrase, count) in self.results
                if count >= min_frequency]

    def print_results(self):
        """Print the name and the raw results, re-running if they are empty."""
        if not self.results:
            self.run()
        print(self.name)
        print(self.results)

    def __compare(self, phrase, count, other):
        """Build the comparison of one phrase against all results of ``other``."""
        return PhraseComparison(
            self.name, phrase, count, other.name, other.results, THRESHOLD
        )

    def __should_append_word(self, word):
        """Return True for letter/whitespace-only text longer than one
        character that is not a KJV stopword (case-insensitive)."""
        lower = word.lower()
        return bool(len(lower) > 1 and re.match(r'^[a-zA-Z\s]+$', lower) and
                    lower not in self.kjv_stopwords)

    def __should_append(self, word, pos):
        """Return whether a tagged entry is counted: it must pass the word
        filter and be tagged "NN" or be an n-gram (tag "GRAM")."""
        return self.__should_append_word(word) and (pos == "NN" or pos == "GRAM")

    def __append_word(self, word, acc):
        """Increment the count of the lower-cased ``word`` in dict ``acc``."""
        lower = word.lower()
        acc[lower] = acc.get(lower, 0) + 1

    def __contains_noun(self, grams):
        """Return True if nltk tags at least one token of ``grams`` as "NN"."""
        return any(pos == "NN" for (_word, pos) in nltk.pos_tag(grams))

    def __combined_grams(self, tokens):
        """Return ``(text, "GRAM")`` for every n-gram of ``gram_count`` tokens
        that contains a noun, with the tokens joined by single spaces."""
        list_of_subgrams = ngrams(tokens, self.gram_count)
        return [(' '.join(subgrams), "GRAM")
                for subgrams in list_of_subgrams
                if self.__contains_noun(subgrams)]

    def __word_freq(self, content):
        """Return a dict mapping each counted word or n-gram to its count."""
        freq_count = dict()
        # Tokens are filtered first, so n-grams are built from the surviving
        # tokens only and may span removed punctuation or stopwords.
        tokens = [token for token in self.tokenizer.tokenize(content)
                  if self.__should_append_word(token)]
        grams = self.__combined_grams(tokens)
        tagged = nltk.pos_tag(tokens) + grams
        for (word, pos) in tagged:
            if self.__should_append(word, pos):
                self.__append_word(word, freq_count)
        return freq_count

    def __top_words(self, content):
        """Return ``(phrase, count)`` pairs with count >= THRESHOLD, sorted by
        descending count."""
        freq = {word: count
                for word, count in self.__word_freq(content).items()
                if count >= THRESHOLD}
        return sorted(freq.items(), key=operator.itemgetter(1), reverse=True)

    def __compact_content(self, name):
        """Return the file's lines, each stripped and prefixed with a space,
        concatenated into one string."""
        # The context manager closes the file, which the original never did.
        with open(name) as f:
            return "".join(" " + line.strip() for line in f)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment