Created
October 2, 2020 09:23
-
-
Save macleginn/c4475302a724963c1c2b6af08534dfcf to your computer and use it in GitHub Desktop.
A script for extracting first-comma collocates from the Bible corpus.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re | |
import os | |
from math import log | |
from collections import Counter | |
import pandas as pd | |
def logL(p, k, n):
    """Binomial log-likelihood of k successes out of n trials at success probability p.

    Note: raises ValueError (log of a non-positive number) if p is 0 or 1.
    """
    success_term = k * log(p)
    failure_term = (n - k) * log(1 - p)
    return success_term + failure_term
def logLikelihoodRatio(k1, n1, k2, n2):
    """Log-likelihood-ratio (G) statistic for two observed counts k1 and k2.

    Expected counts E1, E2 distribute the pooled total k1 + k2 across the two
    samples in proportion to their sizes n1 and n2.  Returns 0 when the
    observed counts match the pooled expectation exactly.
    """
    pooled = k1 + k2
    sample_size = n1 + n2
    E1 = n1 * pooled / sample_size
    E2 = n2 * pooled / sample_size
    deviation1 = k1 * log(k1 / E1)
    deviation2 = k2 * log(k2 / E2)
    return 2 * (deviation1 + deviation2)
def get_score(w, before_x_counts, not_before_x_counts, before_x_total, not_before_x_total):
    """Score how strongly word w collocates with position X via a log-likelihood ratio.

    Compares w's count immediately before X against the count of all other
    words before X, each relative to its total number of occurrences.
    """
    eps = 10**(-6)  # To avoid the log-of-zero error.
    hits = before_x_counts[w]
    k1 = hits + eps
    n1 = hits + not_before_x_counts[w]
    k2 = before_x_total - hits + eps
    n2 = not_before_x_total
    return logLikelihoodRatio(k1, n1, k2, n2)
def split_sentences(line):
    """
    Splits text by {!, ?, .} and includes them in the output.

    Every returned sentence is stripped of surrounding whitespace, and a
    whitespace-only (or empty) trailing fragment is discarded, so the output
    never contains blank "sentences".

    Example usage:

    In[2]: split_sentences('Hi. What is your name? Nice to meet you!')
    Out[2]: ['Hi.', 'What is your name?', 'Nice to meet you!']
    """
    terminators = {'.', '!', '?'}
    sentences = []
    start_idx = 0
    for idx, c in enumerate(line):
        if c in terminators:
            # The slice includes the terminator itself, so a stripped
            # sentence here is never empty.
            sentences.append(line[start_idx:idx + 1].strip())
            start_idx = idx + 1
    # Trailing text after the last terminator (or the whole line if there
    # was none).  Strip it too, for consistency with the sentences above;
    # previously an unstripped / whitespace-only tail leaked through.
    suffix = line[start_idx:].strip()
    if suffix:
        sentences.append(suffix)
    return sentences
if __name__ == '__main__':
    # Per-language result rows for the summary CSV.
    records = []
    binary_dict = {}  # Binary vectors indexed by line numbers indicating the presence of putative topic markers in different NT verses.
    # Corpus lines look like "<verse id> <pretokenised text>".
    line_pattern = re.compile(r'(\d+)\s+(.*)')
    for fname in os.listdir('../corpus'):
        if not fname.endswith('.txt'):
            continue
        print(fname)
        path = f'../corpus/{fname}'
        with open(path, 'r', encoding='utf-8') as inp:
            lines = inp.readlines()
        # Counts of words appearing immediately before the first comma of a
        # sentence vs. in any other position, and likewise for the full stop.
        before_comma = Counter()
        not_before_comma = Counter()
        before_stop = Counter()
        not_before_stop = Counter()
        # NOTE(review): this loop has an empty body -- it scans lines up to
        # verse id 40001000 and does nothing with them.  Looks vestigial
        # (the NT filter below already skips pre-NT verses); confirm before
        # removing.
        for line in lines:
            if line.startswith('40001000'):
                break
        NT_lines = []     # tokenised NT verses, parallel to NT_line_nos
        NT_line_nos = []  # verse-id strings for the NT verses
        for line in lines:
            m = line_pattern.match(line)
            if m:
                line_n = m.group(1)
                # Keep only New Testament verses (ids from 40001001 upward,
                # i.e. starting at Matthew 1:1 in this corpus's numbering --
                # presumably; verify against the corpus format).
                if int(line_n) < 40001001:
                    continue
                line = m.group(2)
                NT_line_nos.append(line_n)
                NT_lines.append(line.split())  # For looking at by-verse distributions
                # Split into sentences, compute collocations with the first comma in each sentence
                for sentence in split_sentences(line):
                    tokens = sentence.split()  # The corpus is pretokenised
                    if len(tokens) <= 1:
                        continue
                    first_comma_found = False
                    # Walk adjacent token pairs (w1, w2); w1 is credited to
                    # before_comma only for the first comma of the sentence.
                    for i in range(len(tokens)-1):
                        w1 = tokens[i]
                        w2 = tokens[i+1]
                        if w2 == ',' and not first_comma_found:
                            first_comma_found = True
                            before_comma[w1] += 1
                        else:
                            not_before_comma[w1] += 1
                        # Full-stop counts are the comparison baseline; every
                        # stop counts, not just the first one.
                        if w2 == '.':
                            before_stop[w1] += 1
                        else:
                            not_before_stop[w1] += 1
        word_rankings = {}  # word -> (comma LLR score, stop LLR score)
        before_comma_total = sum(before_comma.values())
        before_stop_total = sum(before_stop.values())
        not_before_comma_total = sum(not_before_comma.values())
        not_before_stop_total = sum(not_before_stop.values())
        for w in before_comma:
            comma_score = get_score(w, before_comma, not_before_comma, before_comma_total, not_before_comma_total)
            stop_score = get_score(w, before_stop, not_before_stop, before_stop_total, not_before_stop_total)
            word_rankings[w] = (comma_score, stop_score)
        # Rank words by how much more strongly they collocate with the first
        # comma than with the full stop (descending).
        results = sorted(word_rankings.items(), key=lambda x: x[1][0] - x[1][1], reverse=True)
        if results:
            # Row format: language, then (word, rounded score difference)
            # for the top five candidates.
            tmp = []
            tmp.append(fname[:-4])  # language id = file name minus '.txt'
            for w, stat in results[:5]:
                tmp.append(w)
                tmp.append(round(stat[0]-stat[1]))
            records.append(tmp)
            # Now find the distribution in NT verses for the top-scoring word
            top_marker = results[0][0]
            key = f'{fname[:-4]}-{top_marker}'
            binary_dict[key] = {}
            for verse_n, verse in zip(NT_line_nos, NT_lines):
                binary_dict[key][verse_n] = 1 if top_marker in verse else 0
    # NOTE(review): the repeated 'score' column name is legal in pandas but
    # makes by-name column access ambiguous downstream -- confirm intended.
    result_df = pd.DataFrame.from_records(records, columns=['language', 'no1', 'score', 'no2', 'score', 'no3', 'score', 'no4', 'score', 'no5', 'score'])
    result_df.to_csv('result_first_comma.csv', index=False)
    # Columns = '<language>-<top marker>', rows = NT verse ids, cells = 0/1
    # presence of the marker in that verse.
    binary_df = pd.DataFrame(binary_dict)
    binary_df.to_csv('binary_first_comma.csv')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment