@macleginn
Created October 2, 2020 09:23
A script for extracting first-comma collocates from the Bible corpus.
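# Assumed input format (inferred from the line regex and the verse-ID cutoff
# used below): each file in ../corpus is one pretokenised translation, one
# verse per line, with a numeric verse ID first, e.g.
#
#     40001002 Abraham was the father of Isaac , and Isaac the father of Jacob .
#
# The verse text in this example is purely illustrative. IDs from 40001001
# upwards are treated as the New Testament.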
import re
import os
from math import log
from collections import Counter
import pandas as pd
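

# Binomial log-likelihood (up to the constant binomial coefficient) of k
# successes in n trials with success probability p. Note: this helper is not
# called anywhere else in the script.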
def logL(p, k, n):
    return k * log(p) + (n - k) * log(1 - p)
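

# Corpus-linguistics log-likelihood ratio in the style of Dunning / Rayson &
# Garside: E1 and E2 are the counts expected in the two samples if the pooled
# rate (k1 + k2) / (n1 + n2) held in both.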
def logLikelihoodRatio(k1, n1, k2, n2):
    E1 = n1 * (k1 + k2) / (n1 + n2)
    E2 = n2 * (k1 + k2) / (n1 + n2)
    return 2 * ((k1 * log(k1 / E1)) + (k2 * log(k2 / E2)))
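

# Association score for word w with the "before X" position (X is the first
# comma or the full stop): k1/n1 is roughly how often w itself precedes X,
# k2/n2 roughly how often the remaining words do.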
def get_score(w, before_x_counts, not_before_x_counts, before_x_total, not_before_x_total):
    count = before_x_counts[w]
    k1 = count + 10**(-6)  # To avoid the log-of-zero error.
    n1 = count + not_before_x_counts[w]
    k2 = before_x_total - count + 10**(-6)
    n2 = not_before_x_total
    return logLikelihoodRatio(k1, n1, k2, n2)


def split_sentences(line):
    """
    Splits text by {!, ?, .} and includes them in the output.

    Example usage:

    In[2]: split_sentences('Hi. What is your name? Nice to meet you!')
    Out[2]: ['Hi.', 'What is your name?', 'Nice to meet you!']
    """
    indices = []
    for idx, c in enumerate(line):
        if c in ['.', '!', '?']:
            indices.append(idx)
    sentences = []
    if not indices:
        sentences.append(line)
    else:
        start_idx = 0
        for idx in indices:
            sentences.append(line[start_idx:(idx + 1)].strip())
            start_idx = idx + 1
        suffix = line[start_idx:]
        if suffix:
            sentences.append(suffix)
    return sentences
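

# Main pipeline: for every translation in ../corpus, count which tokens occur
# immediately before the first comma of a sentence (vs. elsewhere, and vs. the
# full stop), rank them, and write out the rankings and per-verse indicators.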
if __name__ == '__main__':
    records = []
    binary_dict = {}  # Binary vectors indexed by line numbers indicating the presence of putative topic markers in different NT verses.
    line_pattern = re.compile(r'(\d+)\s+(.*)')
    for fname in os.listdir('../corpus'):
        if not fname.endswith('.txt'):
            continue
        print(fname)
        path = f'../corpus/{fname}'
        with open(path, 'r', encoding='utf-8') as inp:
            lines = inp.readlines()
        before_comma = Counter()
        not_before_comma = Counter()
        before_stop = Counter()
        not_before_stop = Counter()
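        # This scan stops at the first line whose ID starts with '40001000',
        # but nothing from it is used; NT verses are actually selected by the
        # ID check in the loop below.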
        for line in lines:
            if line.startswith('40001000'):
                break
        NT_lines = []
        NT_line_nos = []
        for line in lines:
            m = line_pattern.match(line)
            if m:
                line_n = m.group(1)
                if int(line_n) < 40001001:
                    continue
                line = m.group(2)
                NT_line_nos.append(line_n)
                NT_lines.append(line.split())  # For looking at by-verse distributions
                # Split into sentences, compute collocations with the first comma in each sentence
                for sentence in split_sentences(line):
                    tokens = sentence.split()  # The corpus is pretokenised
                    if len(tokens) <= 1:
                        continue
                    first_comma_found = False
                    for i in range(len(tokens) - 1):
                        w1 = tokens[i]
                        w2 = tokens[i + 1]
                        if w2 == ',' and not first_comma_found:
                            first_comma_found = True
                            before_comma[w1] += 1
                        else:
                            not_before_comma[w1] += 1
                        if w2 == '.':
                            before_stop[w1] += 1
                        else:
                            not_before_stop[w1] += 1
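        # Score every word seen before a first comma: a large positive
        # comma_score - stop_score difference marks words that attract the
        # first comma much more strongly than the full stop.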
        word_rankings = {}
        before_comma_total = sum(before_comma.values())
        before_stop_total = sum(before_stop.values())
        not_before_comma_total = sum(not_before_comma.values())
        not_before_stop_total = sum(not_before_stop.values())
        for w in before_comma:
            comma_score = get_score(w, before_comma, not_before_comma, before_comma_total, not_before_comma_total)
            stop_score = get_score(w, before_stop, not_before_stop, before_stop_total, not_before_stop_total)
            word_rankings[w] = (comma_score, stop_score)
        results = sorted(word_rankings.items(), key=lambda x: x[1][0] - x[1][1], reverse=True)
        if results:
            tmp = []
            tmp.append(fname[:-4])
            for w, stat in results[:5]:
                tmp.append(w)
                tmp.append(round(stat[0] - stat[1]))
            records.append(tmp)
            # Now find the distribution in NT verses for the top-scoring word
            top_marker = results[0][0]
            key = f'{fname[:-4]}-{top_marker}'
            binary_dict[key] = {}
            for verse_n, verse in zip(NT_line_nos, NT_lines):
                binary_dict[key][verse_n] = 1 if top_marker in verse else 0
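    # Write the top five candidates per language (with their score differences)
    # and, for each language, a 0/1 vector marking the NT verses that contain
    # its top-ranked candidate.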
    result_df = pd.DataFrame.from_records(
        records,
        columns=['language', 'no1', 'score', 'no2', 'score',
                 'no3', 'score', 'no4', 'score', 'no5', 'score'])
    result_df.to_csv('result_first_comma.csv', index=False)
    binary_df = pd.DataFrame(binary_dict)
    binary_df.to_csv('binary_first_comma.csv')