Created
August 8, 2011 22:33
-
-
Save Wizek/1132918 to your computer and use it in GitHub Desktop.
cmavo frequency counter
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def progress(str):
    """Print a progress message prefixed with ``Info: ``.

    Args:
        str: The message to report. The parameter keeps its original name
            (which shadows the builtin inside this function) so existing
            callers are unaffected. Any object is accepted: the message is
            rendered with ``%s`` formatting instead of string concatenation,
            so a non-string value no longer raises ``TypeError``.
    """
    # Wrap in a 1-tuple so a tuple message is formatted as a single value.
    print('Info: %s' % (str,))


progress('Initializing...')
import csv | |
import re | |
import operator | |
# Global constants
# Path of the cmavo table; it is read with csv.reader using ';' as delimiter.
DICTIONARY_PATH = 'cmavo.csv'
# Path of a larger text sample; currently only referenced by disabled code.
SAMPLE_TEXT = 'grep_results.txt'

# Define some variables used later globally
# One dict per dictionary row: column name -> cell value, plus a 'freq' count.
cmavoList = []
# Scratch list holding the raw CSV rows (header row first).
_tmp = []
# Maps each generated regex source string to the index of its row in cmavoList.
cmavoRegExps = {}

# The text that is actually scanned for cmavo occurrences.
sampleText = """u'i nai"""

# Larger sample kept for reference only: this bare string literal is not
# assigned to anything and therefore has no effect at runtime.
"""
ua
u'i
oi
ia
u'i
uinairu'e
uinairu'esai
uinaidai
ei
u'i
a'o
u'i
ui
oi
u'e nai
u'e nai
ie
u'e nai
ui
uisai
u'i
u'i
ua sai
i'e
i'o
uo
uo
ia
ue
ue
ii
i'i
ui
ui nai
uinai
ui
ui
ui
uinai
ua
ui
ua
u'i
e'enai
ua
ui
ua
ui
ei
o'u bu'o
a'o
ei
u'u
a'ucu'idai
"""
#
# Load the cmavo dictionary and build one search regex per cmavo.
#
# Open the file where we have the sample for the keywords. The ``with`` block
# guarantees the handle is closed, and newline='' is the mode the csv module
# documents for files handed to csv.reader.
with open(DICTIONARY_PATH, 'r', newline='') as dictionaryFile:
    csvReader = csv.reader(dictionaryFile, delimiter=';', quotechar='"')
    # Read in all the lines
    _tmp.extend(csvReader)

# Shorthand; an empty file yields an empty header and no data rows instead of
# an IndexError.
header = _tmp[0] if _tmp else []

# Match for consonant inside the cluster to break there, and let the magic be
# inserted.
matchFor = r'\B(?=[bcdfgjklmnprstvxz])'
# Allow all kind of grammatical stuff (even hesitation) to come in between the
# needed words:
# sai, cai, dai, ru'e, re'e, ro'a, ro'e, ro'i, ro'o, ro'u, pei, bu'o, yyy, .yy, ....
# but not {nai} and {cu'i}, because those are defined.
replaceWith = r"(\s|[scd]ai|r[ue]'e|ro'[aeiou]|pei|bu'o|\.+|\.?y+\.?)*"

# Build inner data structure of the table
#
# Iterate over all the data rows (everything after the header), keeping the
# row number so a regex can be mapped back to its cmavoList entry.
for rowIdx, row in enumerate(_tmp[1:]):
    # Add a dict to the list
    cmavoList.append({'freq': 0})
    # Pair every cell with its column header; cells beyond the last header
    # column are ignored.
    for colName, cell in zip(header, row):
        # If we are in a cell whose column header is 'cmavo'
        if colName == 'cmavo':
            # Remove . before vowels to broaden search later
            cell = cell.replace('.', '')
            # Insert the filler pattern before every word-internal consonant.
            # A callable replacement is used because re.sub() interprets
            # backslashes in a *string* replacement as template escapes, and
            # "\s" is rejected as a bad escape on Python 3.7+.
            cellTornApart = re.sub(matchFor, lambda _match: replaceWith, cell)
            # Add even to the end, not just in between, plus ensure we do not
            # match partial strings (word boundary, and no apostrophe around).
            cellTornApart = r"\b(?<!')" + cellTornApart + replaceWith + r"\b(?!')"
            # Register an index for row number by cmavo regex
            cmavoRegExps[cellTornApart] = rowIdx
        # Add the cell to the proper row by the proper header
        cmavoList[rowIdx][colName] = cell
# Count how often every cmavo pattern occurs in the sample text, then print
# the cmavo ordered by ascending frequency.
sampleTextArray = sampleText.split('\n')
totalLines = len(sampleTextArray)
# Compile every pattern once instead of handing the pattern string to the re
# module again for each line.
compiledRegExps = [
    (re.compile(regEx), cmavoIdx) for regEx, cmavoIdx in cmavoRegExps.items()
]
for i, line in enumerate(sampleTextArray):
    progress('processing line %i out of %i total' % (i + 1, totalLines))
    for compiledRegEx, cmavoIdx in compiledRegExps:
        # Count the matches directly. The previous
        # (len(re.split(...)) - 1) / 2 formula was only right for patterns
        # with exactly one capture group: re.split() also returns every
        # group of every match, and patterns for cmavo with an internal
        # consonant contain two or more groups. It also produced floats.
        numberOfMatches = len(compiledRegEx.findall(line))
        cmavoList[cmavoIdx]['freq'] += numberOfMatches
progress('done')
# Debug output of the third entry's count; skipped when the dictionary has
# fewer than three rows instead of raising IndexError.
if len(cmavoList) > 2:
    print(cmavoList[2]['freq'])
cmavoList.sort(key=operator.itemgetter('freq'))
for entry in cmavoList:
    print("cmavo: %s\t\tFreq: %i" % (entry['cmavo'], entry['freq']))

# Disabled scratch code kept for reference. Both blocks below are bare string
# literals, so nothing in them is executed.
"""
sort regx in cmavoRegExps:
print(regx)
#print(cmavoRegExps)
#
# Start reading the text sample
for line in open(SAMPLE_TEXT).readlines():
pass
#cmavoList = _tmp[1:]
#print(cmavoList[0])
"""
"""
print('Counting...\n')
counts = {}
f = open('grep_results.txt')
for line in f.readlines():
word = line[:-1]
if word in counts:
counts[word] += 1
else:
counts[word] = 1
#print([word, counts[word]])
# for word in counts:
# print(x)
def wizek(k, v):
return (v, k)
for key, value in sorted(counts, key=wizek):
print("%s: %s" % (key, value))
#print(sorted(counts.items()))
f.close()
"""
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment