# cmavo frequency counter
# GitHub gist Wizek/1132918 by @Wizek, created August 8, 2011 22:33.
def progress(str):
    """Print a status message to stdout, prefixed with ``Info: ``.

    ``str`` is concatenated to the prefix as-is, so it has to be a string.
    The parameter name shadows the builtin; it is kept unchanged so that
    existing callers (including keyword callers) keep working.
    """
    print('Info: ' + str)
import csv
import operator
import re

progress('Initializing...')

# Input locations.
DICTIONARY_PATH = 'cmavo.csv'
SAMPLE_TEXT = 'grep_results.txt'

# Module-level state shared by the steps further down.
cmavoRegExps = {}   # generated regular expression -> index into cmavoList
cmavoList = []      # one dict per dictionary row; each carries a 'freq' counter
_tmp = []           # raw CSV rows, header row included
# Text that gets scanned for cmavo.
sampleText = "u'i nai"
# NOTE: the string literal below is a bare expression statement and has no
# effect at runtime. The literal assigned to sampleText just above is already
# closed after its first line, so none of the following lines are part of the
# scanned text.
"""
ua
u'i
oi
ia
u'i
uinairu'e
uinairu'esai
uinaidai
ei
u'i
a'o
u'i
ui
oi
u'e nai
u'e nai
ie
u'e nai
ui
uisai
u'i
u'i
ua sai
i'e
i'o
uo
uo
ia
ue
ue
ii
i'i
ui
ui nai
uinai
ui
ui
ui
uinai
ua
ui
ua
u'i
e'enai
ua
ui
ua
ui
ei
o'u bu'o
a'o
ei
u'u
a'ucu'idai
"""
#
# Read every row of the ';'-separated dictionary file into _tmp.
# The file is opened in a `with` block so the handle is closed once all rows
# have been read, and with newline='' as the csv module asks for, so that
# quoted fields containing line breaks are parsed correctly.
with open(DICTIONARY_PATH, 'r', newline='') as dictionaryFile:
    csvReader = csv.reader(dictionaryFile, delimiter=';', quotechar='"')
    _tmp.extend(csvReader)
# Shorthand: the first row holds the column names
header = _tmp[0]
# Build inner data structure of the table: one dict per data row in cmavoList,
# plus one search pattern per cmavo in cmavoRegExps (pattern -> row index).
#
# The two pattern pieces are identical for every row, so they are defined once.
# Raw strings keep the regex backslashes intact without relying on Python
# passing unknown string escapes through.
#
# Zero-width position inside a word, right before a consonant: this is where
# a compound cmavo is broken apart so the filler can be inserted.
matchFor = r'\B(?=[bcdfgjklmnprstvxz])'
# Filler allowed between (and after) the parts: whitespace, sai/cai/dai,
# ru'e/re'e, ro'a..ro'u, pei, bu'o, dots and y-hesitation.
# {nai} and {cu'i} are deliberately not part of the filler.
replaceWith = r"(\s|[scd]ai|r[ue]'e|ro'[aeiou]|pei|bu'o|\.+|\.?y+\.?)*"

# Iterate over all the data rows (the header row is skipped)
for rowIdx, row in enumerate(_tmp[1:]):
    # Every row starts out with a frequency of zero
    cmavoList.append({'freq': 0})
    for colIdx, cell in enumerate(row):
        # Shorthand to access current header name
        colName = header[colIdx]
        if colName == 'cmavo':
            # Remove . before vowels to broaden search later
            cell = cell.replace('.', '')
            # The filler is handed over through a function so that re.sub()
            # inserts it verbatim. Passed as a plain replacement string, its
            # `\s` is treated as a template escape and raises re.error on
            # Python 3.7+.
            cellTornApart = re.sub(matchFor, lambda _match: replaceWith, cell)
            # Allow filler at the end as well, and refuse partial-word matches
            # (no word character or apostrophe directly around the match)
            cellTornApart = r"\b(?<!')" + cellTornApart + replaceWith + r"\b(?!')"
            # Register an index for row number by cmavoRegEx
            cmavoRegExps[cellTornApart] = rowIdx
        # Store the cell under its column name (the cmavo cell without dots)
        cmavoList[rowIdx][colName] = cell
# Count, line by line, how often each cmavo pattern occurs in the sample text
# and add the result to the 'freq' counter of the matching cmavoList entry.
sampleTextArray = sampleText.split('\n')
# Compile each pattern once instead of once per line; with many patterns the
# re module's internal cache cannot be relied on to hold all of them.
compiledRegExps = [
    (re.compile(regEx), cmavoIdx) for regEx, cmavoIdx in cmavoRegExps.items()
]
for i, line in enumerate(sampleTextArray):
    progress('processing line %i out of %i total'%(i+1, len(sampleTextArray)))
    for compiledRegEx, cmavoIdx in compiledRegExps:
        # Count the non-overlapping matches directly. Deriving the count from
        # len(re.split(...)) is only right for patterns with exactly one
        # capture group: split() also returns every group of every match, and
        # multi-part cmavo patterns contain one group per inserted filler.
        numberOfMatches = sum(1 for _ in compiledRegEx.finditer(line))
        cmavoList[cmavoIdx]['freq'] += numberOfMatches
progress('done')
# Debug aid: frequency of the third dictionary entry. Guarded so that a
# dictionary with fewer than three entries does not end in an IndexError.
if len(cmavoList) > 2:
    print(cmavoList[2]['freq'])
# Print all entries ordered by ascending frequency (least frequent first)
cmavoList.sort(key=operator.itemgetter('freq'))
for entry in cmavoList:
    print("cmavo: %s\t\tFreq: %i"%(entry['cmavo'], entry['freq']))
# Disabled code, kept inside a bare string literal so it is never executed.
# NOTE(review): it is not valid Python as written (`sort regx in ...`, no
# indentation); it reads like an unfinished attempt to list the patterns and
# to read SAMPLE_TEXT from disk.
"""
sort regx in cmavoRegExps:
print(regx)
#print(cmavoRegExps)
#
# Start reading the text sample
for line in open(SAMPLE_TEXT).readlines():
pass
#cmavoList = _tmp[1:]
#print(cmavoList[0])
"""
# Disabled code, kept inside a bare string literal so it is never executed.
# It tallies identical lines of grep_results.txt in a dict and prints them.
# NOTE(review): if re-enabled, `sorted(counts, key=wizek)` would fail, because
# iterating a dict yields only the keys while `wizek` expects two arguments;
# the indentation would have to be restored as well.
"""
print('Counting...\n')
counts = {}
f = open('grep_results.txt')
for line in f.readlines():
word = line[:-1]
if word in counts:
counts[word] += 1
else:
counts[word] = 1
#print([word, counts[word]])
# for word in counts:
# print(x)
def wizek(k, v):
return (v, k)
for key, value in sorted(counts, key=wizek):
print("%s: %s" % (key, value))
#print(sorted(counts.items()))
f.close()
"""
# (GitHub page footer: sign-in prompt for commenting on the gist; not part of the script.)