Wizek · August 8, 2011 22:33
diff --git a/frequency_2.py b/frequency_2.py
 def progress(str):
     """Print a status message on stdout, prefixed with 'Info: '.

     The argument is concatenated onto the prefix, so it has to be a string.
     """
     message = 'Info: ' + str
     print(message)

 # Announce start-up before the imports run.
 progress('Initializing...')

 import csv
 import re
 import operator

 # Global constants
 # Path of the ';'-delimited CSV dictionary loaded below; its first row is
 # used as the column header.
 DICTIONARY_PATH = 'cmavo.csv'
 # Not read by the live code visible in this script; only the quoted-out
 # snippet near the end of the file opens it.
 SAMPLE_TEXT = 'grep_results.txt'

 # Module-level containers that the code below fills in
 # One dict per dictionary row: a 'freq' counter plus one key per CSV column.
 cmavoList = []
 # Scratch list; it first receives the raw CSV rows (header row included).
 _tmp = []
 # Maps each generated regular-expression string to the index of its row in
 # cmavoList.
 cmavoRegExps = {}

 # Text that is split on newlines and scanned for cmavo further down.
 sampleText = """u'i nai"""
 """
 ua
 u'i
 oi
 ia 
 u'i 
 uinairu'e 
 uinairu'esai
 uinaidai
 ei 
 u'i
 a'o
 u'i 
 ui 
 oi
 u'e nai 
 u'e nai
 ie
 u'e nai 
 ui 
 uisai
 u'i
 u'i 
 ua sai 
 i'e 
 i'o
 uo
 uo
 ia
 ue 
 ue 
 ii 
 i'i
 ui
 ui nai 
 uinai
 ui
 ui
 ui 
 uinai 
 ua
 ui
 ua
 u'i
 e'enai
 ua 
 ui 
 ua
 ui
 ei 
 o'u bu'o 
 a'o 
 ei 
 u'u
 a'ucu'idai
 """

 #
 # Load the cmavo dictionary: every row of the ';'-separated CSV file is
 # appended to _tmp, and the first row is then taken as the column header.
 # The file is opened in a `with` block so the handle is closed as soon as
 # reading is finished instead of staying open for the rest of the run.
 with open(DICTIONARY_PATH, 'r') as dictionaryFile:
     csvReader = csv.reader(dictionaryFile, delimiter=';', quotechar='"')
     # Read in all the lines (header row included)
     _tmp.extend(csvReader)
 # Shorthand: the first row holds the column names
 header = _tmp[0]

 # Build inner data structure of the table
 #
 # Both pattern pieces are identical for every row, so they are defined once
 # here, as raw strings so the backslashes reach the regex engine untouched.
 #
 # Match for consonant inside the cluster to break there, and let the magic be inserted
 matchFor = r'\B(?=[bcdfgjklmnprstvxz])'
 # Allow all kind of grammatical stuff (even hesitation) to come in between the needed words
 # sai, cai, dai, ru'e, re'e, ro'a, ro'e, ro'i, ro'o, ro'u, pei, bu'o, yyy, .yy, ....
 # but not {nai} and {cu'i}, because those are defined
 # The group is deliberately kept as a capturing group so the generated
 # pattern text stays exactly what it was.
 replaceWith = r"(\s|[scd]ai|r[ue]'e|ro'[aeiou]|pei|bu'o|\.+|\.?y+\.?)*"

 # Iterate over all the rows, getting the line number proper
 for rowIdx, row in enumerate(_tmp[1:]):
     # Add a dict to the list
     cmavoList.append({'freq': 0})
     # Iterate over each cell in the row
     for colIdx, cell in enumerate(row):
         # Shorthand to access current header name
         colName = header[colIdx]
         # If we are in a cell whose column header is 'cmavo'
         if colName == 'cmavo':
             # Remove . before vowels to broaden search later
             cell = cell.replace('.', '')
             # Insert the filler at every consonant boundary inside the word.
             # A callable is used as the replacement so the filler is inserted
             # verbatim: given as a plain string, re.sub would parse it as a
             # replacement template and reject the `\s` escape on
             # Python 3.7 and later.
             cellTornApart = re.sub(matchFor, lambda match: replaceWith, cell)
             # Add even to the end, not just in between, plus ensure not to match partial strings (around)
             cellTornApart = r"\b(?<!')" + cellTornApart + replaceWith + r"\b(?!')"
             # Register an index for row number by cmavoRegEx
             cmavoRegExps[cellTornApart] = rowIdx
         # Add the cell to the proper row by the proper header
         cmavoList[rowIdx][colName] = cell

 # print(cmavoRegExps[])

 # Count, line by line, how often every cmavo pattern occurs in the sample.
 sampleTextArray = sampleText.split('\n')
 totalLines = len(sampleTextArray)
 # Compile every pattern once up front instead of handing the pattern string
 # to the re module again for each line.
 compiledRegExps = [(re.compile(pattern), index)
                    for pattern, index in cmavoRegExps.items()]
 for i, line in enumerate(sampleTextArray):
     progress('processing line %i out of %i total' % (i + 1, totalLines))
     for compiledRegEx, cmavoIdx in compiledRegExps:
         # Count the matches directly.  Deriving the count from the length of
         # re.split()'s result only works for patterns with exactly one
         # capture group, but compound cmavo produce several groups, and the
         # division also yields fractional counts on Python 3.
         numberOfMatches = sum(1 for _ in compiledRegEx.finditer(line))
         cmavoList[cmavoIdx]['freq'] += numberOfMatches
 progress('done')

 # Debug output: the frequency counted for the entry at index 2, printed
 # before the list is reordered.
 print(cmavoList[2]['freq'])

 # Reorder the entries in place, least frequent first.
 cmavoList.sort(key=lambda entry: entry['freq'])

 # One report line per dictionary entry: the cmavo and its frequency.
 for line in cmavoList:
     report = "cmavo: %s\t\tFreq: %i" % (line['cmavo'], line['freq'])
     print(report)

 """
 sort regx in cmavoRegExps:
    print(regx)
 #print(cmavoRegExps)

 #
 # Start reading the text sample
 for line in open(SAMPLE_TEXT).readlines():
    pass

 #cmavoList = _tmp[1:]
 #print(cmavoList[0])
 """ 
 """
 print('Counting...\n')
 counts = {}
 f = open('grep_results.txt')
 for line in f.readlines():
    word = line[:-1]
    if word in counts:
        counts[word] += 1
    else:
        counts[word] = 1

    #print([word, counts[word]])
 # for word in counts:
 #     print(x)
 def wizek(k, v):
    return (v, k)
 for key, value in sorted(counts, key=wizek):
    print("%s: %s" % (key, value))
 #print(sorted(counts.items()))
 f.close()
 """
	def progress(str):
	    """Print a status message on stdout, prefixed with 'Info: '.

	    The argument is concatenated onto the prefix, so it has to be a string.
	    """
	    # The body is indented again; flattened to the level of the `def`
	    # line it was a syntax error.
	    print('Info: '+str)

	# Announce start-up before the imports run.
	progress('Initializing...')

	import csv
	import re
	import operator

	# Global constants
	# Path of the ';'-delimited CSV dictionary loaded below; its first row is
	# used as the column header.
	DICTIONARY_PATH = 'cmavo.csv'
	# Not read by the live code visible in this script; only the quoted-out
	# snippet near the end of the file opens it.
	SAMPLE_TEXT = 'grep_results.txt'

	# Module-level containers that the code below fills in
	# One dict per dictionary row: a 'freq' counter plus one key per CSV column.
	cmavoList = []
	# Scratch list; it first receives the raw CSV rows (header row included).
	_tmp = []
	# Maps each generated regular-expression string to the index of its row in
	# cmavoList.
	cmavoRegExps = {}

	# Text that is split on newlines and scanned for cmavo further down.
	sampleText = """u'i nai"""
	"""
	ua
	u'i
	oi
	ia
	u'i
	uinairu'e
	uinairu'esai
	uinaidai
	ei
	u'i
	a'o
	u'i
	ui
	oi
	u'e nai
	u'e nai
	ie
	u'e nai
	ui
	uisai
	u'i
	u'i
	ua sai
	i'e
	i'o
	uo
	uo
	ia
	ue
	ue
	ii
	i'i
	ui
	ui nai
	uinai
	ui
	ui
	ui
	uinai
	ua
	ui
	ua
	u'i
	e'enai
	ua
	ui
	ua
	ui
	ei
	o'u bu'o
	a'o
	ei
	u'u
	a'ucu'idai
	"""

	#
	# Load the cmavo dictionary: every row of the ';'-separated CSV file is
	# appended to _tmp, and the first row is then taken as the column header.
	# The file is opened in a `with` block so the handle is closed as soon as
	# reading is finished instead of staying open for the rest of the run.
	with open(DICTIONARY_PATH, 'r') as dictionaryFile:
	    csvReader = csv.reader(dictionaryFile, delimiter=';', quotechar='"')
	    # Read in all the lines (header row included)
	    _tmp.extend(csvReader)
	# Shorthand: the first row holds the column names
	header = _tmp[0]

	# Build inner data structure of the table
	#
	# Both pattern pieces are identical for every row, so they are defined once
	# here, as raw strings so the backslashes reach the regex engine untouched.
	#
	# Match for consonant inside the cluster to break there, and let the magic be inserted
	matchFor = r'\B(?=[bcdfgjklmnprstvxz])'
	# Allow all kind of grammatical stuff (even hesitation) to come in between the needed words
	# sai, cai, dai, ru'e, re'e, ro'a, ro'e, ro'i, ro'o, ro'u, pei, bu'o, yyy, .yy, ....
	# but not {nai} and {cu'i}, because those are defined
	# The alternatives are separated by plain `|`; written as `\|` they would
	# only match literal pipe characters.  The group is kept as a capturing
	# group.
	replaceWith = r"(\s|[scd]ai|r[ue]'e|ro'[aeiou]|pei|bu'o|\.+|\.?y+\.?)*"

	# Iterate over all the rows, getting the line number proper
	for rowIdx, row in enumerate(_tmp[1:]):
	    # Add a dict to the list
	    cmavoList.append({'freq': 0})
	    # Iterate over each cell in the row
	    for colIdx, cell in enumerate(row):
	        # Shorthand to access current header name
	        colName = header[colIdx]
	        # If we are in a cell whose column header is 'cmavo'
	        if colName == 'cmavo':
	            # Remove . before vowels to broaden search later
	            cell = cell.replace('.', '')
	            # Insert the filler at every consonant boundary inside the word.
	            # A callable is used as the replacement so the filler is inserted
	            # verbatim: given as a plain string, re.sub would parse it as a
	            # replacement template and reject the `\s` escape on
	            # Python 3.7 and later.
	            cellTornApart = re.sub(matchFor, lambda match: replaceWith, cell)
	            # Add even to the end, not just in between, plus ensure not to match partial strings (around)
	            cellTornApart = r"\b(?<!')" + cellTornApart + replaceWith + r"\b(?!')"
	            # Register an index for row number by cmavoRegEx
	            cmavoRegExps[cellTornApart] = rowIdx
	        # Add the cell to the proper row by the proper header
	        cmavoList[rowIdx][colName] = cell

	# print(cmavoRegExps[])

	# Count, line by line, how often every cmavo pattern occurs in the sample.
	sampleTextArray = sampleText.split('\n')
	totalLines = len(sampleTextArray)
	# Compile every pattern once up front instead of handing the pattern string
	# to the re module again for each line.
	compiledRegExps = [(re.compile(pattern), index)
	                   for pattern, index in cmavoRegExps.items()]
	for i, line in enumerate(sampleTextArray):
	    progress('processing line %i out of %i total' % (i + 1, totalLines))
	    for compiledRegEx, cmavoIdx in compiledRegExps:
	        # Count the matches directly.  Deriving the count from the length of
	        # re.split()'s result only works for patterns with exactly one
	        # capture group, but compound cmavo produce several groups, and the
	        # division also yields fractional counts on Python 3.
	        numberOfMatches = sum(1 for _ in compiledRegEx.finditer(line))
	        cmavoList[cmavoIdx]['freq'] += numberOfMatches
	progress('done')

	# Debug output: the frequency counted for the entry at index 2, printed
	# before the list is reordered.
	print(cmavoList[2]['freq'])
	# Reorder the entries in place, least frequent first.
	cmavoList.sort(key=operator.itemgetter('freq'))

	# One report line per dictionary entry.  The print call is indented under
	# the loop again; flattened to the loop's own level it was a syntax error.
	for line in cmavoList:
	    print("cmavo: %s\t\tFreq: %i"%(line['cmavo'], line['freq']))

	"""
	sort regx in cmavoRegExps:
	print(regx)
	#print(cmavoRegExps)

	#
	# Start reading the text sample
	for line in open(SAMPLE_TEXT).readlines():
	pass

	#cmavoList = _tmp[1:]
	#print(cmavoList[0])
	"""
	"""
	print('Counting...\n')
	counts = {}
	f = open('grep_results.txt')
	for line in f.readlines():
	word = line[:-1]
	if word in counts:
	counts[word] += 1
	else:
	counts[word] = 1

	#print([word, counts[word]])
	# for word in counts:
	# print(x)
	def wizek(k, v):
	return (v, k)
	for key, value in sorted(counts, key=wizek):
	print("%s: %s" % (key, value))
	#print(sorted(counts.items()))
	f.close()
	"""