Skip to content

Instantly share code, notes, and snippets.

@oberhamsi
Created July 28, 2010 16:11
Show Gist options
  • Save oberhamsi/495058 to your computer and use it in GitHub Desktop.
Save oberhamsi/495058 to your computer and use it in GitHub Desktop.
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
EXTRA TASK
==============
* nur cod zeilen (und nachfolgende) und Äußerung darüber, alles andere löschen.
* alles in ein file, headers per file
semantischer inhalt = +C+T
ASV === neu: euro zeile
------------------------
alle auswertungen für:
* nur mutter
* nur len
nur lan
*
Verbtypen
=========
dollar zeilen können unterschieden werden je nach verbtyp. es gibt 4 verbtypen:
* VDI, VDV (dynamic motion)
* VMC, VKM (cause motion)
alle auswertungen 3x:
* alle dollar zeilen
* nur dynamic
* nur cause motion
---------------------
dollar zeile
===============
* beginnt mit $:
* endet mit }\n
euro special
===============
wenn nachfolgende dollar zeile euro enthält, zählt sie zu vorheriger dollar zeile dazu.
semantic content var
=====================
sind folgende großbuchstaben
* + davor
* T, M, C, A
* danach: + oder |
Density (dont)
=========================
wieviele unterschiedliche semantic content vars kommen pro dollar zeile vor
V Feld
=======
feld nach $: bis zum nächsten |, aber: V Feld nicht berücksichtigen wenn ASV in dollar zeile vorkommt.
gibt nur eins pro dollar zeile
E Feld
========
beginnt mit :E endet mit |
mehrere pro dollar zeile
Other Devices Feld
===============
das sind:
* alle E Felder
AUSWERTUNG Locus 1
=======
* welche kombination von semconvars kommt in V Feld vor
* welche kombin an semcons kommt in Other Devices Feld vor
ZUSATZ: in E feldern kann auch ein +L vorkommen
( "" ist auch eine - leere - kombination)
pro file: wie oft kommen die kombinationen vor.
AUSWERTUNG Locus 2
====================
In wievielen dollarzeilen kommt ein beliebiges semconvar in
* V Feld AND in Other Devices, +V+X
* in V Feld und NICHT in Other Devices, +V-X
* in Other Devices und NICHT in V Feld, -V+X
* ... -V-X
Focus pro dollar zeile
=======================
welche kombination an semantic content vars kommt pro dollar zeile vor (z.b.: TMC, TAC,..).
AUSWERTUNG Focus pro File
==================
wieviele dollar zeilen gibt es für jede semantic content var kombination.
( "" ist auch eine - leere - kombination)
"""
import sys
import csv
# Semantic-content variable letters, kept in sorted order so that any key
# built by walking these lists comes out in a stable order.
SEM_CON_VARS = sorted(['T', 'C', 'M', 'A'])
# Same letters plus 'L'; selected by getPatternInFields(..., includeL=True).
SEM_CON_VARS_L = sorted(['T', 'C', 'M', 'A', 'L'])
# Token names counted by the *VE helpers (each is searched for with a
# leading '+').
TOKEN_VARS = sorted(
    ['Tbo', 'Tve', 'Tpa', 'Tgo', 'Tso', 'Tdx', 'L', 'M', 'A', 'C']
)
def getTokenFrequenceVE(fields):
    """Return one '+' per token occurrence found in ``fields``.

    For every string in ``fields`` and every name in TOKEN_VARS, the number
    of (non-overlapping) occurrences of ``'+' + name`` is added up; the
    result is a string consisting of that many '+' characters ('' for zero).
    """
    hits = sum(
        field.count('+' + tokenVar)
        for field in fields
        for tokenVar in TOKEN_VARS
    )
    return '+' * hits
def getTokenCountsVE(fields):
    """Count, per token, in how many of ``fields`` that token appears.

    Returns a dict mapping ``'+' + tokenVar`` (for each name in TOKEN_VARS
    that was found) to the number of fields containing it. A field that
    contains the same token several times still contributes 1.
    """
    counts = {}
    for field in fields:
        for tokenVar in TOKEN_VARS:
            k = '+' + tokenVar
            # NOTE(review): '> 1' ignores a match at index 0 or 1 of the
            # field. Kept as in the original; confirm whether '>= 0' was
            # intended.
            if field.find(k) > 1:
                # dict.get replaces the deprecated dict.has_key().
                counts[k] = counts.get(k, 0) + 1
    return counts
def getTokenCounts(line):
    """Describe which semantic-content markers occur in ``line``, and how often.

    For each letter in SEM_CON_VARS (in list order) the occurrences of
    ``'+' + letter`` in ``line`` are counted and the letter is repeated that
    many times; the pieces are concatenated into one key (e.g. two '+T' and
    one '+C' give 'CTT'). The result is a single-entry dict ``{key: 1}``;
    the key is '' when nothing matched.
    """
    k = ""
    for semConVar in SEM_CON_VARS:
        count = line.count('+' + semConVar)
        if count <= 0:
            continue
        k += semConVar * count
    # The original created an empty dict and then used dict.has_key() to
    # test for the key; on an empty dict that branch always initialised the
    # entry, so the result is always exactly {k: 1}.
    return {k: 1}
def getTokenFrequence(line):
    """Return one '+' per semantic-content marker occurrence in ``line``.

    Occurrences of ``'+' + letter`` are summed over all letters in
    SEM_CON_VARS; the result is a string of that many '+' characters.
    """
    occurrences = sum(line.count('+' + semConVar) for semConVar in SEM_CON_VARS)
    return '+' * occurrences
def getPatternInFields(fields, includeL = False):
    """Return the set of semantic-content letters present in ``fields``.

    A letter is included when ``'+' + letter`` is found in at least one of
    the field strings. Letters are taken from SEM_CON_VARS, or from
    SEM_CON_VARS_L when ``includeL`` is exactly ``True``.
    """
    if includeL is True:
        candidates = SEM_CON_VARS_L
    else:
        candidates = SEM_CON_VARS
    # NOTE(review): as in the original, a match only counts when it starts
    # at index 2 or later of the field ('> 1') -- confirm this is intended.
    return set(
        semConVar
        for field in fields
        for semConVar in candidates
        if field.find('+' + semConVar) > 1
    )
def setToStr(ss):
    """Join the elements of ``ss`` into one string, in sorted order.

    ``ss`` may be any iterable of strings; when a dict is passed, its keys
    are what gets iterated and joined.
    """
    return ''.join(sorted(ss))
def getFields(line, fieldType):
    """Split ``line`` on ':' and return the fields of the requested type.

    ``fieldType`` is either '*' (return every field, including empty ones)
    or a single character; in the latter case only fields whose first
    character equals ``fieldType`` are returned.

    Empty fields (produced by '::' or by a leading/trailing ':') are skipped
    when filtering by type; the original indexed ``field[0]`` on them and
    raised IndexError.
    """
    fields = line.split(':')
    if fieldType == '*':
        return fields
    return [field for field in fields if field and field[0] == fieldType]
#def getOtherDevicePattern(line):
# fields = line.split(':')
# totalCombi = set()
# lookFor = 'E'
# if fields[0][0] == '€':
# lookFor = 'V'
# for f in fields:
# if f[0] == lookFor:
# totalCombi = totalCombi.union(getPatternInField(f))
# return setToStr(totalCombi)
def getAllPattern(line):
    """Return the sorted string of semantic-content letters found in ``line``.

    The line is split on ':' and the letters found across all resulting
    fields are combined into one set, which is rendered with setToStr().
    """
    fields = line.split(':')
    # The original called getPatternInField (singular), which is not defined
    # in this file, so any call raised NameError. getPatternInFields already
    # returns the union of letters over all fields it is given.
    return setToStr(getPatternInFields(fields))
def isCodEnd(line):
    """Return True when ``line`` starts with '*', '@' or '%'.

    Note that this is also True for '%cod' lines; codLines() tests
    isCodStart() first, so those are not treated as an end there.
    ``line`` must be non-empty (the first character is indexed directly).
    """
    if line[0] == '*':
        return True
    return line[:1] in ('@', '%')
def isCodStart(line):
    """Return True when ``line`` begins with '%cod' (case-insensitive)."""
    prefix = line[:4]
    return prefix.lower() == '%cod'
def isUserLine(line):
    """Return True when the first character of ``line`` is '*'.

    ``line`` must be non-empty (the first character is indexed directly).
    """
    firstChar = line[0]
    return firstChar == '*'
def codLines(lines, suser):
    """Yield the coding lines that belong to speaker ``suser``.

    Walks ``lines`` in order while tracking two pieces of state:

    * the current speaker: characters 1-3 of the most recent line starting
      with '*';
    * whether we are inside a coding section: switched on by a line for
      which isCodStart() is true, switched off by any other line for which
      isCodEnd() is true, and left unchanged by all remaining lines.

    A line is yielded when, after updating that state, we are inside a
    coding section and the current speaker equals ``suser``. The '%cod' line
    itself is therefore yielded, as are following lines that do not end the
    section.
    """
    currentSpeaker = None
    insideCod = False
    for line in lines:
        if isCodStart(line):
            insideCod = True
        elif isCodEnd(line):
            insideCod = False
        if isUserLine(line):
            currentSpeaker = line[1:4]
        if insideCod and currentSpeaker == suser:
            yield line
def printSetAsTable(myset):
    """Print every entry of the dict ``myset`` as ``key<TAB>value``.

    Keys must be strings (they are concatenated with the tab); values are
    converted with str(). Entries appear in the dict's iteration order.
    """
    for key in myset:
        row = key + "\t" + str(myset[key])
        # Single parenthesised argument: prints the same text whether
        # ``print`` is a statement or a function.
        print(row)
def increase(dct, key):
    """Increment the tally in ``dct`` for the string form of ``key``.

    ``key`` is an iterable of strings (the callers pass sets and a
    single-key dict); it is converted with setToStr() and the resulting
    string is the dict key whose counter is raised by one, starting at 0.
    """
    key = setToStr(key)
    # dict.get replaces the deprecated dict.has_key().
    dct[key] = dct.get(key, 0) + 1
def increaseSetCount(total, line):
    """Add every count in dict ``line`` onto dict ``total``, in place.

    Keys missing from ``total`` start at 0. Returns None.
    """
    for name in line:
        total[name] = total.get(name, 0) + line[name]
def increaseCounts(old, new):
    """Merge the counts of dict ``new`` into dict ``old``, in place.

    For each key of ``new`` its value is added to ``old``'s value for that
    key, with absent keys treated as 0. Returns None.
    """
    for k in new:
        old.setdefault(k, 0)
        old[k] += new[k]
def increaseFreq(d, k):
    """Raise the counter stored under ``k`` in dict ``d`` by one.

    A key that is not yet present starts at 0, so it ends up at 1.
    """
    d[k] = d.get(k, 0) + 1
# Command line:  [ignoreif] [onlyspontan] [onlycause] FILE [FILE ...]
#
# The optional mode keywords are only recognised in exactly this order, each
# one as the first remaining argument; everything left over is treated as an
# input file path.
codFile = None
paths = sys.argv[1:]
IGNORE_IF = False
ONLY_SPONTAN = False
ONLY_CAUSE = False
# ``paths[:1]`` is [] for an empty list, so running the script without
# arguments (or with mode keywords only) no longer raises IndexError the way
# ``paths[0]`` did.
if paths[:1] == ['ignoreif']:
    IGNORE_IF = True
    paths = paths[1:]
if paths[:1] == ['onlyspontan']:
    ONLY_SPONTAN = True
    paths = paths[1:]
if paths[:1] == ['onlycause']:
    ONLY_CAUSE = True
    paths = paths[1:]
# Annotated copy of every analysed coding line is written here.
# NOTE(review): the output path is hard-coded.
codOut = open('c:\\evala\\sim_analyse.cod', 'w')


def _newSpeakerTotals():
    """Return a fresh tally table: one independent empty dict per speaker."""
    return dict((speaker, {}) for speaker in ('LEN', 'MUT', 'JAN'))


# Grand totals over all input files, each keyed by speaker code.
totalConflationVerbCount = _newSpeakerTotals()
totalConflationOtherDevicesCount = _newSpeakerTotals()
totalConflationOverallCount = _newSpeakerTotals()
totalSemanticContent = _newSpeakerTotals()
totalSemConTokens = _newSpeakerTotals()
totalTokenFrequence = _newSpeakerTotals()
totalTokensV = _newSpeakerTotals()
totalTokensE = _newSpeakerTotals()
totalTokenFrequenceV = _newSpeakerTotals()
totalTokenFrequenceE = _newSpeakerTotals()
# Main analysis: for every input file and every speaker, tally the coding
# lines into per-file dicts (printed below) and into the total* dicts
# (printed in the final summary), and copy each analysed line plus %com
# annotation rows to codOut.
# NOTE(review): leading indentation is missing in this copy of the file; the
# loop nesting has to be restored before this section can run.
for COD_PATH in paths:
codFile = open(COD_PATH)
print codFile.name
codOut.write('%com\t\t=======FILE: ' + codFile.name + '\n')
# The whole file is read once and re-scanned for each speaker.
lines = codFile.readlines()
for suser in ['LEN', 'MUT', 'JAN']:
codOut.write('%com\t\t=======\n')
codOut.write('%com\t\t======= USER: ' + suser + '\n')
codOut.write('%com\t\t=======\n')
print ">>> " + suser
# Tallies for this file and this speaker only.
conflationVerbCount = {}
conflationOtherDevicesCount = {}
conflationOverallCount = {}
semanticContent = {}
semConTokens = {}
tokenFrequence = {}
tokensV = {}
tokensE = {}
tokenFrequenceV = {}
tokenFrequenceE = {}
for cline in codLines(lines, suser):
# In IGNORE_IF mode, skip lines whose last ':'-separated field contains '{if'.
iftest = getFields(cline, '*')
if IGNORE_IF and iftest[-1] and iftest[-1].find('{if') >= 0:
continue
# Mode filters: keep only lines that contain one of the listed verb codes.
# NOTE(review): the module docstring names VDI/VDV (dynamic) and VMC/VKM
# (cause motion); the codes tested here are '$:VMC'/'$:VK' and
# '$:VMS'/'$:VMI' -- confirm which set is correct.
if ONLY_CAUSE and not (cline.count('$:VMC') or cline.count('$:VK')): continue
if ONLY_SPONTAN and not (cline.count('$:VMS') or cline.count('$:VMI')): continue
# Locus 1: which semantic-content letters occur in the V fields and in the
# E fields (for E fields the letter 'L' is included as well).
verbPattern = getPatternInFields(getFields(cline, 'V'))
increase(conflationVerbCount, verbPattern)
increase(totalConflationVerbCount[suser], verbPattern)
otherDevicesPattern = getPatternInFields(getFields(cline, 'E'), True)
increase(conflationOtherDevicesCount, otherDevicesPattern)
increase(totalConflationOtherDevicesCount[suser], otherDevicesPattern)
# Focus: letters occurring anywhere in the line (all fields).
allFields = getFields(cline, '*')
overallPattern = getPatternInFields(allFields)
increase(conflationOverallCount, overallPattern)
increase(totalConflationOverallCount[suser], overallPattern)
# Locus 2: classify the line as +V/-V and +E/-E depending on whether any
# letter (without 'L') was found in the V fields / E fields.
verbPatternForLocus = getPatternInFields(getFields(cline, 'V'))
otherDevicesPatternForLocus = getPatternInFields(getFields(cline, 'E'))
semCon = ""
if len(verbPatternForLocus) == 0:
semCon = "-V"
else:
semCon = "+V"
if len(otherDevicesPatternForLocus) == 0:
semCon += "-E"
else:
semCon += "+E"
# Wrapped in a one-key dict because increase() passes its key through
# setToStr(); the tally key ends up being the semCon string itself.
semConPattern = {semCon: 1}
increase(semanticContent, semConPattern)
increase(totalSemanticContent[suser], semConPattern)
# Semantic-content tokens: key built from the letters, repeated per occurrence.
semConTokenCounts = getTokenCounts(cline)
increaseCounts(semConTokens, semConTokenCounts)
increaseCounts(totalSemConTokens[suser], semConTokenCounts)
# Token frequency: a string with one '+' per occurrence in the line.
freq = getTokenFrequence(cline)
increaseFreq(tokenFrequence, freq)
increaseFreq(totalTokenFrequence[suser], freq)
# Tokens (TOKEN_VARS) found in the V fields and in the E fields.
toksV = getTokenCountsVE(getFields(cline, 'V'))
increaseSetCount(tokensV, toksV)
increaseSetCount(totalTokensV[suser], toksV)
toksE = getTokenCountsVE(getFields(cline, 'E'))
increaseSetCount(tokensE, toksE)
increaseSetCount(totalTokensE[suser], toksE)
# Token frequency restricted to the V fields and to the E fields.
freqV = getTokenFrequenceVE(getFields(cline, 'V'))
increaseFreq(tokenFrequenceV, freqV)
increaseFreq(totalTokenFrequenceV[suser], freqV)
freqE = getTokenFrequenceVE(getFields(cline, 'E'))
increaseFreq(tokenFrequenceE, freqE)
increaseFreq(totalTokenFrequenceE[suser], freqE)
# Output file: the analysed line followed by one %com row per result.
codOut.write(cline)
codOut.write('%com\t ' + setToStr(verbPattern) + ' \tverbPattern\n')
codOut.write('%com\t ' + setToStr(otherDevicesPattern) + ' \totherDevicesPattern\n')
codOut.write('%com\t ' + setToStr(overallPattern) + ' \toverallPattern\n')
codOut.write('%com\t ' + setToStr(semConPattern) + ' \tsemConPattern\n')
# NOTE(review): the next two rows write the keys of the running tallies
# (semConTokens, tokenFrequence), not this line's semConTokenCounts / freq
# as the other rows do -- confirm whether that is intended.
codOut.write('%com\t ' + setToStr(semConTokens) + ' \tsemConTokens\n')
codOut.write('%com\t ' + setToStr(tokenFrequence) + ' \ttokenFrequence\n')
codOut.write('%com\t ' + setToStr(toksE) + ' \ttokensE\n')
codOut.write('%com\t ' + setToStr(toksV) + ' \ttokensV\n')
# Console report of the tallies collected above.
# tokenFrequenceV and tokenFrequenceE are collected but not printed here.
print "conflationVerbCount "
printSetAsTable (conflationVerbCount)
print "conflationOtherDevicesCount "
printSetAsTable(conflationOtherDevicesCount)
print "conflationOverallCount "
printSetAsTable(conflationOverallCount)
print "semanticContentCount "
printSetAsTable(semanticContent)
print "semanticContentTokens "
printSetAsTable(semConTokens)
print "tokenfrequence "
printSetAsTable(tokenFrequence)
print "tokensE"
printSetAsTable(tokensE)
print "tokensV"
printSetAsTable(tokensV)
print "=============================="
codFile.close()
codOut.close()
# Final report: the active mode flags, then every grand-total table for each
# speaker. Each print call takes a single parenthesised argument, so the
# output is the same whether ``print`` is a statement or a function.
print("========= S U M O F A L L F I L E S =====================")
print("IGNORE IF \t" + str(IGNORE_IF))
print("ONLY CAUSE \t" + str(ONLY_CAUSE))
print("ONLY SPONTAN \t" + str(ONLY_SPONTAN))
print("===============================================")
# (heading, per-speaker table) pairs, in output order.
summaryTables = [
    ("total conflationVerbCount ", totalConflationVerbCount),
    ("total otherDevicesPattern ", totalConflationOtherDevicesCount),
    ("total overallCount ", totalConflationOverallCount),
    ("total semantic content count ", totalSemanticContent),
    ("total semantic content tokens ", totalSemConTokens),
    ("total token frequence ", totalTokenFrequence),
    ("tokensE", totalTokensE),
    ("tokensV", totalTokensV),
    ("tokenFrequenceE", totalTokenFrequenceE),
    ("tokenFrqeuneceV", totalTokenFrequenceV),
]
for suser in ['LEN', 'MUT', 'JAN']:
    print(suser)
    for heading, perSpeaker in summaryTables:
        print(heading)
        printSetAsTable(perSpeaker[suser])
    # NOTE(review): indentation was lost in the source this was restored
    # from; the separator is assumed to follow each speaker's block.
    print("-----------------------------")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment