Created
July 28, 2010 16:11
-
-
Save oberhamsi/495058 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python | |
# -*- coding: utf-8 -*- | |
""" | |
EXTRA TASK | |
============== | |
* nur cod zeilen (und nachfolgende) und außerung darüber, alles andere löschen. | |
* alles in ein file, headers per file | |
semantischer inhalt = +C+T | |
ASV === neu: euro zeile | |
------------------------ | |
alle auswertungen für: | |
* nur mutter | |
* nur len | |
nur lan | |
* | |
Verbtypen | |
========= | |
dollar zeilen können unterschieden werden je nach vertyp. es gibt 4 verbtypen: | |
* VDI, VDV (dynamic motion) | |
* VMC, VKM (cause motion) | |
alle auswertungen 3x: | |
* alle dollar zeilen | |
* nur dynamic | |
* nur cause motion | |
--------------------- | |
dollar zeile | |
=============== | |
* beginnt mit $: | |
* endet mit }\n | |
euro special | |
=============== | |
wenn nachfolgende dollar zeile euro enthält, zählt sie zu vorheriger dollar zeile dazu. | |
semantic content var | |
===================== | |
sind folgende großbuchstaben | |
* + davor | |
* T, M, C, A | |
* danach: + oder | | |
Density (dont) | |
========================= | |
wieviele unterschiedliche semantic content vars kommen pro dollar zeile vor | |
V Feld | |
======= | |
feld nach $: bis zum nächsten |, aber: V Feld nicht berücksichtigen wenn ASV in dollar zeile vorkommt. | |
gibt nur eins pro dollar zeile | |
E Feld | |
======== | |
beginnt mit :E endet mit | | |
mehrere pro dollar zeile | |
Other Devices Feld | |
=============== | |
das sind: | |
* alle E Felder | |
AUSWERTUNG Locus 1 | |
======= | |
* welche kombination von semconvars kommt in V Feld vor | |
* welche kombin an semcons kommt in Other Devices Feld vor | |
ZUSATZ: in E feldern kann auch ein +L vorkommen | |
( "" ist auch eine - leere - kombination) | |
pro file: wie oft kommen die kombinationen vor. | |
AUSWERTUNG Locus 2 | |
==================== | |
In wievielen dollarzeilen kommt ein beliebiges semconvar in | |
* V Feld AND in Other Devices, +V+X | |
* in V Feld und NICHT in Other Devices, +V-X | |
* in Other Devices und NICHT in V Feld, -V+X | |
* ... -V-X | |
Focus pro dollar zeile | |
======================= | |
welche kombination an semantic content vars kommt pro dollar zeile vor (z.b.: TMC, TAC,..). | |
AUSWERTUNG Focus pro File
================== | |
wieviele dollar zeilen gibt es für jede semantic content var kombination. | |
( "" ist auch eine - leere - kombination) | |
""" | |
import sys | |
import csv | |
# Semantic-content variable letters; kept sorted so that combination keys
# built from them are always canonical.
SEM_CON_VARS = sorted(['T', 'C', 'M', 'A'])
# Same set plus 'L', used for E-fields, which may additionally carry a +L token.
SEM_CON_VARS_L = sorted(['T', 'C', 'M', 'A', 'L'])
# Token variables counted by the V-/E-field statistics, also kept sorted.
TOKEN_VARS = sorted(['Tbo', 'Tve', 'Tpa', 'Tgo', 'Tso', 'Tdx', 'L', 'M', 'A', 'C'])
def getTokenFrequenceVE(fields):
    """Total number of '+<tokenVar>' occurrences across *fields*, encoded as
    a string of that many '+' characters (the string is used as a table key).
    """
    hits = sum(field.count('+' + tok) for field in fields for tok in TOKEN_VARS)
    return hits * '+'
def getTokenCountsVE(fields):
    """Count, per token variable, how many of *fields* contain '+<tokenVar>'.

    A token is only recognised when it occurs at string index > 1, i.e. past
    the leading field-type character.  NOTE(review): this also skips a token
    at index 0 or 1 -- presumably intentional given the field layout, but
    confirm against real data.

    Returns a dict mapping '+<tokenVar>' -> number of fields containing it.

    Fix: dict.has_key() (Python-2 only, removed in Python 3) was replaced
    with the `in` operator; everything else is unchanged.
    """
    counts = {}
    for field in fields:
        for tokenVar in TOKEN_VARS:
            k = '+' + tokenVar
            if field.find(k) > 1:
                if k not in counts:
                    counts[k] = 0
                counts[k] += 1
    return counts
def getTokenCounts(line):
    """Build the semantic-content combination key for *line*.

    The key concatenates every variable letter repeated by its '+<var>'
    count (e.g. 'CTT' for one +C and two +T; '' when no token occurs).
    Returns {key: 1} so the result can be merged into running totals via
    increaseCounts().

    Fix: dict.has_key() (Python-2 only, removed in Python 3) was replaced
    with the `in` operator; everything else is unchanged.
    """
    counts = {}
    k = ""
    for semConVar in SEM_CON_VARS:
        count = line.count('+' + semConVar)
        if count <= 0:
            continue
        k += semConVar * count
    # one entry per line: the complete combination key, counted once
    if k not in counts:
        counts[k] = 0
    counts[k] += 1
    return counts
def getTokenFrequence(line):
    """Total count of '+<semConVar>' tokens in *line*, returned as a run of
    '+' characters (used as a frequency-table key)."""
    total = sum(line.count('+' + var) for var in SEM_CON_VARS)
    return total * '+'
def getPatternInFields(fields, includeL = False):
    """Return the set of semantic-content variable letters whose '+<var>'
    token occurs (at string index > 1) in any of *fields*.

    When *includeL* is True the extended alphabet (with 'L') is used, as
    E-fields may also carry a +L token.
    """
    alphabet = SEM_CON_VARS_L if includeL is True else SEM_CON_VARS
    return set(var
               for field in fields
               for var in alphabet
               if field.find('+' + var) > 1)
def setToStr(ss):
    """Join the elements of *ss* (a set, or any iterable of strings) into a
    single string in sorted order -- the canonical pattern key."""
    return ''.join(sorted(ss))
def getFields(line, fieldType):
    """Split *line* on ':' and return the fields whose first character is
    *fieldType*; the wildcard '*' returns every field.

    Fix: uses the slice field[:1] instead of field[0], so an empty field
    (produced by '::' in the input) no longer raises IndexError -- it is
    simply skipped (or returned as-is under '*', as before).
    """
    fields = line.split(':')
    return [field for field in fields if fieldType == '*' or field[:1] == fieldType]
#def getOtherDevicePattern(line): | |
# fields = line.split(':') | |
# totalCombi = set() | |
# lookFor = 'E' | |
# if fields[0][0] == '€': | |
# lookFor = 'V' | |
# for f in fields: | |
# if f[0] == lookFor: | |
# totalCombi = totalCombi.union(getPatternInField(f)) | |
# return setToStr(totalCombi) | |
def getAllPattern(line):
    """Return the sorted combination string of semantic-content variables
    occurring anywhere in *line*.

    BUG FIX: the original body called getPatternInField (singular), which is
    not defined anywhere in this file (only a commented-out sketch exists),
    so calling this function always raised NameError.  It now delegates to
    getPatternInFields, which already unions the pattern across all fields.
    """
    return setToStr(getPatternInFields(line.split(':')))
def isCodEnd(line):
    """True when *line* terminates a %cod block: a speaker tier ('*'), an
    '@' header, or any dependent tier ('%...').

    str.startswith with a tuple replaces the chained slice comparisons.  The
    separate '%com' test in the original was redundant (the bare '%' test
    already matched it), and line[0] raised IndexError on an empty line --
    an empty line now simply returns False.
    """
    return line.startswith(('*', '@', '%'))
def isCodStart(line):
    """True when *line* opens a %cod coding tier (case-insensitive)."""
    return line.lower().startswith('%cod')
def isUserLine(line):
    """True when *line* is a speaker tier, e.g. '*MUT:'.

    Fix: str.startswith instead of line[0], so an empty line returns False
    rather than raising IndexError.
    """
    return line.startswith('*')
def codLines(lines, suser):
    """Yield the lines of *lines* that belong to a %cod block attributed to
    speaker *suser* (a three-letter code such as 'LEN', 'MUT', 'JAN').

    Scans the transcript in order while tracking two pieces of state:
    * inCod -- True from a '%cod' tier until the next terminating tier
    * user  -- the current speaker, taken from the last '*XXX:' line seen
    """
    inCod = False
    user = None
    for line in lines:
        if isCodStart(line):
            inCod = True
        elif isCodEnd(line):
            inCod = False
        if isUserLine(line):
            # speaker code is the three characters following the leading '*'
            user = line[1:4]
        if user == suser and inCod:
            yield line
def printSetAsTable(myset):
    """Print each key/count pair of the dict *myset* as one tab-separated row.

    The print statement was rewritten as a single parenthesised expression,
    which behaves identically under Python 2 and is also valid Python 3.
    """
    for key in myset:
        print(key + "\t" + str(myset[key]))
def increase(dct, key):
    """Increment dct[setToStr(key)] in place, creating the entry at 0 first.

    *key* is a pattern set (or dict, whose keys are used); setToStr turns it
    into the canonical sorted-string form used as table key.

    Fix: dict.has_key() (Python-2 only, removed in Python 3) was replaced
    with the `in` operator; everything else is unchanged.
    """
    key = setToStr(key)
    if key not in dct:
        dct[key] = 0
    dct[key] += 1
def increaseSetCount(total, line):
    """Add every count of the dict *line* onto the running totals in *total*
    (modifies *total* in place; returns None)."""
    for key in line:
        total[key] = total.get(key, 0) + line[key]
def increaseCounts(old, new):
    """Merge the counter dict *new* into *old*, summing values per key
    (modifies *old* in place; returns None)."""
    for key, value in new.items():
        old[key] = old.get(key, 0) + value
def increaseFreq(d, k):
    """Increment the frequency counter d[k] in place, starting at zero if
    the key is not present yet."""
    d[k] = d.get(k, 0) + 1
## --- command-line setup ------------------------------------------------
# Usage: script.py [ignoreif] [onlyspontan] [onlycause] file1.cod file2.cod ...
# NOTE(review): crashes with IndexError when invoked without arguments.
codFile = None
paths = sys.argv[1:]
IGNORE_IF = False      # skip dollar lines whose last field contains '{if'
ONLY_SPONTAN = False   # keep only dynamic-motion verbs ($:VMS / $:VMI)
ONLY_CAUSE = False     # keep only cause-motion verbs ($:VMC / $:VK)
if paths[0] == 'ignoreif':
    IGNORE_IF = True
    paths = paths[1:]
if paths[0] == 'onlyspontan':
    ONLY_SPONTAN = True
    paths = paths[1:]
if paths[0] == 'onlycause':
    ONLY_CAUSE = True
    paths = paths[1:]
# Annotated copy of all processed lines is written here.
# NOTE(review): hard-coded Windows path -- adjust per machine.
codOut = open('c:\\evala\\sim_analyse.cod', 'w')
# Per-speaker running totals, accumulated over every input file.
totalConflationVerbCount = {'LEN': {}, 'MUT': {}, 'JAN': {}}
totalConflationOtherDevicesCount = {'LEN': {}, 'MUT': {}, 'JAN': {}}
totalConflationOverallCount = {'LEN': {}, 'MUT': {}, 'JAN': {}}
totalSemanticContent = {'LEN': {}, 'MUT': {}, 'JAN': {}}
totalSemConTokens = {'LEN': {}, 'MUT': {}, 'JAN': {}}
totalTokenFrequence = {'LEN': {}, 'MUT': {}, 'JAN': {}}
totalTokensV = {'LEN':{}, 'MUT':{}, 'JAN':{}}
totalTokensE = {'LEN':{}, 'MUT':{}, 'JAN':{}}
totalTokenFrequenceV = {'LEN':{}, 'MUT':{}, 'JAN':{}}
totalTokenFrequenceE ={'LEN':{}, 'MUT':{}, 'JAN':{}}
## --- main loop: one pass per file, per speaker, per coded line ----------
for COD_PATH in paths:
    codFile = open(COD_PATH)
    print codFile.name
    codOut.write('%com\t\t=======FILE: ' + codFile.name + '\n')
    lines = codFile.readlines()
    for suser in ['LEN', 'MUT', 'JAN']:
        codOut.write('%com\t\t=======\n')
        codOut.write('%com\t\t======= USER: ' + suser + '\n')
        codOut.write('%com\t\t=======\n')
        print ">>> " + suser
        # per-file, per-speaker tables (the total* dicts above accumulate
        # the same quantities across all files)
        conflationVerbCount = {}
        conflationOtherDevicesCount = {}
        conflationOverallCount = {}
        semanticContent = {}
        semConTokens = {}
        tokenFrequence = {}
        tokensV = {}
        tokensE = {}
        tokenFrequenceV = {}
        tokenFrequenceE = {}
        for cline in codLines(lines, suser):
            ## ignore if's if in IGNORE_IF mode
            iftest = getFields(cline, '*')
            if IGNORE_IF and iftest[-1] and iftest[-1].find('{if') >= 0:
                continue
            # verb-type filters (see module docstring: dynamic vs. cause motion)
            if ONLY_CAUSE and not (cline.count('$:VMC') or cline.count('$:VK')): continue
            if ONLY_SPONTAN and not (cline.count('$:VMS') or cline.count('$:VMI')): continue
            ## locus 1: which variable combinations occur in the V field
            ## and in the Other-Devices (E) fields
            verbPattern = getPatternInFields(getFields(cline, 'V'))
            increase(conflationVerbCount, verbPattern)
            increase(totalConflationVerbCount[suser], verbPattern)
            otherDevicesPattern = getPatternInFields(getFields(cline, 'E'), True)
            increase(conflationOtherDevicesCount, otherDevicesPattern)
            increase(totalConflationOtherDevicesCount[suser], otherDevicesPattern)
            ## focus: variable combination over the whole dollar line
            allFields = getFields(cline, '*')
            overallPattern = getPatternInFields(allFields)
            increase(conflationOverallCount, overallPattern)
            increase(totalConflationOverallCount[suser], overallPattern)
            ## locus 2: classify the line as +V/-V x +E/-E depending on
            ## whether any variable occurs in the V resp. E fields
            verbPatternForLocus = getPatternInFields(getFields(cline, 'V'))
            otherDevicesPatternForLocus = getPatternInFields(getFields(cline, 'E'))
            semCon = ""
            if len(verbPatternForLocus) == 0:
                semCon = "-V"
            else:
                semCon = "+V"
            if len(otherDevicesPatternForLocus) == 0:
                semCon += "-E"
            else:
                semCon += "+E"
            semConPattern = {semCon: 1}
            increase(semanticContent, semConPattern)
            increase(totalSemanticContent[suser], semConPattern)
            # semantic-content token combination of this line
            semConTokenCounts = getTokenCounts(cline)
            increaseCounts(semConTokens, semConTokenCounts)
            increaseCounts(totalSemConTokens[suser], semConTokenCounts)
            # token frequency ('+' run length) of this line
            freq = getTokenFrequence(cline)
            increaseFreq(tokenFrequence, freq)
            increaseFreq(totalTokenFrequence[suser], freq)
            # token counts split by E and V fields
            toksV = getTokenCountsVE(getFields(cline, 'V'))
            increaseSetCount(tokensV, toksV)
            increaseSetCount(totalTokensV[suser], toksV)
            toksE = getTokenCountsVE(getFields(cline, 'E'))
            increaseSetCount(tokensE, toksE)
            increaseSetCount(totalTokensE[suser], toksE)
            # token frequencies split by E and V fields
            freqV = getTokenFrequenceVE(getFields(cline, 'V'))
            increaseFreq(tokenFrequenceV, freqV)
            increaseFreq(totalTokenFrequenceV[suser], freqV)
            freqE = getTokenFrequenceVE(getFields(cline, 'E'))
            increaseFreq(tokenFrequenceE, freqE)
            increaseFreq(totalTokenFrequenceE[suser], freqE)
            ## out file: echo the line followed by its derived patterns
            codOut.write(cline)
            codOut.write('%com\t ' + setToStr(verbPattern) + ' \tverbPattern\n')
            codOut.write('%com\t ' + setToStr(otherDevicesPattern) + ' \totherDevicesPattern\n')
            codOut.write('%com\t ' + setToStr(overallPattern) + ' \toverallPattern\n')
            codOut.write('%com\t ' + setToStr(semConPattern) + ' \tsemConPattern\n')
            codOut.write('%com\t ' + setToStr(semConTokens) + ' \tsemConTokens\n')
            codOut.write('%com\t ' + setToStr(tokenFrequence) + ' \ttokenFrequence\n')
            codOut.write('%com\t ' + setToStr(toksE) + ' \ttokensE\n')
            codOut.write('%com\t ' + setToStr(toksV) + ' \ttokensV\n')
        # per-file, per-speaker report on stdout
        print "conflationVerbCount "
        printSetAsTable (conflationVerbCount)
        print "conflationOtherDevicesCount "
        printSetAsTable(conflationOtherDevicesCount)
        print "conflationOverallCount "
        printSetAsTable(conflationOverallCount)
        print "semanticContentCount "
        printSetAsTable(semanticContent)
        print "semanticContentTokens "
        printSetAsTable(semConTokens)
        print "tokenfrequence "
        printSetAsTable(tokenFrequence)
        print "tokensE"
        printSetAsTable(tokensE)
        print "tokensV"
        printSetAsTable(tokensV)
        print "=============================="
    codFile.close()
codOut.close()
print "========= S U M O F A L L F I L E S =====================" | |
print "IGNORE IF \t" + str(IGNORE_IF) | |
print "ONLY CAUSE \t" + str(ONLY_CAUSE) | |
print "ONLY SPONTAN \t" + str(ONLY_SPONTAN) | |
print "===============================================" | |
for suser in ['LEN', 'MUT', 'JAN']: | |
print suser | |
print "total conflationVerbCount " | |
printSetAsTable(totalConflationVerbCount[suser]) | |
print "total otherDevicesPattern " | |
printSetAsTable(totalConflationOtherDevicesCount[suser]) | |
print "total overallCount " | |
printSetAsTable(totalConflationOverallCount[suser]) | |
print "total semantic content count " | |
printSetAsTable(totalSemanticContent[suser]) | |
print "total semantic content tokens " | |
printSetAsTable(totalSemConTokens[suser]) | |
print "total token frequence " | |
printSetAsTable(totalTokenFrequence[suser]) | |
print "tokensE" | |
printSetAsTable(totalTokensE[suser]) | |
print "tokensV" | |
printSetAsTable(totalTokensV[suser]) | |
print "tokenFrequenceE" | |
printSetAsTable(totalTokenFrequenceE[suser]) | |
print "tokenFrqeuneceV" | |
printSetAsTable(totalTokenFrequenceV[suser]) | |
print "-----------------------------" | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment