thiagomarzagao · December 18, 2015 07:19
diff --git a/wordscores.py b/wordscores.py
 ### WORDSCORES (LBG-2003)
 ### author: Thiago Marzagao
 ### contact: marzagao ddott 1 at osu ddott edu

 import os
 import numpy as np
 import pandas as pd

 # input and output locations
 ipath = '/Users/username/inputdata/' # folder containing the CSV files
 opath = '/Users/username/outputdata/' # folder where output will be saved

 # reference cases and their scores (placeholder values - replace with your own)
 referenceScores = {'referenceCase1': 1.1,
                    'referenceCase2': 3.7,
                    'referenceCase3': 8.2}

 # one-row table: one column per reference case, a single row labelled 'score'
 Ar = pd.DataFrame(referenceScores, index = ['score'])

 # create function to load and merge data
 def loadData(caseSet, path, cols):
     '''
     iterable, string, dict -> pandas.DataFrame

     Read one header-less CSV file per case and outer-merge them on 'word'.

     caseSet: case names or file names; '.csv' is appended to every entry
              that does not already end with it
     path:    folder holding the CSV files (trailing separator optional)
     cols:    dict {column position in the file: dtype}; its keys are passed
              to read_csv as usecols and the dict itself as dtype. The two
              loaded columns are renamed to 'word' and the case name.

     Returns a DataFrame with a 'word' column plus one column per case;
     cells left empty by the outer merge are filled with 0.
     '''
     suffix = '.csv'
     output = pd.DataFrame(columns = ['word'])
     for case in caseSet:

         # accept both 'case' and 'case.csv'; only the ending is tested, so a
         # case name that merely contains '.csv' is not taken for a file name
         if case.endswith(suffix):
             fileName = case
             caseName = case[:-len(suffix)]
         else:
             fileName = case + suffix
             caseName = case

         # load new data file
         newData = pd.read_csv(os.path.join(path, fileName),
                               usecols = list(cols),
                               dtype = cols,
                               names = ['word', caseName],
                               header = None)

         # merge with previous data
         output = pd.merge(output, newData, on = 'word', how = 'outer')
         output = output.fillna(0) # kill NaNs

     return output

 # load reference data (column 0 -> word, column 2 -> frequency of the word)
 Fwr = loadData(Ar.keys(), ipath, {0: 'S30', 2: 'float'})

 # compute p(r|w) = f_wr / sum(f_wr)_{for all r}
 # only the frequency columns enter the row sums: the leading 'word' column
 # holds strings, and summing across it is an error on pandas versions that
 # no longer drop non-numeric columns silently
 refFreq = Fwr.iloc[:, 1:]
 Pwr = refFreq.div(refFreq.sum(axis = 1), axis = 0)

 # compute Sw and save to file
 # (the dot product is a one-column frame named 'score'; take that column
 # explicitly instead of assigning a whole DataFrame to a single column)
 Sw = pd.DataFrame(Fwr.word)
 Sw['score'] = Pwr.dot(Ar.T)['score']
 Sw.to_csv(opath + 'wordscores.csv', index = False)

 # load virgin data: every CSV file in ipath that is not a reference case
 # (other directory entries, e.g. hidden files or sub-folders, are skipped
 # because loadData would try to open them as '<name>.csv'; sorting makes
 # the order of the cases independent of the order os.listdir returns)
 virginSet = sorted(fileName for fileName in os.listdir(ipath)
                    if fileName.endswith('.csv')
                    and fileName[:-len('.csv')] not in Ar.keys())

 # column 1 is loaded as int (absolute frequencies), column 2 as float
 virginAbsFreq = loadData(virginSet, ipath, {0: 'S30', 1: 'int'})
 Fwv = loadData(virginSet, ipath, {0: 'S30', 2: 'float'})

 # inner-join Fwv with Sw on 'word' (words missing from either side are dropped)
 temp = pd.merge(Fwv, Sw, on = 'word', how = 'inner')

 # word scores of the surviving words, as a one-column frame
 cleanSw = temp[['score']]

 # frequencies of the surviving words: every column except 'word' and 'score'
 cleanFwv = temp.drop(['word', 'score'], axis = 1)

 # compute Sv = sum(Fwv * Sw)_{for all w}
 Sv = cleanFwv.T.dot(cleanSw)

 # compute transformed Sv: centre on the mean of Sv, rescale by the ratio of
 # the reference scores' standard deviation to that of Sv, add the mean back
 rescale = Ar.T.std() / Sv.std()
 Sv_t = (Sv - Sv.mean()) * rescale + Sv.mean()

 # compute Vv = sum(Fwv * (Sw - Sv)^2)_{for all w}
 # (a column of word scores minus a row of text scores broadcasts to a
 # words x texts array of deviations)
 deviation = np.array(cleanSw) - np.array(Sv.T)
 Vv = (cleanFwv * np.square(deviation)).sum(axis = 0)

 # inner-join absolute frequencies with Sw (again dropping disjoint words)
 temp = pd.merge(virginAbsFreq, Sw, on = 'word', how = 'inner')

 # compute N: per-case column sums of what remains after the join
 N = temp.drop(['word', 'score'], axis = 1).sum(axis = 0)

 # compute standard errors and confidence intervals (score +/- 1.96 std errors)
 std_error = np.sqrt(Vv / N)
 halfWidth = np.array(1.96 * std_error)
 rawScores = np.array(Sv).flatten()
 lower = rawScores - halfWidth
 upper = rawScores + halfWidth

 # compute transformed confidence intervals
 # (same formula as for Sv_t: centre on the mean of Sv, rescale, add it back)
 center = np.array(Sv.mean())
 stretch = np.array(Ar.T.std() / Sv.std())
 lower_t = (np.array(lower) - center) * stretch + center
 upper_t = (np.array(upper) - center) * stretch + center

 # print everything
 # (single-argument print(...) calls give the same output under the Python 2
 # print statement and the Python 3 print function)
 print('')
 print('Original scores (w/ 95CI):')
 print('')
 # the interval bounds are attached only now, after every statistic that
 # reads Sv as a one-column frame has been computed
 Sv['lower'] = lower
 Sv['upper'] = upper
 print(Sv)
 print('')
 print('Transformed scores (w/ 95CI):')
 print('')
 Sv_t['lower'] = lower_t
 Sv_t['upper'] = upper_t
 print(Sv_t)
 print('')

 # save transformed estimates to file (the index is written as column 'case')
 Sv_t.to_csv(opath + 'virginScores.csv', index_label = 'case')
	### WORDSCORES (LBG-2003)
	### author: Thiago Marzagao
	### contact: marzagao ddott 1 at osu ddott edu

	import os
	import numpy as np
	import pandas as pd

	# input and output locations
	ipath = '/Users/username/inputdata/' # folder containing the CSV files
	opath = '/Users/username/outputdata/' # folder where output will be saved

	# reference cases and their scores (placeholder values - replace with your own)
	referenceScores = {'referenceCase1': 1.1,
	                   'referenceCase2': 3.7,
	                   'referenceCase3': 8.2}

	# one-row table: one column per reference case, a single row labelled 'score'
	Ar = pd.DataFrame(referenceScores, index = ['score'])

	# create function to load and merge data
	def loadData(caseSet, path, cols):
	    '''
	    iterable, string, dict -> pandas.DataFrame

	    Read one header-less CSV file per case and outer-merge them on 'word'.

	    caseSet: case names or file names; '.csv' is appended to every entry
	             that does not already end with it
	    path:    folder holding the CSV files (trailing separator optional)
	    cols:    dict {column position in the file: dtype}; its keys are passed
	             to read_csv as usecols and the dict itself as dtype. The two
	             loaded columns are renamed to 'word' and the case name.

	    Returns a DataFrame with a 'word' column plus one column per case;
	    cells left empty by the outer merge are filled with 0.
	    '''
	    suffix = '.csv'
	    output = pd.DataFrame(columns = ['word'])
	    for case in caseSet:

	        # accept both 'case' and 'case.csv'; only the ending is tested, so a
	        # case name that merely contains '.csv' is not taken for a file name
	        if case.endswith(suffix):
	            fileName = case
	            caseName = case[:-len(suffix)]
	        else:
	            fileName = case + suffix
	            caseName = case

	        # load new data file
	        newData = pd.read_csv(os.path.join(path, fileName),
	                              usecols = list(cols),
	                              dtype = cols,
	                              names = ['word', caseName],
	                              header = None)

	        # merge with previous data
	        output = pd.merge(output, newData, on = 'word', how = 'outer')
	        output = output.fillna(0) # kill NaNs

	    return output

	# load reference data (column 0 -> word, column 2 -> frequency of the word)
	Fwr = loadData(Ar.keys(), ipath, {0: 'S30', 2: 'float'})

	# compute p(r|w) = f_wr / sum(f_wr)_{for all r}
	# only the frequency columns enter the row sums: the leading 'word' column
	# holds strings, and summing across it is an error on pandas versions that
	# no longer drop non-numeric columns silently
	refFreq = Fwr.iloc[:, 1:]
	Pwr = refFreq.div(refFreq.sum(axis = 1), axis = 0)

	# compute Sw and save to file
	# (the dot product is a one-column frame named 'score'; take that column
	# explicitly instead of assigning a whole DataFrame to a single column)
	Sw = pd.DataFrame(Fwr.word)
	Sw['score'] = Pwr.dot(Ar.T)['score']
	Sw.to_csv(opath + 'wordscores.csv', index = False)

	# load virgin data: every CSV file in ipath that is not a reference case
	# (other directory entries, e.g. hidden files or sub-folders, are skipped
	# because loadData would try to open them as '<name>.csv'; sorting makes
	# the order of the cases independent of the order os.listdir returns)
	virginSet = sorted(fileName for fileName in os.listdir(ipath)
	                   if fileName.endswith('.csv')
	                   and fileName[:-len('.csv')] not in Ar.keys())

	# column 1 is loaded as int (absolute frequencies), column 2 as float
	virginAbsFreq = loadData(virginSet, ipath, {0: 'S30', 1: 'int'})
	Fwv = loadData(virginSet, ipath, {0: 'S30', 2: 'float'})

	# inner-join Fwv with Sw on 'word' (words missing from either side are dropped)
	temp = pd.merge(Fwv, Sw, on = 'word', how = 'inner')

	# word scores of the surviving words, as a one-column frame
	cleanSw = temp[['score']]

	# frequencies of the surviving words: every column except 'word' and 'score'
	cleanFwv = temp.drop(['word', 'score'], axis = 1)

	# compute Sv = sum(Fwv * Sw)_{for all w}
	Sv = cleanFwv.T.dot(cleanSw)

	# compute transformed Sv: centre on the mean of Sv, rescale by the ratio of
	# the reference scores' standard deviation to that of Sv, add the mean back
	rescale = Ar.T.std() / Sv.std()
	Sv_t = (Sv - Sv.mean()) * rescale + Sv.mean()

	# compute Vv = sum(Fwv * (Sw - Sv)^2)_{for all w}
	# (a column of word scores minus a row of text scores broadcasts to a
	# words x texts array of deviations)
	deviation = np.array(cleanSw) - np.array(Sv.T)
	Vv = (cleanFwv * np.square(deviation)).sum(axis = 0)

	# inner-join absolute frequencies with Sw (again dropping disjoint words)
	temp = pd.merge(virginAbsFreq, Sw, on = 'word', how = 'inner')

	# compute N: per-case column sums of what remains after the join
	N = temp.drop(['word', 'score'], axis = 1).sum(axis = 0)

	# compute standard errors and confidence intervals (score +/- 1.96 std errors)
	std_error = np.sqrt(Vv / N)
	halfWidth = np.array(1.96 * std_error)
	rawScores = np.array(Sv).flatten()
	lower = rawScores - halfWidth
	upper = rawScores + halfWidth

	# compute transformed confidence intervals
	# (same formula as for Sv_t: centre on the mean of Sv, rescale, add it back)
	center = np.array(Sv.mean())
	stretch = np.array(Ar.T.std() / Sv.std())
	lower_t = (np.array(lower) - center) * stretch + center
	upper_t = (np.array(upper) - center) * stretch + center

	# print everything
	# (single-argument print(...) calls give the same output under the Python 2
	# print statement and the Python 3 print function)
	print('')
	print('Original scores (w/ 95CI):')
	print('')
	# the interval bounds are attached only now, after every statistic that
	# reads Sv as a one-column frame has been computed
	Sv['lower'] = lower
	Sv['upper'] = upper
	print(Sv)
	print('')
	print('Transformed scores (w/ 95CI):')
	print('')
	Sv_t['lower'] = lower_t
	Sv_t['upper'] = upper_t
	print(Sv_t)
	print('')

	# save transformed estimates to file (the index is written as column 'case')
	Sv_t.to_csv(opath + 'virginScores.csv', index_label = 'case')