interrogator · May 19, 2015 13:20
diff --git a/gistfile1.py b/gistfile1.py
 #eugene script

 def eugener(path = 'data/nyt/earlylate', 
            regex = r'(?i)\brisk', 
            depth = 5, 
            top = 10, 
            remove_stopwords = False):
    """ 
    get most frequent words in corpus path to left and right

    """
    import os
    import nltk
    import re
    from StringIO import StringIO
    from collections import Counter
    import pandas as pd
    from dictionaries.stopwords import stopwords as stopwords

    # get list of subcorpora
    dirs = [d for d in os.listdir(path) if os.path.isdir(os.path.join(path, d))]
    # define risk word
    # place for our output
    dfs = {}
    for corpus in dirs:
        print 'Doing %s ... ' % corpus
        # search the corpus for whole sents containing risk word
        pathed = os.path.join(path, corpus)
        results = !./tregex.sh -t -w "/$regex/" $pathed
        # remove blank lines, etc
        results = [result for result in results if re.search(regex, result)]
        # lowercase
        results = [result.lower() for result in results]
        # make into single string
        results = '\n'.join(results)
        # tokenise the string
        results = nltk.word_tokenize(results)
        # a place for info about each corpus
        # go left and right depth times
        all_words = []
        dicts = []
        for i in range(-depth, (depth + 1)):
            newdict = Counter()
            # exclude the 0 iteration, because it will just be risk words
            print 'Depth: %d...' % i
            matching = []

            # go through each token
            for index, token in enumerate(results):
                # if token matches risk expression
                if re.search(regex, token):
                    # get the word at depth index
                    # try statement for cases where the target word index isn't there
                    try:
                        if i < 0:
                            num = index - abs(i)
                            matching.append(results[num])
                        else:
                            matching.append(results[index + i])
                    except:
                        pass
            # tally results
            counted = Counter(matching)
            # remove punctuation etc
            for key in counted:
                if key.isalnum():
                    #if key not in stopwords:
                    if remove_stopwords:
                        if key not in stopwords:
                            newdict[key] = counted[key]
                    else:
                        newdict[key] = counted[key]
            for w in counted.keys():
                all_words.append(w)
            #top_tokens = newdict.most_common(top)
            dicts.append(newdict)
        
        # make pandas series
        print 'Making DataFrame ... '
        sers = []
        for word in list(set(all_words)):
            series = [dct[word] for dct in dicts]
            series.append(sum([dct[word] for dct in dicts]))
            index_names = range(-depth, (depth + 1))
            index_names.append('Total')
            ser = pd.Series(series, index = index_names)
            ser.name = word
            sers.append(ser)
        
        # concatenate series
        df = pd.concat(sers, axis=1)
        # remove zero depth
        #df = df.drop(df.index[depth])
        # sort by total
        tot = df.ix['Total']
        df = df[tot.argsort()[::-1]]
        # just top entries
        df = pd.DataFrame(df[list(df.columns)[:top]])
        #transpose
        dfs[corpus] = df.T
    return dfs

 import pandas as pd

 # Widen pandas' display limits so the 16-column tables print on one line.
 pd.set_option('display.max_columns', 500)
 pd.set_option('display.width', 1000)

 # One DataFrame per subcorpus directory, keyed by directory name.
 data = eugener(depth=7, top=25, remove_stopwords=True)

 # Call-form print behaves the same on Python 2 and Python 3.
 print(data['1995'])

 #            -7  -6  -5  -4  -3  -2   -1     0    1   2   3   4   5   6   7  Total
 # risk       73  59  39  30  21   6    0  4749    0  11  23  26  33  61  72   5203
 # risks      32  22  11  13   6   3    0  1611    0   0   5  12  15  25  30   1785
 # risky      12   8   8   5   4   0    0   684    0   0   1   8   5   9   7    751
 # high        8   8   7   6   4   7  127     0    2   4   8   6   7   5   8    207
 # people     16  17  13  22  24  16    1     0    5  10  24   7  20  13  14    202
 # cancer      4   4   3   1   3   0   47     0    0  26  66  16   9  13   5    197
 # health      6   6   5  10   9   1   91     0    4  14  14   6  11   7  10    194
 # heart       3   3   2   1   2  17    6     0    1  76  38  14   5   5   9    182
 # disease    10   2   4   1   4   0   16     0    0   8  54  42  18   8   6    173
 # risking     1   1   1   0   0   0    0   157    0   0   0   1   2   1   2    166
 # women       6  12  17  22  14   7    0     0    8   5  11  21  20  13   7    163
 # risked      0   0   0   0   0   0    0   141    0   0   0   3   0   3   3    150
 # political   6   6   1   8   8   0   65     0    7  10   6   6   5   5  11    144
 # reduce      3   2   1   3   7  93   22     0    0   0   3   0   1   2   2    139
 # percent    17   7  13  11  12  15    6     0    0   0   9   6  10  17  12    135
 # business    8   3   9  10  10   0    5     0   52   2  11   6   6   8   4    134
 # factors     5   1   3   6   2   0    0     0  110   1   0   3   0   2   1    134
 # losing      2   1   3   2   0   0    0     0   84  26   1   1   2   3   3    128
 # increased   2   2   2   3   1  18   84     0    0   6   1   3   2   1   2    127
 # investors  11  14  14  13  19   1    2     0    4   9  12   8   6   5   6    124
 # greater     4   6   5   4   6   6   63     0    2   3   2   7   7   5   3    123
 # death       1   3   2   2   1   0    6     0    7  29  29  16   7   8   2    113
 # financial   2   7   6   6   2   1   33     0    5  10   6   9   7   6   7    107
 # american   18   8   9   5  11   2    1     0    7  11   8   8   7   6   4    105
 # lives       1   6   6   9   2   7    0     0    3  54   8   5   2   0   2    105 
 
 # Collocate table for the 2013 subcorpus (call-form print works on Python 2 and 3).
 print(data['2013'])

 #             -7  -6  -5  -4  -3   -2   -1     0    1    2    3   4   5   6   7  Total
 # risk        93  92  58  47  37   16    1  6591    0   11   33  49  48  77  96   7249
 # risks       32  24  12  17  24    3    0  2217    1    2   22  12  23  29  26   2444
 # risky        4   4   8   1   3    2    0   391    0    2    2   3   6   4   2    432
 # cancer      10   9  20  16   3    3   39     0    1   35  113  38  29  19  11    346
 # people      18  24  18  43  43   32    1     0   10   19   23  17  27  22  24    321
 # heart        8  15  14   8   3   18   14     0    0  119   32  14  10   9   5    269
 # high         9   7   6   6   8    8  136     0    0   17   13  12  13   7  22    264
 # increased    3   6   5   1   3   40  160     0    0    3    4   8   5   7   4    249
 # disease      7   9  13   7   4    2   21     0    0    9  102  41  17   7   9    248
 # health      20   7  13   9  10    7  110     0    1   11   17   9  11   6   7    238
 # risked       0   0   0   0   0    0    0   213    0    0    3   2   2   2   6    228
 # percent     12  17   9  14   5   44   38     0    0    0   11  12  20  15  26    223
 # reduce       3   2   2  10  10  125   32     0    0    0    0   1   5   2   0    192
 # factors      2   1   8   4   6    1    0     0  144    0    3   0   4   6   3    182
 # financial    8  10   9  15   6    4   51     0    5   10   21   6  16  14   7    182
 # higher      12   1   9   2  16    8   82     0    2    8    5   7  12  11   2    177
 # risking      4   2   1   0   0    0    0   165    0    0    0   0   1   3   0    176
 # potential    3   5   7   6   3   20   73     0    1   14   10   8   8   5   5    168
 # women       14   8  23  17   7   13    0     0    5   10    8   9  16  14  15    159
 # bank        15  11  13   9  15   18    5     0    1    7   12  16  10  10  13    155
 # greater      3   3   5   4   7   20   85     0    1    5    3   5   4   3   4    152
 # political    5   3   5   7   4    0   90     0    0   10    5   8   1   7   6    151
 # banks       10  17  13  14  11    8    1     0    0   12   10  17  17  11   9    150
 # management   4   6   1   0   2    4    0     0  100    0   12   5   5   3   6    148
 # breast       4  11  10   3   1   14    0     0    0   60   21   6   7   5   5    147
 
 
 # Same table rendered as a LaTeX tabular (call-form print works on Python 2 and 3).
 print(data['2013'].to_latex())

 # \begin{tabular}{lrrrrrrrrrrrrrrrr}
 # \toprule
 # {} &  -7 &  -6 &  -5 &  -4 &  -3 &   -2 &   -1 &     0 &    1 &    2 &    3 &   4 &   5 &   6 &   7 &  Total \\
 # \midrule
 # risk       &  93 &  92 &  58 &  47 &  37 &   16 &    1 &  6591 &    0 &   11 &   33 &  49 &  48 &  77 &  96 &   7249 \\
 # risks      &  32 &  24 &  12 &  17 &  24 &    3 &    0 &  2217 &    1 &    2 &   22 &  12 &  23 &  29 &  26 &   2444 \\
 # risky      &   4 &   4 &   8 &   1 &   3 &    2 &    0 &   391 &    0 &    2 &    2 &   3 &   6 &   4 &   2 &    432 \\
 # cancer     &  10 &   9 &  20 &  16 &   3 &    3 &   39 &     0 &    1 &   35 &  113 &  38 &  29 &  19 &  11 &    346 \\
 # people     &  18 &  24 &  18 &  43 &  43 &   32 &    1 &     0 &   10 &   19 &   23 &  17 &  27 &  22 &  24 &    321 \\
 # heart      &   8 &  15 &  14 &   8 &   3 &   18 &   14 &     0 &    0 &  119 &   32 &  14 &  10 &   9 &   5 &    269 \\
 # high       &   9 &   7 &   6 &   6 &   8 &    8 &  136 &     0 &    0 &   17 &   13 &  12 &  13 &   7 &  22 &    264 \\
 # increased  &   3 &   6 &   5 &   1 &   3 &   40 &  160 &     0 &    0 &    3 &    4 &   8 &   5 &   7 &   4 &    249 \\
 # disease    &   7 &   9 &  13 &   7 &   4 &    2 &   21 &     0 &    0 &    9 &  102 &  41 &  17 &   7 &   9 &    248 \\
 # health     &  20 &   7 &  13 &   9 &  10 &    7 &  110 &     0 &    1 &   11 &   17 &   9 &  11 &   6 &   7 &    238 \\
 # risked     &   0 &   0 &   0 &   0 &   0 &    0 &    0 &   213 &    0 &    0 &    3 &   2 &   2 &   2 &   6 &    228 \\
 # percent    &  12 &  17 &   9 &  14 &   5 &   44 &   38 &     0 &    0 &    0 &   11 &  12 &  20 &  15 &  26 &    223 \\
 # reduce     &   3 &   2 &   2 &  10 &  10 &  125 &   32 &     0 &    0 &    0 &    0 &   1 &   5 &   2 &   0 &    192 \\
 # factors    &   2 &   1 &   8 &   4 &   6 &    1 &    0 &     0 &  144 &    0 &    3 &   0 &   4 &   6 &   3 &    182 \\
 # financial  &   8 &  10 &   9 &  15 &   6 &    4 &   51 &     0 &    5 &   10 &   21 &   6 &  16 &  14 &   7 &    182 \\
 # higher     &  12 &   1 &   9 &   2 &  16 &    8 &   82 &     0 &    2 &    8 &    5 &   7 &  12 &  11 &   2 &    177 \\
 # risking    &   4 &   2 &   1 &   0 &   0 &    0 &    0 &   165 &    0 &    0 &    0 &   0 &   1 &   3 &   0 &    176 \\
 # potential  &   3 &   5 &   7 &   6 &   3 &   20 &   73 &     0 &    1 &   14 &   10 &   8 &   8 &   5 &   5 &    168 \\
 # women      &  14 &   8 &  23 &  17 &   7 &   13 &    0 &     0 &    5 &   10 &    8 &   9 &  16 &  14 &  15 &    159 \\
 # bank       &  15 &  11 &  13 &   9 &  15 &   18 &    5 &     0 &    1 &    7 &   12 &  16 &  10 &  10 &  13 &    155 \\
 # greater    &   3 &   3 &   5 &   4 &   7 &   20 &   85 &     0 &    1 &    5 &    3 &   5 &   4 &   3 &   4 &    152 \\
 # political  &   5 &   3 &   5 &   7 &   4 &    0 &   90 &     0 &    0 &   10 &    5 &   8 &   1 &   7 &   6 &    151 \\
 # banks      &  10 &  17 &  13 &  14 &  11 &    8 &    1 &     0 &    0 &   12 &   10 &  17 &  17 &  11 &   9 &    150 \\
 # management &   4 &   6 &   1 &   0 &   2 &    4 &    0 &     0 &  100 &    0 &   12 &   5 &   5 &   3 &   6 &    148 \\
 # breast     &   4 &  11 &  10 &   3 &   1 &   14 &    0 &     0 &    0 &   60 &   21 &   6 &   7 &   5 &   5 &    147 \\
 # \bottomrule
 # \end{tabular}
 
 
 # Same table rendered as CSV text (call-form print works on Python 2 and 3).
 print(data['2013'].to_csv())

 # ,-7,-6,-5,-4,-3,-2,-1,0,1,2,3,4,5,6,7,Total
 # risk,93,92,58,47,37,16,1,6591,0,11,33,49,48,77,96,7249
 # risks,32,24,12,17,24,3,0,2217,1,2,22,12,23,29,26,2444
 # risky,4,4,8,1,3,2,0,391,0,2,2,3,6,4,2,432
 # cancer,10,9,20,16,3,3,39,0,1,35,113,38,29,19,11,346
 # people,18,24,18,43,43,32,1,0,10,19,23,17,27,22,24,321
 # heart,8,15,14,8,3,18,14,0,0,119,32,14,10,9,5,269
 # high,9,7,6,6,8,8,136,0,0,17,13,12,13,7,22,264
 # increased,3,6,5,1,3,40,160,0,0,3,4,8,5,7,4,249
 # disease,7,9,13,7,4,2,21,0,0,9,102,41,17,7,9,248
 # health,20,7,13,9,10,7,110,0,1,11,17,9,11,6,7,238
 # risked,0,0,0,0,0,0,0,213,0,0,3,2,2,2,6,228
 # percent,12,17,9,14,5,44,38,0,0,0,11,12,20,15,26,223
 # reduce,3,2,2,10,10,125,32,0,0,0,0,1,5,2,0,192
 # factors,2,1,8,4,6,1,0,0,144,0,3,0,4,6,3,182
 # financial,8,10,9,15,6,4,51,0,5,10,21,6,16,14,7,182
 # higher,12,1,9,2,16,8,82,0,2,8,5,7,12,11,2,177
 # risking,4,2,1,0,0,0,0,165,0,0,0,0,1,3,0,176
 # potential,3,5,7,6,3,20,73,0,1,14,10,8,8,5,5,168
 # women,14,8,23,17,7,13,0,0,5,10,8,9,16,14,15,159
 # bank,15,11,13,9,15,18,5,0,1,7,12,16,10,10,13,155
 # greater,3,3,5,4,7,20,85,0,1,5,3,5,4,3,4,152
 # political,5,3,5,7,4,0,90,0,0,10,5,8,1,7,6,151
 # banks,10,17,13,14,11,8,1,0,0,12,10,17,17,11,9,150
 # management,4,6,1,0,2,4,0,0,100,0,12,5,5,3,6,148
 # breast,4,11,10,3,1,14,0,0,0,60,21,6,7,5,5,147
	#eugene script

	def eugener(path = 'data/nyt/earlylate',
	regex = r'(?i)\brisk',
	depth = 5,
	top = 10,
	remove_stopwords = False):
	"""
	get most frequent words in corpus path to left and right

	"""
	import os
	import nltk
	import re
	from StringIO import StringIO
	from collections import Counter
	import pandas as pd
	from dictionaries.stopwords import stopwords as stopwords

	# get list of subcorpora
	dirs = [d for d in os.listdir(path) if os.path.isdir(os.path.join(path, d))]
	# define risk word
	# place for our output
	dfs = {}
	for corpus in dirs:
	print 'Doing %s ... ' % corpus
	# search the corpus for whole sents containing risk word
	pathed = os.path.join(path, corpus)
	results = !./tregex.sh -t -w "/$regex/" $pathed
	# remove blank lines, etc
	results = [result for result in results if re.search(regex, result)]
	# lowercase
	results = [result.lower() for result in results]
	# make into single string
	results = '\n'.join(results)
	# tokenise the string
	results = nltk.word_tokenize(results)
	# a place for info about each corpus
	# go left and right depth times
	all_words = []
	dicts = []
	for i in range(-depth, (depth + 1)):
	newdict = Counter()
	# exclude the 0 iteration, because it will just be risk words
	print 'Depth: %d...' % i
	matching = []

	# go through each token
	for index, token in enumerate(results):
	# if token matches risk expression
	if re.search(regex, token):
	# get the word at depth index
	# try statement for cases where the target word index isn't there
	try:
	if i < 0:
	num = index - abs(i)
	matching.append(results[num])
	else:
	matching.append(results[index + i])
	except:
	pass
	# tally results
	counted = Counter(matching)
	# remove punctuation etc
	for key in counted:
	if key.isalnum():
	#if key not in stopwords:
	if remove_stopwords:
	if key not in stopwords:
	newdict[key] = counted[key]
	else:
	newdict[key] = counted[key]
	for w in counted.keys():
	all_words.append(w)
	#top_tokens = newdict.most_common(top)
	dicts.append(newdict)

	# make pandas series
	print 'Making DataFrame ... '
	sers = []
	for word in list(set(all_words)):
	series = [dct[word] for dct in dicts]
	series.append(sum([dct[word] for dct in dicts]))
	index_names = range(-depth, (depth + 1))
	index_names.append('Total')
	ser = pd.Series(series, index = index_names)
	ser.name = word
	sers.append(ser)

	# concatenate series
	df = pd.concat(sers, axis=1)
	# remove zero depth
	#df = df.drop(df.index[depth])
	# sort by total
	tot = df.ix['Total']
	df = df[tot.argsort()[::-1]]
	# just top entries
	df = pd.DataFrame(df[list(df.columns)[:top]])
	#transpose
	dfs[corpus] = df.T
	return dfs

	import pandas as pd

	# Widen pandas' display limits so the 16-column tables print on one line.
	pd.set_option('display.max_columns', 500)
	pd.set_option('display.width', 1000)

	# One DataFrame per subcorpus directory, keyed by directory name.
	data = eugener(depth=7, top=25, remove_stopwords=True)

	# Call-form print behaves the same on Python 2 and Python 3.
	print(data['1995'])

	# -7 -6 -5 -4 -3 -2 -1 0 1 2 3 4 5 6 7 Total
	# risk 73 59 39 30 21 6 0 4749 0 11 23 26 33 61 72 5203
	# risks 32 22 11 13 6 3 0 1611 0 0 5 12 15 25 30 1785
	# risky 12 8 8 5 4 0 0 684 0 0 1 8 5 9 7 751
	# high 8 8 7 6 4 7 127 0 2 4 8 6 7 5 8 207
	# people 16 17 13 22 24 16 1 0 5 10 24 7 20 13 14 202
	# cancer 4 4 3 1 3 0 47 0 0 26 66 16 9 13 5 197
	# health 6 6 5 10 9 1 91 0 4 14 14 6 11 7 10 194
	# heart 3 3 2 1 2 17 6 0 1 76 38 14 5 5 9 182
	# disease 10 2 4 1 4 0 16 0 0 8 54 42 18 8 6 173
	# risking 1 1 1 0 0 0 0 157 0 0 0 1 2 1 2 166
	# women 6 12 17 22 14 7 0 0 8 5 11 21 20 13 7 163
	# risked 0 0 0 0 0 0 0 141 0 0 0 3 0 3 3 150
	# political 6 6 1 8 8 0 65 0 7 10 6 6 5 5 11 144
	# reduce 3 2 1 3 7 93 22 0 0 0 3 0 1 2 2 139
	# percent 17 7 13 11 12 15 6 0 0 0 9 6 10 17 12 135
	# business 8 3 9 10 10 0 5 0 52 2 11 6 6 8 4 134
	# factors 5 1 3 6 2 0 0 0 110 1 0 3 0 2 1 134
	# losing 2 1 3 2 0 0 0 0 84 26 1 1 2 3 3 128
	# increased 2 2 2 3 1 18 84 0 0 6 1 3 2 1 2 127
	# investors 11 14 14 13 19 1 2 0 4 9 12 8 6 5 6 124
	# greater 4 6 5 4 6 6 63 0 2 3 2 7 7 5 3 123
	# death 1 3 2 2 1 0 6 0 7 29 29 16 7 8 2 113
	# financial 2 7 6 6 2 1 33 0 5 10 6 9 7 6 7 107
	# american 18 8 9 5 11 2 1 0 7 11 8 8 7 6 4 105
	# lives 1 6 6 9 2 7 0 0 3 54 8 5 2 0 2 105

	# Collocate table for the 2013 subcorpus (call-form print works on Python 2 and 3).
	print(data['2013'])

	# -7 -6 -5 -4 -3 -2 -1 0 1 2 3 4 5 6 7 Total
	# risk 93 92 58 47 37 16 1 6591 0 11 33 49 48 77 96 7249
	# risks 32 24 12 17 24 3 0 2217 1 2 22 12 23 29 26 2444
	# risky 4 4 8 1 3 2 0 391 0 2 2 3 6 4 2 432
	# cancer 10 9 20 16 3 3 39 0 1 35 113 38 29 19 11 346
	# people 18 24 18 43 43 32 1 0 10 19 23 17 27 22 24 321
	# heart 8 15 14 8 3 18 14 0 0 119 32 14 10 9 5 269
	# high 9 7 6 6 8 8 136 0 0 17 13 12 13 7 22 264
	# increased 3 6 5 1 3 40 160 0 0 3 4 8 5 7 4 249
	# disease 7 9 13 7 4 2 21 0 0 9 102 41 17 7 9 248
	# health 20 7 13 9 10 7 110 0 1 11 17 9 11 6 7 238
	# risked 0 0 0 0 0 0 0 213 0 0 3 2 2 2 6 228
	# percent 12 17 9 14 5 44 38 0 0 0 11 12 20 15 26 223
	# reduce 3 2 2 10 10 125 32 0 0 0 0 1 5 2 0 192
	# factors 2 1 8 4 6 1 0 0 144 0 3 0 4 6 3 182
	# financial 8 10 9 15 6 4 51 0 5 10 21 6 16 14 7 182
	# higher 12 1 9 2 16 8 82 0 2 8 5 7 12 11 2 177
	# risking 4 2 1 0 0 0 0 165 0 0 0 0 1 3 0 176
	# potential 3 5 7 6 3 20 73 0 1 14 10 8 8 5 5 168
	# women 14 8 23 17 7 13 0 0 5 10 8 9 16 14 15 159
	# bank 15 11 13 9 15 18 5 0 1 7 12 16 10 10 13 155
	# greater 3 3 5 4 7 20 85 0 1 5 3 5 4 3 4 152
	# political 5 3 5 7 4 0 90 0 0 10 5 8 1 7 6 151
	# banks 10 17 13 14 11 8 1 0 0 12 10 17 17 11 9 150
	# management 4 6 1 0 2 4 0 0 100 0 12 5 5 3 6 148
	# breast 4 11 10 3 1 14 0 0 0 60 21 6 7 5 5 147


	# Same table rendered as a LaTeX tabular (call-form print works on Python 2 and 3).
	print(data['2013'].to_latex())

	# \begin{tabular}{lrrrrrrrrrrrrrrrr}
	# \toprule
	# {} & -7 & -6 & -5 & -4 & -3 & -2 & -1 & 0 & 1 & 2 & 3 & 4 & 5 & 6 & 7 & Total \\
	# \midrule
	# risk & 93 & 92 & 58 & 47 & 37 & 16 & 1 & 6591 & 0 & 11 & 33 & 49 & 48 & 77 & 96 & 7249 \\
	# risks & 32 & 24 & 12 & 17 & 24 & 3 & 0 & 2217 & 1 & 2 & 22 & 12 & 23 & 29 & 26 & 2444 \\
	# risky & 4 & 4 & 8 & 1 & 3 & 2 & 0 & 391 & 0 & 2 & 2 & 3 & 6 & 4 & 2 & 432 \\
	# cancer & 10 & 9 & 20 & 16 & 3 & 3 & 39 & 0 & 1 & 35 & 113 & 38 & 29 & 19 & 11 & 346 \\
	# people & 18 & 24 & 18 & 43 & 43 & 32 & 1 & 0 & 10 & 19 & 23 & 17 & 27 & 22 & 24 & 321 \\
	# heart & 8 & 15 & 14 & 8 & 3 & 18 & 14 & 0 & 0 & 119 & 32 & 14 & 10 & 9 & 5 & 269 \\
	# high & 9 & 7 & 6 & 6 & 8 & 8 & 136 & 0 & 0 & 17 & 13 & 12 & 13 & 7 & 22 & 264 \\
	# increased & 3 & 6 & 5 & 1 & 3 & 40 & 160 & 0 & 0 & 3 & 4 & 8 & 5 & 7 & 4 & 249 \\
	# disease & 7 & 9 & 13 & 7 & 4 & 2 & 21 & 0 & 0 & 9 & 102 & 41 & 17 & 7 & 9 & 248 \\
	# health & 20 & 7 & 13 & 9 & 10 & 7 & 110 & 0 & 1 & 11 & 17 & 9 & 11 & 6 & 7 & 238 \\
	# risked & 0 & 0 & 0 & 0 & 0 & 0 & 0 & 213 & 0 & 0 & 3 & 2 & 2 & 2 & 6 & 228 \\
	# percent & 12 & 17 & 9 & 14 & 5 & 44 & 38 & 0 & 0 & 0 & 11 & 12 & 20 & 15 & 26 & 223 \\
	# reduce & 3 & 2 & 2 & 10 & 10 & 125 & 32 & 0 & 0 & 0 & 0 & 1 & 5 & 2 & 0 & 192 \\
	# factors & 2 & 1 & 8 & 4 & 6 & 1 & 0 & 0 & 144 & 0 & 3 & 0 & 4 & 6 & 3 & 182 \\
	# financial & 8 & 10 & 9 & 15 & 6 & 4 & 51 & 0 & 5 & 10 & 21 & 6 & 16 & 14 & 7 & 182 \\
	# higher & 12 & 1 & 9 & 2 & 16 & 8 & 82 & 0 & 2 & 8 & 5 & 7 & 12 & 11 & 2 & 177 \\
	# risking & 4 & 2 & 1 & 0 & 0 & 0 & 0 & 165 & 0 & 0 & 0 & 0 & 1 & 3 & 0 & 176 \\
	# potential & 3 & 5 & 7 & 6 & 3 & 20 & 73 & 0 & 1 & 14 & 10 & 8 & 8 & 5 & 5 & 168 \\
	# women & 14 & 8 & 23 & 17 & 7 & 13 & 0 & 0 & 5 & 10 & 8 & 9 & 16 & 14 & 15 & 159 \\
	# bank & 15 & 11 & 13 & 9 & 15 & 18 & 5 & 0 & 1 & 7 & 12 & 16 & 10 & 10 & 13 & 155 \\
	# greater & 3 & 3 & 5 & 4 & 7 & 20 & 85 & 0 & 1 & 5 & 3 & 5 & 4 & 3 & 4 & 152 \\
	# political & 5 & 3 & 5 & 7 & 4 & 0 & 90 & 0 & 0 & 10 & 5 & 8 & 1 & 7 & 6 & 151 \\
	# banks & 10 & 17 & 13 & 14 & 11 & 8 & 1 & 0 & 0 & 12 & 10 & 17 & 17 & 11 & 9 & 150 \\
	# management & 4 & 6 & 1 & 0 & 2 & 4 & 0 & 0 & 100 & 0 & 12 & 5 & 5 & 3 & 6 & 148 \\
	# breast & 4 & 11 & 10 & 3 & 1 & 14 & 0 & 0 & 0 & 60 & 21 & 6 & 7 & 5 & 5 & 147 \\
	# \bottomrule
	# \end{tabular}


	# Same table rendered as CSV text (call-form print works on Python 2 and 3).
	print(data['2013'].to_csv())

	# ,-7,-6,-5,-4,-3,-2,-1,0,1,2,3,4,5,6,7,Total
	# risk,93,92,58,47,37,16,1,6591,0,11,33,49,48,77,96,7249
	# risks,32,24,12,17,24,3,0,2217,1,2,22,12,23,29,26,2444
	# risky,4,4,8,1,3,2,0,391,0,2,2,3,6,4,2,432
	# cancer,10,9,20,16,3,3,39,0,1,35,113,38,29,19,11,346
	# people,18,24,18,43,43,32,1,0,10,19,23,17,27,22,24,321
	# heart,8,15,14,8,3,18,14,0,0,119,32,14,10,9,5,269
	# high,9,7,6,6,8,8,136,0,0,17,13,12,13,7,22,264
	# increased,3,6,5,1,3,40,160,0,0,3,4,8,5,7,4,249
	# disease,7,9,13,7,4,2,21,0,0,9,102,41,17,7,9,248
	# health,20,7,13,9,10,7,110,0,1,11,17,9,11,6,7,238
	# risked,0,0,0,0,0,0,0,213,0,0,3,2,2,2,6,228
	# percent,12,17,9,14,5,44,38,0,0,0,11,12,20,15,26,223
	# reduce,3,2,2,10,10,125,32,0,0,0,0,1,5,2,0,192
	# factors,2,1,8,4,6,1,0,0,144,0,3,0,4,6,3,182
	# financial,8,10,9,15,6,4,51,0,5,10,21,6,16,14,7,182
	# higher,12,1,9,2,16,8,82,0,2,8,5,7,12,11,2,177
	# risking,4,2,1,0,0,0,0,165,0,0,0,0,1,3,0,176
	# potential,3,5,7,6,3,20,73,0,1,14,10,8,8,5,5,168
	# women,14,8,23,17,7,13,0,0,5,10,8,9,16,14,15,159
	# bank,15,11,13,9,15,18,5,0,1,7,12,16,10,10,13,155
	# greater,3,3,5,4,7,20,85,0,1,5,3,5,4,3,4,152
	# political,5,3,5,7,4,0,90,0,0,10,5,8,1,7,6,151
	# banks,10,17,13,14,11,8,1,0,0,12,10,17,17,11,9,150
	# management,4,6,1,0,2,4,0,0,100,0,12,5,5,3,6,148
	# breast,4,11,10,3,1,14,0,0,0,60,21,6,7,5,5,147