interrogator · August 29, 2015 14:21
diff --git a/gistfile1.py b/gistfile1.py
 def eugener(path = 'data/nyt/earlylate', depth = 5, top = 10):
    import os
    import nltk
    import re
    from collections import Counter
    import pandas as pd
    # get list of subcorpora
    dirs = [d for d in os.listdir(path) if os.path.isdir(os.path.join(path, d))]
    # define risk word
    regex = r'(?i)\brisk'
    # place for our output
    all_data = []
    for corpus in dirs:
        print 'Doing %s ... ' % corpus
        # search the corpus for whole sents containing risk word
        pathed = os.path.join(path, corpus)
        results = !tregex.sh -t -w '/(?i)\brisk.?/' $pathed
        # remove blank lines, etc
        results = [result for result in results if re.search(regex, result)]
        # lowercase
        results = [result.lower() for result in results]
        # make into single string
        results = '\n'.join(results)
        # tokenise the string
        results = nltk.word_tokenize(results)
        # a place for info about each corpus
        dta = [corpus]
        # go left and right depth times
        all_words = []
        for i in range(-depth, (depth + 1)):
            newdict = Counter()
            # exclude the 0 iteration, because it will just be risk words
            print 'Depth: %d...' % i
            matching = []
            # go through each token
            for index, token in enumerate(results):
                # if token matches risk expression
                if re.search(regex, token):
                    # get the word at depth index
                    # try statement for cases where the target word index isn't there
                    try:
                        if i < 0:
                            num = index - abs(i)
                            matching.append(results[num])
                        else:
                            matching.append(results[index + i])
                    except:
                        pass
            # tally results
            counted = Counter(matching)
            # remove punctuation etc
            for key in counted:
                if key.isalnum():
                    newdict[key] = counted[key]
            for w in counted.keys():
                all_words.append(w)
            top_tokens = newdict.most_common(top)

            # so, we now have newdict, containg the data

            # add to our intermediate list
            dta.append([i, top_tokens])
        # add intermediate to main
        all_data.append(dta)
    return all_data
	def eugener(path = 'data/nyt/earlylate', depth = 5, top = 10):
	import os
	import nltk
	import re
	from collections import Counter
	import pandas as pd
	# get list of subcorpora
	dirs = [d for d in os.listdir(path) if os.path.isdir(os.path.join(path, d))]
	# define risk word
	regex = r'(?i)\brisk'
	# place for our output
	all_data = []
	for corpus in dirs:
	print 'Doing %s ... ' % corpus
	# search the corpus for whole sents containing risk word
	pathed = os.path.join(path, corpus)
	results = !tregex.sh -t -w '/(?i)\brisk.?/' $pathed
	# remove blank lines, etc
	results = [result for result in results if re.search(regex, result)]
	# lowercase
	results = [result.lower() for result in results]
	# make into single string
	results = '\n'.join(results)
	# tokenise the string
	results = nltk.word_tokenize(results)
	# a place for info about each corpus
	dta = [corpus]
	# go left and right depth times
	all_words = []
	for i in range(-depth, (depth + 1)):
	newdict = Counter()
	# exclude the 0 iteration, because it will just be risk words
	print 'Depth: %d...' % i
	matching = []
	# go through each token
	for index, token in enumerate(results):
	# if token matches risk expression
	if re.search(regex, token):
	# get the word at depth index
	# try statement for cases where the target word index isn't there
	try:
	if i < 0:
	num = index - abs(i)
	matching.append(results[num])
	else:
	matching.append(results[index + i])
	except:
	pass
	# tally results
	counted = Counter(matching)
	# remove punctuation etc
	for key in counted:
	if key.isalnum():
	newdict[key] = counted[key]
	for w in counted.keys():
	all_words.append(w)
	top_tokens = newdict.most_common(top)

	# so, we now have newdict, containg the data

	# add to our intermediate list
	dta.append([i, top_tokens])
	# add intermediate to main
	all_data.append(dta)
	return all_data
No results found