Daniel interrogator

Python, linguistics, data science, reproducible research

interrogator / gist:cf42d9e3faf44a2be55b

Created May 19, 2015 13:20

eugener-code-and-output

	#eugene script

	def eugener(path = 'data/nyt/earlylate',
	regex = r'(?i)\brisk',
	depth = 5,
	top = 10,
	remove_stopwords = False):
	"""
	get most frequent words in corpus path to left and right

interrogator / gist:9feebf3bb571db23265e

Last active August 29, 2015 14:21

eugener-code

	def eugener(path = 'data/nyt/earlylate', depth = 5, top = 10):
	import os
	import nltk
	import re
	from collections import Counter
	import pandas as pd
	# get list of subcorpora
	dirs = [d for d in os.listdir(path) if os.path.isdir(os.path.join(path, d))]
	# define risk word
	regex = r'(?i)\brisk'

interrogator / gist:2466b3f1ee2c94302a24

Created May 19, 2015 11:25

risk data

	1995
	[-5, [('the', 351), ('to', 226), ('of', 159), ('and', 157), ('a', 157), ('that', 156), ('in', 93), ('is', 80), ('for', 72), ('not', 68)]]
	[-4, [('the', 333), ('to', 199), ('a', 164), ('that', 161), ('of', 140), ('and', 133), ('it', 90), ('are', 89), ('is', 83), ('in', 79)]]
	[-3, [('to', 419), ('the', 263), ('a', 164), ('and', 150), ('that', 135), ('of', 124), ('are', 116), ('is', 113), ('there', 108), ('it', 98)]]
	[-2, [('the', 396), ('a', 292), ('of', 232), ('to', 204), ('is', 180), ('and', 180), ('at', 178), ('take', 128), ('that', 126), ('are', 97)]]
	[-1, [('the', 1733), ('at', 549), ('a', 432), ('and', 212), ('to', 191), ('of', 188), ('high', 127), ('health', 91), ('increased', 84), ('their', 77)]]
	[0, [('risk', 4749), ('risks', 1611), ('risky', 684), ('risking', 157), ('risked', 141), ('riskier', 82), ('riskiest', 32), ('riske', 31), ('riskin', 13), ('riskiness', 12)]]
	[1, [('of', 1591), ('to', 297), ('and', 240), ('for', 218), ('that', 196), ('in', 160), ('is', 116), ('the', 114), ('factors', 110),

interrogator / gist:1466b785567b1affd9e2

Last active August 29, 2015 14:21

	!sudo yum -y install java
	!git clone https://www.github.com/interrogator/risk

	import corpkit
	from corpkit import interrogator, plotter, quickview
	import pandas as pd
	corpus = 'data/nyt/years'
	#immediate sister to left of risk word
	query = r'__ $. /(?i).?\brisk.?/'
	# interrogate, output words only

interrogator / gist:2b3f37cc14712c5964c5

Created May 18, 2015 03:31

treebank SFL conversion

	def parse_sfl(n = 3):
	from bs4 import BeautifulSoup
	import os
	from collections import defaultdict

	# path to xml files
	xmlpath = 'XML'

	# list of sfl categories
	sfl_list = [

interrogator / gist:a9757ae1b4dcaa3bc84a

Last active August 29, 2015 14:18

extract()

interrogator / gist:89eb901f28923ec847fd

Last active February 11, 2024 14:46

visualising a parse tree

	def quicktree(sentence):
	"""Parse a sentence and return a visual representation"""
	from nltk import Tree
	from nltk.draw.util import CanvasFrame
	from nltk.draw import TreeWidget
	from stat_parser import Parser
	from IPython.display import display
	from IPython.display import Image
	parser = Parser()
	parsed = parser.parse(sentence)