Skip to content

Instantly share code, notes, and snippets.

import nltk, timeit
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import wordnet
from joblib import Parallel, delayed
# Shared lemmatizer instance (WordNet-based).
lmtzr = WordNetLemmatizer()

# Map the first letter of a Penn Treebank POS tag to a WordNet POS constant.
tag_to_type = {'J': wordnet.ADJ, 'V': wordnet.VERB, 'R': wordnet.ADV}


def get_wordnet_pos(treebank_tag):
    """Return the WordNet POS constant for a Penn Treebank tag.

    Only the tag's first letter matters ('J' -> adjective, 'V' -> verb,
    'R' -> adverb); anything else -- including an empty string, since
    ''[:1] == '' -- falls back to wordnet.NOUN.
    """
    # FIX(review): the pasted source had lost the function-body indent,
    # making this a SyntaxError; structure restored, behavior unchanged.
    return tag_to_type.get(treebank_tag[:1], wordnet.NOUN)
# Run LDA models over a range of topic counts to find a "natural" number
# of topics (the gist title cites the Arun et al. measure, which compares
# the singular values of the topic-word matrix against the document-topic
# mixture; only the first half of that computation is visible here).
# NOTE(review): `models` (gensim), `my_corpus`, and `dictionary` are
# defined elsewhere in the original gist -- this fragment is not runnable
# on its own.
kl = []
# Total token count per document (sum of bag-of-words counts).
l = np.array([sum(cnt for _, cnt in doc) for doc in my_corpus])
num = range(1, 150, 1)
for i in num:
    # FIX(review): loop-body indentation was lost in the paste; restored.
    lda = models.ldamodel.LdaModel(corpus=my_corpus,
                                   id2word=dictionary, num_topics=i)
    # Topic-word matrix
    m1 = lda.expElogbeta
    # Singular values of the topic-word matrix (cm1).
    U, cm1, V = np.linalg.svd(m1)
    # NOTE(review): the fragment is truncated here -- the original
    # presumably also computed the document-topic side (cm2) and appended
    # the symmetric KL divergence to `kl`.
#!/bin/bash
# Build the list of abstract IDs: skip the header line of abstracts.txt,
# keep every even-numbered remaining line, and take the first
# comma-separated field as the ID.
awk 'NR>1{print;}' abstracts.txt | awk 'NR%2==0{print;}' | awk -F ',' '{print $1}' > abstractids.txt
# Clean the abstract text on the same even lines: remove a hand-picked
# stopword list, <IMAGE> placeholders, all non-alphabetic characters, and
# 1-2 letter words; drop lines consisting only of uppercase letters and
# spaces (`sed -n '/[^[:upper:] ]/p'` prints only lines containing
# something else); then lowercase and strip a few more stopwords.
# NOTE(review): `sed 's/ from / /g'` appears twice, and `sed 's/ / /g'`
# replaces a space with a space (a no-op -- possibly a whitespace-squeeze
# command whose double space was collapsed when the gist was scraped);
# confirm against the original before relying on this pipeline.
awk 'NR>1{print;}' abstracts.txt | awk 'NR%2==0{print;}' | sed 's/ and / /g' | sed 's/ the / /g' | sed 's/ who / /g' | sed 's/ of / /g' | sed 's/ then / /g' | sed 's/ where / /g' | sed 's/<IMAGE>/ /g' | sed 's/ that / /g' | sed 's/[^a-zA-Z ]//g' | sed -e 's/ [a-zA-Z]\{1,2\}\b//g' | sed 's/ / /g' | sed 's/A / /g' | sed 's/The / /g' | sed 's/ The / /g' | sed 's/ Thus / /g' | sed 's/ are / /g' | sed 's/ one / /g' | sed 's/ two / /g' | sed 's/ three / /g' | sed 's/ four / /g' | sed 's/ five / /g' | sed 's/ six / /g' | sed 's/ seven / /g' | sed 's/ eight / /g' | sed 's/ nine / /g' | sed 's/ per / /g' | sed 's/ iii / /g' | sed 's/ III / /g' | sed 's/ may / /g' | sed -n '/[^[:upper:] ]/p' | tr '[:upper:]' '[:lower:]' | sed 's/ have / /g' | sed 's/ from / /g' | sed 's/ for / /g' | sed 's/ not / /g' | sed 's/ will / /g' | sed 's/ with / /g' | sed 's/ such / /g' | sed 's/ between / /g' | sed 's/ from / /g'
# Read abstractids.txt and keep only the digits (and spaces) of each line,
# collecting the cleaned IDs into `ids`.
import re

# BUG FIX: the original character class was [^1-9 ], which also deleted
# the digit '0' from every ID (e.g. "104" became "14"). [^0-9 ] keeps all
# ten digits while still stripping punctuation and the trailing newline.
pattern = re.compile(r'[^0-9 ]')
ids = []
with open('abstractids.txt', 'r') as f:
    for line in f:
        # sub() removes the '\n' too, so entries are bare digit strings.
        line = pattern.sub('', line)
        ids.append(line)
with open('abstractsfinal.txt','r') as f:
i = 0
#!/bin/bash
# Launch one Mr.LDA variational-inference Hadoop job per topic count,
# reading from abstracts/document and writing to abstracts-lda, with a
# 100k-term vocabulary, 500 mappers, and 276 reducers.
# NOTE(review): {1..250000} backgrounds up to 250,000 concurrent Hadoop
# jobs with no throttling, and every job appends to the same lda.log --
# the range was probably meant to be far smaller (or the trailing '&'
# omitted so jobs run sequentially); confirm before running.
for i in {1..250000}
do
nohup hadoop jar ./Mr.LDA/target/mrlda-0.9.0-SNAPSHOT-fatjar.jar \
cc.mrlda.VariationalInference \
-input abstracts/document -output abstracts-lda \
-term 100000 -topic $i -mapper 500 -reducer 276 >& lda.log &
done
@cigrainger
cigrainger / gist:62910e58db46b7397de2
Created July 11, 2014 18:28
Arun et al measure with NPR data
from urllib2 import urlopen
from json import load
import re, nltk
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import wordnet, stopwords
import logging
# Gensim reports training progress through the logging module; enable
# INFO-level output with timestamps so long LDA runs can be monitored.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
level=logging.INFO)
from gensim import corpora, models, similarities, matutils
import numpy as np
import urllib
import csv
import StringIO
import pandas as pd
import zipfile, os.path
# Build download URLs for the BLS Consumer Expenditure interview archives
# intrvw03.zip .. intrvw08.zip (range(3, 9) yields 3 through 8).
# NOTE(review): the fragment is truncated -- the original presumably
# fetched and unpacked each full_url inside the loop (the surrounding
# imports include urllib and zipfile). Indentation restored from the
# paste, which had lost it.
url = 'http://www.bls.gov/cex/pumd/data/comma/intrvw0'
for i in range(3, 9):
    full_url = url + str(i) + '.zip'
import urllib
import csv
import StringIO
import pandas as pd
import os
import zipfile, os.path
url = 'http://www.bls.gov/cex/pumd/data/comma/intrvw'
for i in range(3,9):
{
//REQUIRED:
"bibtex_file_path": "~/Dropbox/Papers/main.bib",
//OPTIONAL:
//By default Citer Search looks for your keyword in the
//author, title, year, and Citekey (id) fields
"search_fields": ["author", "title", "year", "id"] ,
{
//REQUIRED:
"bibtex_file_path": "~/Dropbox/Papers/main.bib",
//OPTIONAL:
//By default Citer Search looks for your keyword in the
//author, title, year, and Citekey (id) fields
"search_fields": ["author", "title", "year", "id"] ,