Skip to content

Instantly share code, notes, and snippets.

import nltk, timeit
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import wordnet
from joblib import Parallel, delayed
# Shared lemmatizer instance (WordNet-based).
lmtzr = WordNetLemmatizer()

# Map the first letter of a Penn Treebank POS tag to a WordNet POS constant.
tag_to_type = {'J': wordnet.ADJ, 'V': wordnet.VERB, 'R': wordnet.ADV}


def get_wordnet_pos(treebank_tag):
    """Return the WordNet POS constant for a Penn Treebank tag.

    Only the tag's first letter matters ('J' -> adjective, 'V' -> verb,
    'R' -> adverb); anything else -- including an empty string, since
    ''[:1] == '' -- falls back to wordnet.NOUN.
    """
    # FIX(review): the pasted source had lost the function-body indent,
    # making this a SyntaxError; structure restored, behavior unchanged.
    return tag_to_type.get(treebank_tag[:1], wordnet.NOUN)
# Run LDA models over a range of topic counts to find a "natural" number
# of topics (the gist title cites the Arun et al. measure, which compares
# the singular values of the topic-word matrix against the document-topic
# mixture; only the first half of that computation is visible here).
# NOTE(review): `models` (gensim), `my_corpus`, and `dictionary` are
# defined elsewhere in the original gist -- this fragment is not runnable
# on its own.
kl = []
# Total token count per document (sum of bag-of-words counts).
l = np.array([sum(cnt for _, cnt in doc) for doc in my_corpus])
num = range(1, 150, 1)
for i in num:
    # FIX(review): loop-body indentation was lost in the paste; restored.
    lda = models.ldamodel.LdaModel(corpus=my_corpus,
                                   id2word=dictionary, num_topics=i)
    # Topic-word matrix
    m1 = lda.expElogbeta
    # Singular values of the topic-word matrix (cm1).
    U, cm1, V = np.linalg.svd(m1)
    # NOTE(review): the fragment is truncated here -- the original
    # presumably also computed the document-topic side (cm2) and appended
    # the symmetric KL divergence to `kl`.
#!/bin/bash
# Build the list of abstract IDs: skip the header line of abstracts.txt,
# keep every even-numbered remaining line, and take the first
# comma-separated field as the ID.
awk 'NR>1{print;}' abstracts.txt | awk 'NR%2==0{print;}' | awk -F ',' '{print $1}' > abstractids.txt
# Clean the abstract text on the same even lines: remove a hand-picked
# stopword list, <IMAGE> placeholders, all non-alphabetic characters, and
# 1-2 letter words; drop lines consisting only of uppercase letters and
# spaces (`sed -n '/[^[:upper:] ]/p'` prints only lines containing
# something else); then lowercase and strip a few more stopwords.
# NOTE(review): `sed 's/ from / /g'` appears twice, and `sed 's/ / /g'`
# replaces a space with a space (a no-op -- possibly a whitespace-squeeze
# command whose double space was collapsed when the gist was scraped);
# confirm against the original before relying on this pipeline.
awk 'NR>1{print;}' abstracts.txt | awk 'NR%2==0{print;}' | sed 's/ and / /g' | sed 's/ the / /g' | sed 's/ who / /g' | sed 's/ of / /g' | sed 's/ then / /g' | sed 's/ where / /g' | sed 's/<IMAGE>/ /g' | sed 's/ that / /g' | sed 's/[^a-zA-Z ]//g' | sed -e 's/ [a-zA-Z]\{1,2\}\b//g' | sed 's/ / /g' | sed 's/A / /g' | sed 's/The / /g' | sed 's/ The / /g' | sed 's/ Thus / /g' | sed 's/ are / /g' | sed 's/ one / /g' | sed 's/ two / /g' | sed 's/ three / /g' | sed 's/ four / /g' | sed 's/ five / /g' | sed 's/ six / /g' | sed 's/ seven / /g' | sed 's/ eight / /g' | sed 's/ nine / /g' | sed 's/ per / /g' | sed 's/ iii / /g' | sed 's/ III / /g' | sed 's/ may / /g' | sed -n '/[^[:upper:] ]/p' | tr '[:upper:]' '[:lower:]' | sed 's/ have / /g' | sed 's/ from / /g' | sed 's/ for / /g' | sed 's/ not / /g' | sed 's/ will / /g' | sed 's/ with / /g' | sed 's/ such / /g' | sed 's/ between / /g' | sed 's/ from / /g'
# Read abstractids.txt and keep only the digits (and spaces) of each line,
# collecting the cleaned IDs into `ids`.
import re

# BUG FIX: the original character class was [^1-9 ], which also deleted
# the digit '0' from every ID (e.g. "104" became "14"). [^0-9 ] keeps all
# ten digits while still stripping punctuation and the trailing newline.
pattern = re.compile(r'[^0-9 ]')
ids = []
with open('abstractids.txt', 'r') as f:
    for line in f:
        # sub() removes the '\n' too, so entries are bare digit strings.
        line = pattern.sub('', line)
        ids.append(line)
with open('abstractsfinal.txt','r') as f:
i = 0
#!/bin/bash
# Launch one Mr.LDA variational-inference Hadoop job per topic count,
# reading from abstracts/document and writing to abstracts-lda, with a
# 100k-term vocabulary, 500 mappers, and 276 reducers.
# NOTE(review): {1..250000} backgrounds up to 250,000 concurrent Hadoop
# jobs with no throttling, and every job appends to the same lda.log --
# the range was probably meant to be far smaller (or the trailing '&'
# omitted so jobs run sequentially); confirm before running.
for i in {1..250000}
do
nohup hadoop jar ./Mr.LDA/target/mrlda-0.9.0-SNAPSHOT-fatjar.jar \
cc.mrlda.VariationalInference \
-input abstracts/document -output abstracts-lda \
-term 100000 -topic $i -mapper 500 -reducer 276 >& lda.log &
done
@cigrainger
cigrainger / gist:62910e58db46b7397de2
Created July 11, 2014 18:28
Arun et al measure with NPR data
from urllib2 import urlopen
from json import load
import re, nltk
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import wordnet, stopwords
import logging
# Gensim reports training progress through the logging module; enable
# INFO-level output with timestamps so long LDA runs can be monitored.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
level=logging.INFO)
from gensim import corpora, models, similarities, matutils
import numpy as np
import urllib
import csv
import StringIO
import pandas as pd
import zipfile, os.path
# Build download URLs for the BLS Consumer Expenditure interview archives
# intrvw03.zip .. intrvw08.zip (range(3, 9) yields 3 through 8).
# NOTE(review): the fragment is truncated -- the original presumably
# fetched and unpacked each full_url inside the loop (the surrounding
# imports include urllib and zipfile). Indentation restored from the
# paste, which had lost it.
url = 'http://www.bls.gov/cex/pumd/data/comma/intrvw0'
for i in range(3, 9):
    full_url = url + str(i) + '.zip'
import urllib
import csv
import StringIO
import pandas as pd
import os
import zipfile, os.path
url = 'http://www.bls.gov/cex/pumd/data/comma/intrvw'
for i in range(3,9):
{
//REQUIRED:
"bibtex_file_path": "~/Dropbox/Papers/main.bib",
//OPTIONAL:
//By default Citer Search looks for your keyword in the
//author, title, year, and Citekey (id) fields
"search_fields": ["author", "title", "year", "id"] ,
{
//REQUIRED:
"bibtex_file_path": "~/Dropbox/Papers/main.bib",
//OPTIONAL:
//By default Citer Search looks for your keyword in the
//author, title, year, and Citekey (id) fields
"search_fields": ["author", "title", "year", "id"] ,