This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import nltk, timeit | |
| from nltk.stem.wordnet import WordNetLemmatizer | |
| from nltk.corpus import wordnet | |
| from joblib import Parallel, delayed | |
# Module-level lemmatizer instance, created once at import time.
lmtzr = WordNetLemmatizer()

# First character of a treebank POS tag -> WordNet POS constant.
# Only adjectives, verbs and adverbs are listed here.
tag_to_type = {
    'J': wordnet.ADJ,
    'V': wordnet.VERB,
    'R': wordnet.ADV,
}
def get_wordnet_pos(treebank_tag):
    """Return the WordNet POS constant for a treebank-style POS tag.

    Only the first character of ``treebank_tag`` is looked up in
    ``tag_to_type``. Any tag without an entry there (including an empty
    string) maps to ``wordnet.NOUN``.
    """
    fallback = wordnet.NOUN
    leading_char = treebank_tag[:1]
    return tag_to_type.get(leading_char, fallback)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| # Run models to find natural number of topics | |
| kl = [] | |
| l = np.array([sum(cnt for _, cnt in doc) for doc in my_corpus]) | |
| num = range(1,150,1) | |
| for i in num: | |
| lda = models.ldamodel.LdaModel(corpus=my_corpus, | |
| id2word=dictionary,num_topics=i) | |
| #Topic-word matrix | |
| m1 = lda.expElogbeta | |
| U,cm1,V = np.linalg.svd(m1) |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/bin/bash | |
# Write the first comma-separated field of lines 3, 5, 7, ... of
# abstracts.txt (line 1 is skipped, then every second remaining line is
# kept) to abstractids.txt, using a single awk process.
awk -F ',' 'NR > 1 && NR % 2 == 1 { print $1 }' abstracts.txt > abstractids.txt
| awk 'NR>1{print;}' abstracts.txt | awk 'NR%2==0{print;}' | sed 's/ and / /g' | sed 's/ the / /g' | sed 's/ who / /g' | sed 's/ of / /g' | sed 's/ then / /g' | sed 's/ where / /g' | sed 's/<IMAGE>/ /g' | sed 's/ that / /g' | sed 's/[^a-zA-Z ]//g' | sed -e 's/ [a-zA-Z]\{1,2\}\b//g' | sed 's/ / /g' | sed 's/A / /g' | sed 's/The / /g' | sed 's/ The / /g' | sed 's/ Thus / /g' | sed 's/ are / /g' | sed 's/ one / /g' | sed 's/ two / /g' | sed 's/ three / /g' | sed 's/ four / /g' | sed 's/ five / /g' | sed 's/ six / /g' | sed 's/ seven / /g' | sed 's/ eight / /g' | sed 's/ nine / /g' | sed 's/ per / /g' | sed 's/ iii / /g' | sed 's/ III / /g' | sed 's/ may / /g' | sed -n '/[^[:upper:] ]/p' | tr '[:upper:]' '[:lower:]' | sed 's/ have / /g' | sed 's/ from / /g' | sed 's/ for / /g' | sed 's/ not / /g' | sed 's/ will / /g' | sed 's/ with / /g' | sed 's/ such / /g' | sed 's/ between / /g' | sed 's/ from / /g' |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import re | |
# Read abstractids.txt and collect one cleaned id string per line in ``ids``.
# Everything except digits and spaces is removed. The character class must
# start at 0: a class of [^1-9 ] would also delete every '0' and silently
# corrupt any id that contains a zero.
pattern = re.compile(r'[^0-9 ]')
ids = []
with open('abstractids.txt', 'r') as f:
    for line in f:
        # The trailing newline is neither a digit nor a space, so it is
        # stripped by the same substitution.
        line = pattern.sub('', line)
        ids.append(line)
| with open('abstractsfinal.txt','r') as f: | |
| i = 0 |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/bin/bash | |
# Run Mr.LDA variational inference once per topic count in the range.
#
# Each run executes in the foreground, so only one Hadoop job is active at a
# time; a trailing '&' on the command would start every iteration
# simultaneously. To keep the whole sweep running after logout, launch this
# script itself with nohup/&.
#
# Every run gets its own output path and log file, so a later topic count
# does not overwrite the log or collide with the output of an earlier one.
for i in {1..250000}
do
    nohup hadoop jar ./Mr.LDA/target/mrlda-0.9.0-SNAPSHOT-fatjar.jar \
        cc.mrlda.VariationalInference \
        -input abstracts/document -output "abstracts-lda-$i" \
        -term 100000 -topic "$i" -mapper 500 -reducer 276 > "lda-$i.log" 2>&1
done
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| from urllib2 import urlopen | |
| from json import load | |
| import re, nltk | |
| from nltk.stem.wordnet import WordNetLemmatizer | |
| from nltk.corpus import wordnet, stopwords | |
| import logging | |
| logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', | |
| level=logging.INFO) | |
| from gensim import corpora, models, similarities, matutils | |
| import numpy as np |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import urllib | |
| import csv | |
| import StringIO | |
| import pandas as pd | |
| import zipfile, os.path | |
| url = 'http://www.bls.gov/cex/pumd/data/comma/intrvw0' | |
| for i in range(3,9): | |
| full_url = url + str(i) + '.zip' |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import urllib | |
| import csv | |
| import StringIO | |
| import pandas as pd | |
| import os | |
| import zipfile, os.path | |
| url = 'http://www.bls.gov/cex/pumd/data/comma/intrvw' | |
| for i in range(3,9): |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| { | |
| //REQUIRED: | |
| "bibtex_file_path": "~/Dropbox/Papers/main.bib", | |
| //OPTIONAL: | |
| //By default Citer Search looks for your keyword in the | |
| //author, title, year, and Citekey (id) fields | |
| "search_fields": ["author", "title", "year", "id"] , |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| { | |
| //REQUIRED: | |
| "bibtex_file_path": "~/Dropbox/Papers/main.bib", | |
| //OPTIONAL: | |
| //By default Citer Search looks for your keyword in the | |
| //author, title, year, and Citekey (id) fields | |
| "search_fields": ["author", "title", "year", "id"] , |