import os
import json
import shutil
from subprocess import call
import cld


def read_json(file_path):
    json_data = open(file_path)
    data = json.load(json_data)
    json_data.close()
    return data


def get_desc(file_path):
    data = read_json(file_path)
    description = data['extendedInfo']['description']
    return description.encode('ascii', errors='ignore')  # FIXME: workaround for unicode errors


def get_desc_from_folder(folder_path, desc_count=1000):
    name_desc_pairs = {}
    count = desc_count
    for root, dirs, files in os.walk(folder_path):
        for file in files:
            if file.endswith(".json"):
                if len(name_desc_pairs) < count:
                    # FIXME: unicode error \xc3\xa2\xc2\x99\xc2\xa3
                    desc = get_desc(os.path.join(root, file))
                    desc_utf8 = desc.encode('utf-8')
                    if len(desc) > 1000:
                        # keep only long descriptions detected as English
                        # with a single candidate language
                        lang = cld.detect(desc_utf8)
                        if lang[1] == 'en' and len(lang[4]) == 1:
                            name_desc_pairs[file] = desc
    return name_desc_pairs
folder_path = "data_google_play" | |
if not os.path.exists(folder_path): | |
call(["git", "clone", "https://github.com/sangheestyle/data_google_play.git"]) | |
name_desc_pairs = get_desc_from_folder(folder_path) | |
documents = name_desc_pairs.values() | |
from nltk.tokenize import RegexpTokenizer | |
from nltk.corpus import stopwords | |
from nltk.stem.lancaster import LancasterStemmer | |
from nltk.cluster import KMeansClusterer, euclidean_distance | |
from gensim import corpora, models, utils | |
from numpy import array | |
# Step 1:
# override LdaModel for changing output format
# from [(topicid, topicvalue)] to [topicvalue] due to format of KMeansClusterer
class MyLdaModel(models.ldamodel.LdaModel):
    def __getitem__(self, bow, eps=0.01):
        is_corpus, corpus = utils.is_corpus(bow)
        if is_corpus:
            return self._apply(corpus)
        gamma, _ = self.inference([bow])
        topic_dist = gamma[0] / sum(gamma[0])  # normalize to proper distribution
        return [topicvalue for topicid, topicvalue in enumerate(topic_dist)]
        # FIXME: if topicvalue >= eps
# Step 2: removing whitespace, punctuation, and stopwords, and stemming words
tokenizer = RegexpTokenizer(r'\w+')
stop = stopwords.words('english')
lanste = LancasterStemmer()

processed = []
for document in documents:
    intermediate = tokenizer.tokenize(document)
    intermediate = [i for i in intermediate if i not in stop]
    # FIXME: try other stemmers as well to compare the quality of the stemmed text
    intermediate = [lanste.stem(i) for i in intermediate]
    processed.append(intermediate)
# Step 3
# making dictionary and corpus
dictionary = corpora.Dictionary(processed)
#dictionary.save('/tmp/dict.dict')
corpus = [dictionary.doc2bow(description) for description in processed]
#corpora.MmCorpus.serialize('/tmp/temp.mm', corpus)

# Step 4: LDA
num_topics = 5
model_lda = MyLdaModel(corpus, id2word=dictionary, num_topics=num_topics)
doc_lda = model_lda[corpus]
'''
for doc in doc_lda:
    print doc
'''
# Step 5: k-means clustering
vectors = [array(f) for f in doc_lda]
clusterer = KMeansClusterer(num_topics, euclidean_distance, repeats=100,
                            avoid_empty_clusters=True)
clusterer.cluster(vectors, True)  # learn the cluster means

apps_per_topic = []
for x in range(num_topics):
    apps_per_topic.append([])

# assign each document's topic vector to a cluster
# NOTE: relies on keys() and values() of name_desc_pairs being in the same
# order, which holds for an unmodified dict in Python 2
apk_names = name_desc_pairs.keys()
for i, doc in enumerate(doc_lda):
    topic_id = clusterer.classify(array(doc))
    apps_per_topic[topic_id].append(apk_names[i])
# Step 6: make text for each topic
text_for_topics = []
for x in range(num_topics):
    text_for_topics.append('')

apkname_stem_pairs = dict(zip(name_desc_pairs.keys(), processed))
for topic_id, names in enumerate(apps_per_topic):
    for name in names:
        # FIXME: there are two options for the word cloud:
        # 1) pure descriptions  2) the stemmed text
        # text_for_topics[topic_id] = text_for_topics[topic_id] + " " + name_desc_pairs[name]
        text = " ".join(apkname_stem_pairs[name])
        text_for_topics[topic_id] = text_for_topics[topic_id] + text

output_path = "out"
if os.path.exists(output_path):
    shutil.rmtree(output_path)
os.mkdir(output_path)

for topic_id, text_for_topic in enumerate(text_for_topics):
    file_name = "topic-" + str(topic_id) + ".txt"
    text_file = open(os.path.join(output_path, file_name), "w")
    text_file.write(text_for_topic)
    text_file.close()

# Step 7: word cloud - TBD
# FIXME: need to implement, or just use http://www.wordle.net temporarily
Result: topic distributions for 10 sentences across 3 topics (3 × 10):
[(0, 0.043487224655015692), (1, 0.91334320179268014), (2, 0.043169573552304011)]
[(0, 0.038937039435168998), (1, 0.041896434430366933), (2, 0.91916652613446403)]
[(0, 0.053246879732978467), (1, 0.050838987090679848), (2, 0.89591413317634161)]
[(0, 0.90083829551970018), (1, 0.048699129697301391), (2, 0.050462574782998426)]
[(0, 0.042326027355327193), (1, 0.91310314616436017), (2, 0.044570826480312589)]
[(0, 0.048260934238125504), (1, 0.048299573968107745), (2, 0.90343949179376681)]
[(0, 0.056756398955460384), (1, 0.057671474088390323), (2, 0.88557212695614929)]
[(0, 0.038675284258316643), (1, 0.91980941304997033), (2, 0.041515302691713184)]
[(0, 0.06859140004108788), (1, 0.072816298081204517), (2, 0.85859230187770763)]
[(0, 0.042202610067397242), (1, 0.042203849123989244), (2, 0.91559354080861355)]
The next step is applying k-means clustering and visualization.
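As a bridge between the two result listings, here is a minimal sketch (my own, not part of the gist) of how the sparse [(topicid, topicvalue)] rows shown above can be turned into dense vectors and clustered with nltk's KMeansClusterer; the sample rows are rounded values taken from the listing above, and in the gist itself MyLdaModel already produces dense rows.

# Hedged sketch: densify the sparse LDA output and run k-means on it.
from numpy import zeros
from nltk.cluster import KMeansClusterer, euclidean_distance

num_topics = 3

def to_dense(sparse_doc, num_topics):
    # put each topic's probability at its index; topics not listed stay 0.0
    vec = zeros(num_topics)
    for topic_id, prob in sparse_doc:
        vec[topic_id] = prob
    return vec

# first four sentences from the listing above, rounded
doc_lda = [[(0, 0.0435), (1, 0.9133), (2, 0.0432)],
           [(0, 0.0389), (1, 0.0419), (2, 0.9192)],
           [(0, 0.0532), (1, 0.0508), (2, 0.8959)],
           [(0, 0.9008), (1, 0.0487), (2, 0.0505)]]
vectors = [to_dense(doc, num_topics) for doc in doc_lda]
clusterer = KMeansClusterer(num_topics, euclidean_distance,
                            repeats=100, avoid_empty_clusters=True)
assignments = clusterer.cluster(vectors, True)  # one cluster id per sentence
for cluster_id in assignments:
    print cluster_id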
Result: the cluster ID for each sentence. For example, the 2nd and 5th sentences fall into the same cluster.
1
0
2
1
0
2
1
1
1
2
The next step is applying visualization.
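One possible visualization, sketched here as an assumption rather than taken from the gist: scatter the documents by their first two topic proportions and colour them by cluster, reusing the vectors and assignments names from the k-means sketch above.

# Hypothetical visualization sketch: 2-D scatter of topic proportions,
# coloured by the k-means cluster id assigned above.
import matplotlib.pyplot as plt

xs = [v[0] for v in vectors]  # proportion of topic 0
ys = [v[1] for v in vectors]  # proportion of topic 1
plt.scatter(xs, ys, c=assignments)
plt.xlabel("topic 0 proportion")
plt.ylabel("topic 1 proportion")
plt.title("Documents coloured by cluster")
plt.savefig("clusters.png")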
References so far:
http://www.nltk.org (NLTK)
https://github.com/nltk/nltk (NLTK)
http://radimrehurek.com/gensim (gensim)
https://github.com/piskvorky/gensim (gensim)
http://www.wikipedia.org (terms)
Natural Language Processing with Python (book for NLTK)
Data Mining: Practical Machine Learning Tools and Techniques (book for NLP and Weka)
Review of revision 6:
It took a long time to fix the "unicode" error with a workaround. Also, I need to check lines 55 and 56 to understand what eps is.
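For reference, my understanding (an assumption, not verified against the gist) is that eps in the stock gensim LdaModel.__getitem__ is a threshold that drops topics whose probability is negligible, which is also what the commented-out FIXME in MyLdaModel.__getitem__ would restore. Roughly:

def filter_topics(topic_dist, eps=0.01):
    # keep only topics whose probability reaches eps (assumed behaviour)
    return [(topic_id, prob) for topic_id, prob in enumerate(topic_dist)
            if prob >= eps]

print filter_topics([0.043, 0.913, 0.043], eps=0.05)  # -> [(1, 0.913)]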
For generating word clouds, I will use pytagcloud.
https://pypi.python.org/pypi/pytagcloud
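A minimal sketch of how pytagcloud could consume the per-topic text files written in Step 6, based on the usage example on its project page; the file name, font name, and sizes below are assumptions.

# Hedged sketch: render a word cloud for one topic file with pytagcloud.
from pytagcloud import create_tag_image, make_tags
from pytagcloud.lang.counter import get_tag_counts

with open("out/topic-0.txt") as f:
    text = f.read()

tags = make_tags(get_tag_counts(text), maxsize=80)
create_tag_image(tags, "topic-0-cloud.png", size=(900, 600), fontname="Lobster")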
Uploaded 1,365 JSON files to GitHub, each containing one APK's information from Google Play. Each file includes more than 1,000 characters of description.
Revision 8:
- remove some package imports
Revision 9:
- download sample JSON files from GitHub for testing (about 1,000 JSON files)
- generate text files containing every description for the same topic (e.g. out/topic-0.txt)
How can I apply this to an Excel file? Please help, I'm blocked.
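Not part of the gist, but one way it could be fed from an Excel file instead of the JSON folder is sketched below, assuming a column named description and pandas with xlrd installed; the file and column names are hypothetical.

# Hypothetical sketch: build `documents` from an Excel column, then reuse
# the pipeline from Step 2 onwards unchanged.
import pandas as pd

df = pd.read_excel("apps.xlsx")  # file name and column name are assumptions
documents = [unicode(d).encode('ascii', errors='ignore')
             for d in df['description'].dropna()]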
Here is the 2nd version of LDA with gensim.