Last active: November 14, 2018 07:04
import numpy as np
import pandas as pd
import nltk
import re, json, os, codecs, mpld3, datetime
from sklearn import feature_extraction
from sklearn.externals import joblib  # in newer scikit-learn use: import joblib


def data_load(year):
    data = list()
    for i in range(0, 8):
        with open('./data/koreaherald_1517_{}.json'.format(i)) as f:
            doc = json.load(f)
            # df = pd.DataFrame.from_dict(doc)
            # print(df)
            keys = doc.keys()
            # keys in the source JSON carry a leading space, e.g. ' body', ' time'
            for article_id in doc[' body']:
                time = datetime.datetime.strptime(doc[' time'][article_id], '%Y-%m-%d %H:%M:%S')
                if year != time.year:
                    continue
                item = dict()  # fresh dict per article, so each appended item is independent
                for key in keys:
                    ckey = key.replace(' ', '')
                    item[ckey] = doc[key][article_id]
                item['article_id'] = article_id
                data.append(item)
    print('[data_load] finish')
    return data


def clustering():
    data = data_load(year=2017)
    titles = [item['title'] for item in data]
    sections = [item['section'] for item in data]
    model_path = './doc_cluster.pkl'
    num_clusters = 10

    # English stopword list (the vectorizer below uses its own built-in stop_words='english')
    stopwords = nltk.corpus.stopwords.words('english')

    from nltk.stem.snowball import SnowballStemmer
    stemmer = SnowballStemmer("english")

    def tokenize_and_stem(text):
        tokens = [word for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
        filtered_tokens = []
        # filter out any tokens not containing letters (e.g., numeric tokens, raw punctuation)
        for token in tokens:
            if re.search('[a-zA-Z]', token):
                filtered_tokens.append(token)
        stems = [stemmer.stem(t) for t in filtered_tokens]
        return stems

    def tokenize_only(text):
        tokens = [word.lower() for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
        filtered_tokens = []
        # filter out any tokens not containing letters (e.g., numeric tokens, raw punctuation)
        for token in tokens:
            if re.search('[a-zA-Z]', token):
                filtered_tokens.append(token)
        return filtered_tokens

    # build a lookup frame that maps each stem back to an original (unstemmed) word
    totalvocab_stemmed = []
    totalvocab_tokenized = []
    for item in data:
        allwords_stemmed = tokenize_and_stem(item['title'])
        totalvocab_stemmed.extend(allwords_stemmed)
        allwords_tokenized = tokenize_only(item['title'])
        totalvocab_tokenized.extend(allwords_tokenized)

    vocab_frame = pd.DataFrame({'words': totalvocab_tokenized}, index=totalvocab_stemmed)
    print('[clustering] there are ' + str(vocab_frame.shape[0]) + ' items in vocab_frame')
    print(vocab_frame.head())

    from sklearn.feature_extraction.text import TfidfVectorizer

    # define vectorizer parameters
    tfidf_vectorizer = TfidfVectorizer(max_df=0.9, max_features=200000,
                                       min_df=0.1, stop_words='english',
                                       use_idf=True, tokenizer=tokenize_and_stem, ngram_range=(1, 3))
    tfidf_matrix = tfidf_vectorizer.fit_transform(titles)  # fit the vectorizer to titles
    print('[clustering] tfidf_matrix.shape : ', tfidf_matrix.shape)
    terms = tfidf_vectorizer.get_feature_names()  # get_feature_names_out() in scikit-learn >= 1.0

    from sklearn.metrics.pairwise import cosine_similarity
    dist = 1 - cosine_similarity(tfidf_matrix)  # pairwise document distances (not used below)

    if os.path.isfile(model_path):
        print('[clustering] load model..')
        km = joblib.load(model_path)
    else:
        print('[clustering] build model..')
        from sklearn.cluster import KMeans
        km = KMeans(init='k-means++', n_clusters=num_clusters, n_init=10)
        km.fit(tfidf_matrix)
        joblib.dump(km, model_path)

    clusters = km.labels_.tolist()
    films = {'title': titles, 'cluster': clusters, 'section': sections}
    frame = pd.DataFrame(films, index=[clusters], columns=['title', 'cluster', 'section'])
    print('[clustering] number of articles per cluster : ', frame['cluster'].value_counts())

    print("[clustering] top terms per cluster:")
    order_centroids = km.cluster_centers_.argsort()[:, ::-1]
    for i in range(num_clusters):
        print("[clustering] cluster %d words:" % i, end='')
        for ind in order_centroids[i, :6]:  # replace 6 with the number of words to show per cluster
            print(' %s' % vocab_frame.loc[terms[ind].split(' ')].values.tolist()[0][0].encode('utf-8', 'ignore'), end=',')
        print('')
        # print("Cluster %d titles:" % i, end='')
        # for title in frame.loc[i]['title'].values.tolist():
        #     print(' %s,' % title, end='')
        # print()  # add whitespace


if __name__ == "__main__":
    # print(data_load(year=2017)[0])
    clustering()
Sample output (top terms for the first three clusters):

[clustering] top terms per cluster:
[clustering] cluster 0 words: b'opposition', b'opposition', b'opposition', b'rejects', b'rejects', b'request',
[clustering] cluster 1 words: b'samsung', b'ranks', b"'s", b"'s", b'11th', b'arrest',
[clustering] cluster 2 words: b'military', b'condemns', b'korea', b'korea', b'land', b'military',