import datetime
import json
import os
import re

import joblib  # sklearn.externals.joblib is deprecated; use the standalone package
import nltk
import pandas as pd

# NLTK's sent_tokenize/word_tokenize need the 'punkt' models:
# run nltk.download('punkt') once before using this script.


def data_load(year):
    data = list()
    for i in range(0, 8):
        with open('./data/koreaherald_1517_{}.json'.format(i)) as f:
            doc = json.load(f)
        keys = doc.keys()
        for article_id in doc[' body']:
            time = datetime.datetime.strptime(doc[' time'][article_id], '%Y-%m-%d %H:%M:%S')
            if year != time.year:
                continue
            # create a fresh dict per article; reusing a single dict would make
            # every entry in `data` reference the same (last) article
            item = dict()
            for key in keys:
                ckey = key.replace(' ', '')  # the JSON field names carry a leading space
                item[ckey] = doc[key][article_id]
            item['article_id'] = article_id
            data.append(item)
    print('[data_load] finish')
    return data
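
# The loader assumes each koreaherald_1517_{i}.json file is column-oriented,
# i.e. a dict of field name -> {article_id -> value}, with field names carrying
# a leading space. A minimal sketch of that assumed layout (illustrative only):
#
#   {
#       " title":   {"0": "Opposition rejects request ...", ...},
#       " body":    {"0": "full article text ...", ...},
#       " time":    {"0": "2017-01-02 09:30:00", ...},
#       " section": {"0": "National", ...}
#   }
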
def clustering():
    data = data_load(year=2017)
    titles = [item['title'] for item in data]
    sections = [item['section'] for item in data]
    model_path = './doc_cluster.pkl'
    num_clusters = 10

    from nltk.stem.snowball import SnowballStemmer
    stemmer = SnowballStemmer("english")

    def tokenize_and_stem(text):
        tokens = [word for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
        filtered_tokens = []
        # filter out any tokens not containing letters (e.g., numeric tokens, raw punctuation)
        for token in tokens:
            if re.search('[a-zA-Z]', token):
                filtered_tokens.append(token)
        stems = [stemmer.stem(t) for t in filtered_tokens]
        return stems
    def tokenize_only(text):
        tokens = [word.lower() for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
        filtered_tokens = []
        # filter out any tokens not containing letters (e.g., numeric tokens, raw punctuation)
        for token in tokens:
            if re.search('[a-zA-Z]', token):
                filtered_tokens.append(token)
        return filtered_tokens
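
    # Illustrative behaviour of the two tokenizers (approximate; exact output
    # depends on the NLTK 'punkt' models installed):
    #
    #   tokenize_and_stem("Opposition rejects request")  ->  ['opposit', 'reject', 'request']
    #   tokenize_only("Opposition rejects request")      ->  ['opposition', 'rejects', 'request']
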
    totalvocab_stemmed = []
    totalvocab_tokenized = []
    for item in data:
        allwords_stemmed = tokenize_and_stem(item['title'])
        totalvocab_stemmed.extend(allwords_stemmed)
        allwords_tokenized = tokenize_only(item['title'])
        totalvocab_tokenized.extend(allwords_tokenized)
    vocab_frame = pd.DataFrame({'words': totalvocab_tokenized}, index=totalvocab_stemmed)
    print('[clustering] there are ' + str(vocab_frame.shape[0]) + ' items in vocab_frame')
    print(vocab_frame.head())
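
    # vocab_frame maps each stem back to a surface form that produced it, so the
    # cluster report below can print readable words instead of stems. Roughly:
    #
    #   vocab_frame.loc['opposit']  # -> rows whose 'words' column is 'opposition'
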
    from sklearn.feature_extraction.text import TfidfVectorizer

    # define vectorizer parameters
    tfidf_vectorizer = TfidfVectorizer(max_df=0.9, max_features=200000,
                                       min_df=0.1, stop_words='english',
                                       use_idf=True, tokenizer=tokenize_and_stem,
                                       ngram_range=(1, 3))
    tfidf_matrix = tfidf_vectorizer.fit_transform(titles)  # fit the vectorizer to titles
    print('[clustering] tfidf_matrix.shape :', tfidf_matrix.shape)
    terms = tfidf_vectorizer.get_feature_names()

    from sklearn.metrics.pairwise import cosine_similarity
    dist = 1 - cosine_similarity(tfidf_matrix)
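
    # 'dist' is never used below; presumably it was kept for visualizing the
    # clusters. A minimal sketch, assuming matplotlib is available:
    #
    #   from sklearn.manifold import MDS
    #   import matplotlib.pyplot as plt
    #   mds = MDS(n_components=2, dissimilarity='precomputed', random_state=1)
    #   pos = mds.fit_transform(dist)      # shape (n_titles, 2)
    #   plt.scatter(pos[:, 0], pos[:, 1])  # color by cluster label once km is fit
    #   plt.show()
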
    if os.path.isfile(model_path):
        print('[clustering] load model..')
        km = joblib.load(model_path)
    else:
        print('[clustering] build model..')
        from sklearn.cluster import KMeans
        km = KMeans(init='k-means++', n_clusters=num_clusters, n_init=10)
        km.fit(tfidf_matrix)
        joblib.dump(km, model_path)
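
    # note: the cached model is reused even if num_clusters or the input data
    # change; delete doc_cluster.pkl to force retraining.
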
    clusters = km.labels_.tolist()
    films = {'title': titles, 'cluster': clusters, 'section': sections}
    frame = pd.DataFrame(films, index=clusters, columns=['title', 'cluster', 'section'])
    print('[clustering] number of articles per cluster :', frame['cluster'].value_counts())

    print("[clustering] top terms per cluster:")
    # argsort each centroid and reverse: highest-weight terms come first
    order_centroids = km.cluster_centers_.argsort()[:, ::-1]
    for i in range(num_clusters):
        print("[clustering] cluster %d words:" % i, end='')
        for ind in order_centroids[i, :6]:  # replace 6 with n words per cluster
            # .loc replaces the removed DataFrame.ix; encode() prints as bytes,
            # which is why the sample output below shows b'...'
            print(' %s' % vocab_frame.loc[terms[ind].split(' ')].values.tolist()[0][0].encode('utf-8', 'ignore'), end=',')
        print('')
        # print("Cluster %d titles:" % i, end='')
        # for title in frame.loc[i]['title'].values.tolist():
        #     print(' %s,' % title, end='')
        # print()
if __name__ == "__main__":
    # print(data_load(year=2017)[0])
    clustering()
@bowbowbow (Author):
[clustering] top terms per cluster:
[clustering] cluster 0 words: b'opposition', b'opposition', b'opposition', b'rejects', b'rejects', b'request',
[clustering] cluster 1 words: b'samsung', b'ranks', b"'s", b"'s", b'11th', b'arrest',
[clustering] cluster 2 words: b'military', b'condemns', b'korea', b'korea', b'land', b'military',
