Skip to content

Instantly share code, notes, and snippets.

@bowbowbow
Created December 1, 2018 08:34
Show Gist options
  • Select an option

  • Save bowbowbow/d1ad799e74448b1999033dd22ce3b189 to your computer and use it in GitHub Desktop.

Select an option

Save bowbowbow/d1ad799e74448b1999033dd22ce3b189 to your computer and use it in GitHub Desktop.
import json, re, datetime
import pandas as pd
import nltk
from nltk.stem.snowball import SnowballStemmer
# English stopword list from the NLTK corpus; tokenize_and_stem uses it to
# drop common words before stemming.
stopwords = nltk.corpus.stopwords.words('english')
# Shared English Snowball stemmer; tokenize_and_stem calls stemmer.stem()
# on every token it keeps.
stemmer = SnowballStemmer("english")
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
def data_load(path_template='./data/koreaherald_1517_{}.json', num_files=8):
    """Load the numbered JSON shards and return them as one DataFrame.

    Args:
        path_template: Format string with a single ``{}`` placeholder that
            is filled with the shard number. The default is the location
            that used to be hard-coded here.
        num_files: How many shards to read, numbered ``0 .. num_files - 1``.

    Returns:
        One DataFrame made by concatenating every shard in order. Each
        shard keeps its own row labels, so labels can repeat across shards.

    Raises:
        OSError: If a shard cannot be opened.
        ValueError: If a shard is not valid JSON, or if ``num_files`` is
            not positive (there is nothing to concatenate).
    """
    frames = []
    for i in range(num_files):
        # Read as UTF-8 explicitly: the platform default encoding is not
        # UTF-8 everywhere, and JSON text is expected to be UTF-8.
        with open(path_template.format(i), 'r', encoding='utf-8') as f:
            data = json.load(f)
        # Columns noted by the original author (leading spaces included):
        # [' author' ' body' ' description' ' section' ' time' 'title']
        frames.append(pd.DataFrame.from_dict(data))
    return pd.concat(frames)
def chunking(text):
    """Return the noun-phrase chunks found in *text*.

    The text is split into sentences, each sentence is reduced to its
    purely alphabetic tokens, POS-tagged, and parsed with a regular
    expression grammar whose single rule ``NP`` is an optional determiner,
    an optional adjective and any run of noun tags.

    Returns:
        A list with one entry per ``NP`` subtree; each entry is the list of
        words in that subtree, in order.
    """
    np_parser = nltk.RegexpParser("NP: {<DT>?<JJ>?<NN.*>*}")
    noun_phrases = []
    for sentence in nltk.sent_tokenize(text):
        alpha_tokens = [
            token for token in nltk.word_tokenize(sentence) if token.isalpha()
        ]
        parsed = np_parser.parse(nltk.pos_tag(alpha_tokens))
        for subtree in parsed.subtrees():
            if subtree.label() != "NP":
                continue
            # Leaves are tagged tokens; keep only the word part.
            noun_phrases.append([leaf[0] for leaf in subtree.leaves()])
    return noun_phrases
def tokenize_and_stem(text):
    """Tokenize *text*, drop non-words and stopwords, and stem the rest.

    The text is split into sentences and then into word tokens. Tokens
    that are not purely alphabetic are discarded, as are English
    stopwords; every remaining token is stemmed with the module-level
    Snowball stemmer.

    Args:
        text: The raw string to process.

    Returns:
        A flat list of stems in the order the tokens appeared.
    """
    stems = []
    for sent in nltk.sent_tokenize(text):
        for word in nltk.word_tokenize(sent):
            if not word.isalpha():
                continue
            # NLTK's English stopword list is lower-case, so compare the
            # lower-cased token; otherwise capitalised stopwords such as
            # "The" slip through and end up in the output as stems.
            if word.lower() in stopwords:
                continue
            stems.append(stemmer.stem(word))
    return stems
def _articles_from_year(df, year):
    """Return a copy of the rows of *df* whose ' time' column falls in *year*."""
    published = pd.to_datetime(df[' time'], format='%Y-%m-%d %H:%M:%S')
    # Use a plain boolean array as the mask: the frame returned by
    # data_load() can carry repeated row labels, which label-aligned
    # boolean indexing does not cope with.
    return df[(published.dt.year == year).values].copy()


def clustering(year):
    """Cluster the article titles published in *year* and print the top terms.

    Titles are vectorised with TF-IDF over stemmed unigrams and bigrams,
    grouped with k-means, and the ten highest-weighted terms of every
    cluster centre are printed.

    Args:
        year: Calendar year (int) of the articles to cluster.

    Raises:
        ValueError: If the data set contains no article for *year*.
    """
    from sklearn.cluster import KMeans

    df = data_load()
    print(df.head())

    # Restrict the data to the requested year BEFORE vectorising. The
    # earlier version collected the matching titles into a list that was
    # never used and fitted on the full frame, so every year produced
    # clusters for the whole data set.
    df = _articles_from_year(df, year)
    if df.empty:
        raise ValueError('no articles found for year {}'.format(year))

    # Integer-encode authors so the (currently disabled) author feature
    # below can be switched on without further preparation.
    author_encoder = LabelEncoder()
    df[' author'] = author_encoder.fit_transform(df[' author'])

    class TextSelector(BaseEstimator, TransformerMixin):
        """Pipeline step that selects a single column by key."""

        def __init__(self, key):
            self.key = key

        def fit(self, x, y=None):
            # Nothing to learn; the step only selects.
            return self

        def transform(self, data_dict):
            return data_dict[self.key]

    class NumberSelector(BaseEstimator, TransformerMixin):
        """Pipeline step that selects a column by key via a one-item list."""

        def __init__(self, key):
            self.key = key

        def fit(self, x, y=None):
            # Nothing to learn; the step only selects.
            return self

        def transform(self, data_dict):
            # List-of-one-key indexing; with a DataFrame input this yields
            # a one-column frame rather than a Series.
            return data_dict[[self.key]]

    vectorizer = FeatureUnion(
        transformer_list=[
            ('title', Pipeline([
                ('selector', TextSelector(key='title')),
                ('tfidf', TfidfVectorizer(tokenizer=tokenize_and_stem, ngram_range=(1, 2)))
            ])),
            # ('author', Pipeline([
            #     ('selector', NumberSelector(key=' author')),
            #     ('onehot', OneHotEncoder())
            # ])),
        ],
        # weight components in FeatureUnion
        transformer_weights={
            'title': 0.7,
            # 'author': 0.3,
        },
    )
    X = vectorizer.fit_transform(df)

    # get_feature_names() was removed in scikit-learn 1.2; prefer the
    # replacement and fall back for older releases. tolist() keeps the
    # result a plain list of str, as before.
    tfidf = vectorizer.transformer_list[0][1].named_steps['tfidf']
    if hasattr(tfidf, 'get_feature_names_out'):
        terms = tfidf.get_feature_names_out().tolist()
    else:
        terms = tfidf.get_feature_names()
    print(terms)

    # k-means cannot form more clusters than there are samples.
    true_k = min(10, X.shape[0])
    model = KMeans(n_clusters=true_k, init='k-means++', max_iter=100, n_init=1)
    model.fit(X)

    print("Top terms per cluster:")
    # Feature indices of every centre, sorted by descending weight.
    order_centroids = model.cluster_centers_.argsort()[:, ::-1]
    print('order_centroids : ', order_centroids)
    for i in range(true_k):
        keywords = [terms[ind] for ind in order_centroids[i, :10]]
        print('Cluster {}: {}'.format(i, ','.join(keywords)))
if __name__ == "__main__":
    # Script entry point: cluster the headlines of a single year.
    target_year = 2017
    clustering(year=target_year)
@bowbowbow
Copy link
Author

bowbowbow commented Dec 1, 2018

2017

Cluster 0: nk,moon,chief,court,minist,new,us,arrest,presid,report
Cluster 1: parti,rule,rule parti,opposit parti,opposit,leader,lawmak,parti chief,parti leader,peopl parti
Cluster 2: call,park call,park,korea,call korea,nk,korea call,moon call,us,chief call
Cluster 3: seoul,pyongyang,citi,seoul citi,nk,nuke,seoul washington,washington,seoul mayor,tokyo
Cluster 4: park,impeach,aid,park approv,rate,park impeach,scandal,approv,presid park,approv rate
Cluster 5: korea,us,china,japan,missil,south korea,south,nuclear,report,militari
Cluster 6: sex,slaveri,sex slaveri,sex slave,slave,wartim,japan,sexual slaveri,sexual,victim
Cluster 7: korean,south korean,china,south,korea,korean leader,leader,report,militari,peninsula
Cluster 8: talk,hold,korea,hold talk,korea hold,talk korea,japan,china,japan hold,korea japan
Cluster 9: north,north korea,korea,north korean,korean,test,south,nuclear,missil,south korea

2016

Cluster 0: elect,presidenti,presidenti offic,offic,candid,presidenti elect,bid,presidenti bid,moon,presidenti hope
Cluster 1: sex,slaveri,sex slaveri,sex slave,slave,wartim,japan,sexual slaveri,sexual,victim
Cluster 2: arrest,alleg,polic,prosecutor,scandal,briberi,probe,chief,warrant,raid
Cluster 3: korean,nk,moon,us,new,call,minist,leader,china,report
Cluster 4: parti,rule,rule parti,opposit parti,opposit,leader,rival parti,rival,lawmak,parti leader
Cluster 5: park,park call,call,impeach,park approv,rate,approv,scandal,approv rate,aid
Cluster 6: korea,north,us,north korea,china,talk,japan,south,nuclear,south korea
Cluster 7: court,top court,court uphold,constitut,uphold,top,constitut court,court rule,rule,court order
Cluster 8: seoul,pyongyang,citi,seoul citi,talk,seoul washington,washington,tokyo,nk,nuke
Cluster 9: missil,missil launch,launch,korea,nk,missil test,test,nk missil,korea missil,ballist missil

2015

Cluster 0: moon,chief,court,arrest,minist,new,presid,alleg,probe,presidenti
Cluster 1: korea,north,north korea,china,south,south korea,japan,nuclear,report,new
Cluster 2: talk,hold,korea,hold talk,korea hold,talk korea,japan,china,korea japan,japan hold
Cluster 3: korean,north korean,north,south korean,south,korean leader,china,leader,korea,report
Cluster 4: us,korea,korea us,nk,us japan,militari,sanction,japan,drill,us expert
Cluster 5: vow,human right,human,right,korea,park vow,korea vow,effort,moon vow,nk
Cluster 6: park,park call,call,impeach,aid,park approv,park impeach,rate,scandal,approv
Cluster 7: seoul,pyongyang,citi,seoul citi,seoul washington,washington,nk,tokyo,seoul mayor,nuke
Cluster 8: nk,missil,launch,missil launch,korea,test,missil test,nk missil,nuke,nk leader
Cluster 9: parti,rule,rule parti,opposit parti,opposit,leader,parti chief,lawmak,rival parti,rival

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment