Created
December 1, 2018 08:34
-
-
Save bowbowbow/d1ad799e74448b1999033dd22ce3b189 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import json, re, datetime | |
| import pandas as pd | |
| import nltk | |
| from nltk.stem.snowball import SnowballStemmer | |
| stopwords = nltk.corpus.stopwords.words('english') | |
| stemmer = SnowballStemmer("english") | |
| from sklearn.base import BaseEstimator, TransformerMixin | |
| from sklearn.pipeline import FeatureUnion, Pipeline | |
| from sklearn.preprocessing import OneHotEncoder, LabelEncoder | |
| from sklearn.feature_extraction.text import TfidfVectorizer | |
def data_load(data_dir='./data', file_count=8):
    """Load the Korea Herald JSON dumps and stack them into one DataFrame.

    Reads ``koreaherald_1517_<i>.json`` for every ``i`` in
    ``range(file_count)`` from ``data_dir``. The defaults reproduce the
    originally hard-coded location and number of files.

    Args:
        data_dir: Directory that holds the JSON files.
        file_count: Number of consecutively numbered files to read,
            starting at index 0.

    Returns:
        A single DataFrame made by concatenating the per-file frames.
        Each file's own index labels are kept, so labels may repeat
        across files.

    Raises:
        OSError: If one of the expected files cannot be opened.
        ValueError: If ``file_count`` is 0 (nothing to concatenate) or a
            file does not contain valid JSON.
    """
    frames = []
    for i in range(file_count):
        path = '{}/koreaherald_1517_{}.json'.format(data_dir, i)
        # JSON is UTF-8 by specification; do not depend on the platform's
        # default text encoding when decoding it.
        with open(path, 'r', encoding='utf-8') as f:
            data = json.load(f)
        # Expected columns (note the leading spaces in the source data):
        # [' author' ' body' ' description' ' section' ' time' 'title']
        frames.append(pd.DataFrame.from_dict(data))
    return pd.concat(frames)
def chunking(text):
    """Extract noun-phrase chunks from ``text``.

    Each sentence is word-tokenized, non-alphabetic tokens are dropped
    (stop words are kept), the remainder is POS-tagged, and a regexp
    chunker collects every ``NP`` subtree. The grammar accepts an optional
    determiner, an optional adjective, and any number of noun tags.

    Args:
        text: Raw text to analyse.

    Returns:
        A list of chunks; each chunk is a list of the words it contains.
    """
    np_parser = nltk.RegexpParser("NP: {<DT>?<JJ>?<NN.*>*}")
    noun_phrases = []
    for sentence in nltk.sent_tokenize(text):
        alpha_tokens = [
            token for token in nltk.word_tokenize(sentence) if token.isalpha()
        ]
        parsed = np_parser.parse(nltk.pos_tag(alpha_tokens))
        # Leaves are (word, tag) pairs; keep only the word of each pair.
        noun_phrases.extend(
            [leaf[0] for leaf in subtree.leaves()]
            for subtree in parsed.subtrees()
            if subtree.label() == "NP"
        )
    return noun_phrases
def tokenize_and_stem(text):
    """Tokenize ``text`` and return the stems of its content words.

    Tokens that are not purely alphabetic, or that appear in the
    module-level ``stopwords`` list, are discarded; the rest are stemmed
    with the module-level ``stemmer``.

    Args:
        text: Raw text to tokenize.

    Returns:
        A flat list of stems, in the order the words occur in ``text``.
    """
    stemmed = []
    for sentence in nltk.sent_tokenize(text):
        kept = (
            token
            for token in nltk.word_tokenize(sentence)
            if token.isalpha() and token not in stopwords
        )
        stemmed.extend(stemmer.stem(token) for token in kept)
    return stemmed
def clustering(year):
    """Cluster one year's article titles and print the top terms per cluster.

    Loads every article via ``data_load()``, keeps only those whose
    ``' time'`` field falls in ``year``, vectorizes the titles with TF-IDF
    (unigrams and bigrams of stemmed tokens) and runs k-means with up to
    10 clusters. No ``random_state`` is passed to k-means, so the clusters
    may differ between runs.

    Args:
        year: Calendar year (int) of the articles to cluster.

    Returns:
        None. Results are printed to stdout.

    Raises:
        ValueError: If no article was published in ``year``, or a
            ``' time'`` value does not match ``%Y-%m-%d %H:%M:%S``.
    """
    from sklearn.cluster import KMeans

    df = data_load()
    print(df.head())

    # Restrict the data to the requested year *before* vectorizing.
    # Previously the year filter only filled lists that were never used,
    # so every call clustered the full data set regardless of `year`.
    published = pd.to_datetime(df[' time'], format='%Y-%m-%d %H:%M:%S')
    # Use a plain boolean array: the concatenated frame may carry repeated
    # index labels, which makes label-aligned masking unreliable.
    df = df[(published.dt.year == year).values].copy()
    if df.empty:
        raise ValueError('no articles found for year {}'.format(year))

    # Integer-encode authors. Only needed if the commented-out 'author'
    # branch of the feature union below is re-enabled.
    df[' author'] = LabelEncoder().fit_transform(df[' author'])

    class TextSelector(BaseEstimator, TransformerMixin):
        """Pick a single text column (as a Series) out of a DataFrame."""

        def __init__(self, key):
            self.key = key

        def fit(self, x, y=None):
            return self

        def transform(self, data_dict):
            return data_dict[self.key]

    class NumberSelector(BaseEstimator, TransformerMixin):
        """Pick a single column as a one-column DataFrame."""

        def __init__(self, key):
            self.key = key

        def fit(self, x, y=None):
            return self

        def transform(self, data_dict):
            return data_dict[[self.key]]

    vectorizer = FeatureUnion(
        transformer_list=[
            ('title', Pipeline([
                ('selector', TextSelector(key='title')),
                ('tfidf', TfidfVectorizer(tokenizer=tokenize_and_stem, ngram_range=(1, 2)))
            ])),
            # ('author', Pipeline([
            #     ('selector', NumberSelector(key=' author')),
            #     ('onehot', OneHotEncoder())
            # ])),
        ],
        # weight components in FeatureUnion
        transformer_weights={
            'title': 0.7,
            # 'author': 0.3,
        },
    )
    X = vectorizer.fit_transform(df)

    # Look the vocabulary up once. get_feature_names() was removed in
    # scikit-learn 1.2 in favour of get_feature_names_out().
    tfidf = vectorizer.transformer_list[0][1].named_steps['tfidf']
    if hasattr(tfidf, 'get_feature_names_out'):
        terms = tfidf.get_feature_names_out().tolist()
    else:
        terms = tfidf.get_feature_names()
    print(terms)

    # k-means cannot form more clusters than there are samples.
    true_k = min(10, X.shape[0])
    model = KMeans(n_clusters=true_k, init='k-means++', max_iter=100, n_init=1)
    model.fit(X)

    print("Top terms per cluster:")
    # For each centroid, feature indices sorted by descending weight.
    order_centroids = model.cluster_centers_.argsort()[:, ::-1]
    print('order_centroids : ', order_centroids)
    for i in range(true_k):
        keywords = [terms[ind] for ind in order_centroids[i, :10]]
        print('Cluster {}: {}'.format(i, ','.join(keywords)))
# Script entry point: cluster the article titles published in 2017.
if __name__ == '__main__':
    clustering(2017)
Author
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
2017
Cluster 0: nk,moon,chief,court,minist,new,us,arrest,presid,report
Cluster 1: parti,rule,rule parti,opposit parti,opposit,leader,lawmak,parti chief,parti leader,peopl parti
Cluster 2: call,park call,park,korea,call korea,nk,korea call,moon call,us,chief call
Cluster 3: seoul,pyongyang,citi,seoul citi,nk,nuke,seoul washington,washington,seoul mayor,tokyo
Cluster 4: park,impeach,aid,park approv,rate,park impeach,scandal,approv,presid park,approv rate
Cluster 5: korea,us,china,japan,missil,south korea,south,nuclear,report,militari
Cluster 6: sex,slaveri,sex slaveri,sex slave,slave,wartim,japan,sexual slaveri,sexual,victim
Cluster 7: korean,south korean,china,south,korea,korean leader,leader,report,militari,peninsula
Cluster 8: talk,hold,korea,hold talk,korea hold,talk korea,japan,china,japan hold,korea japan
Cluster 9: north,north korea,korea,north korean,korean,test,south,nuclear,missil,south korea
2016
Cluster 0: elect,presidenti,presidenti offic,offic,candid,presidenti elect,bid,presidenti bid,moon,presidenti hope
Cluster 1: sex,slaveri,sex slaveri,sex slave,slave,wartim,japan,sexual slaveri,sexual,victim
Cluster 2: arrest,alleg,polic,prosecutor,scandal,briberi,probe,chief,warrant,raid
Cluster 3: korean,nk,moon,us,new,call,minist,leader,china,report
Cluster 4: parti,rule,rule parti,opposit parti,opposit,leader,rival parti,rival,lawmak,parti leader
Cluster 5: park,park call,call,impeach,park approv,rate,approv,scandal,approv rate,aid
Cluster 6: korea,north,us,north korea,china,talk,japan,south,nuclear,south korea
Cluster 7: court,top court,court uphold,constitut,uphold,top,constitut court,court rule,rule,court order
Cluster 8: seoul,pyongyang,citi,seoul citi,talk,seoul washington,washington,tokyo,nk,nuke
Cluster 9: missil,missil launch,launch,korea,nk,missil test,test,nk missil,korea missil,ballist missil
2015
Cluster 0: moon,chief,court,arrest,minist,new,presid,alleg,probe,presidenti
Cluster 1: korea,north,north korea,china,south,south korea,japan,nuclear,report,new
Cluster 2: talk,hold,korea,hold talk,korea hold,talk korea,japan,china,korea japan,japan hold
Cluster 3: korean,north korean,north,south korean,south,korean leader,china,leader,korea,report
Cluster 4: us,korea,korea us,nk,us japan,militari,sanction,japan,drill,us expert
Cluster 5: vow,human right,human,right,korea,park vow,korea vow,effort,moon vow,nk
Cluster 6: park,park call,call,impeach,aid,park approv,park impeach,rate,scandal,approv
Cluster 7: seoul,pyongyang,citi,seoul citi,seoul washington,washington,nk,tokyo,seoul mayor,nuke
Cluster 8: nk,missil,launch,missil launch,korea,test,missil test,nk missil,nuke,nk leader
Cluster 9: parti,rule,rule parti,opposit parti,opposit,leader,parti chief,lawmak,rival parti,rival