-
-
Save bowbowbow/16d1ec6778a35626867569cd1c285940 to your computer and use it in GitHub Desktop.
| import json, re, datetime | |
| import pandas as pd | |
| import numpy as np | |
| import nltk | |
| from nltk.stem.snowball import SnowballStemmer | |
| stopwords = nltk.corpus.stopwords.words('english') | |
| stemmer = SnowballStemmer("english") | |
| from sklearn.base import BaseEstimator, TransformerMixin | |
| from sklearn.pipeline import FeatureUnion, Pipeline | |
| from sklearn.preprocessing import OneHotEncoder, LabelEncoder | |
| from sklearn.feature_extraction.text import TfidfVectorizer | |
| from sklearn.metrics import pairwise_distances_argmin_min, pairwise | |
def data_load(data_dir='./data', num_files=8):
    """Load the Korea Herald JSON dumps into a single DataFrame.

    Reads ``<data_dir>/koreaherald_1517_<i>.json`` for every ``i`` in
    ``range(num_files)``.  Each file is read as a column-oriented mapping
    (``{column: {doc_id: value}}``) that must contain ``' body'`` and
    ``' time'`` (the leading space is part of the key).  Two derived columns
    are added for every document id found under ``' body'``:

    * ``year`` -- calendar year parsed from ``' time'``.
    * ``timestamp`` -- ``(month * 30 + day) / 100.0``.

    Args:
        data_dir: Directory that holds the JSON files.
        num_files: Number of consecutively numbered files to read.

    Returns:
        The per-file frames joined with ``pd.concat``; the index is not
        reset, so it still carries each file's own document ids.
    """
    frames = []
    for i in range(num_files):
        path = '{}/koreaherald_1517_{}.json'.format(data_dir, i)
        # JSON text is UTF-8; relying on the platform default encoding can
        # fail on non-ASCII characters (e.g. curly quotes in titles).
        with open(path, 'r', encoding='utf-8') as f:
            data = json.load(f)

        years = {}
        timestamps = {}
        for doc_id in data[' body']:
            published = datetime.datetime.strptime(
                data[' time'][doc_id], '%Y-%m-%d %H:%M:%S')
            years[doc_id] = published.year
            timestamps[doc_id] = (published.month * 30 + published.day) / 100.0
        data['year'] = years
        data['timestamp'] = timestamps

        # header: [' author' ' body' ' description' ' section' ' time' 'title']
        frames.append(pd.DataFrame.from_dict(data))
    return pd.concat(frames)
def chunking(text):
    """Return the noun-phrase chunks of *text* as underscore-joined strings.

    The text is split into sentences, each sentence is reduced to its purely
    alphabetic tokens, POS-tagged, and parsed with a one-rule ``NP`` grammar.
    Every subtree labelled ``NP`` contributes one string made of its words
    joined by ``'_'``.
    """
    # NOTE(review): every element of the rule is optional (<NN.*> uses '*'),
    # so the pattern itself does not require a noun -- confirm that is intended.
    np_parser = nltk.RegexpParser("NP: {<DT>?<JJ>?<NN.*>*}")

    phrases = []
    for sentence in nltk.sent_tokenize(text):
        alpha_tokens = [tok for tok in nltk.word_tokenize(sentence) if tok.isalpha()]
        parsed = np_parser.parse(nltk.pos_tag(alpha_tokens))
        phrases.extend(
            '_'.join(leaf[0] for leaf in subtree.leaves())
            for subtree in parsed.subtrees()
            if subtree.label() == "NP"
        )
    return phrases
def tokenize_and_stem(text):
    """Tokenize *text* and return the stems of its non-stop-word tokens.

    The text is split into sentences and words with NLTK; tokens that are
    not purely alphabetic are dropped, stop words are removed, and the
    remaining words are stemmed with the module-level ``stemmer``.

    Args:
        text: Raw document text.

    Returns:
        List of stems in document order.
    """
    # A set makes each membership test O(1); it is rebuilt per call so the
    # function always reflects the current module-level ``stopwords`` list.
    stop_set = set(stopwords)

    stems = []
    for sent in nltk.sent_tokenize(text):
        for word in nltk.word_tokenize(sent):
            if not word.isalpha():
                continue
            # Compare case-insensitively: the NLTK English stop-word list is
            # lower-case, so "The" would otherwise survive the filter.
            if word.lower() in stop_set:
                continue
            stems.append(stemmer.stem(word))
    return stems
def clustering(year, n_clusters=10, top_n=5, random_state=None):
    """Cluster one year's articles with k-means and print the closest titles.

    Loads all articles with ``data_load()``, keeps the rows whose ``year``
    column equals *year*, and builds a combined feature matrix:

    * TF-IDF over stemmed uni-/bi-grams of ``'title'`` (``min_df=0.1``)
      and ``' body'`` (``min_df=0.2``),
    * one-hot encoded ``' author'`` and ``' section'``,
    * the numeric ``'timestamp'`` column as-is,

    weighted title 1.0, body 1.0, author 0.3, section 3.0, timestamp 0.5.
    The matrix is clustered with ``KMeans`` and, for each cluster, the row
    position and title of the *top_n* documents nearest to its centre are
    printed.

    Args:
        year: Value of the ``year`` column to keep.
        n_clusters: Number of k-means clusters.
        top_n: How many nearest documents to print per cluster.
        random_state: Forwarded to ``KMeans``; pass an int for repeatable
            clusters (``None`` keeps the run-to-run randomness).

    Raises:
        ValueError: If no article matches *year*.
    """
    from sklearn.cluster import KMeans

    df = data_load()
    # Work on an independent copy: the column assignments below would
    # otherwise target a slice of the full frame (SettingWithCopyWarning).
    df = df[df.year == year].copy()
    if df.empty:
        raise ValueError('no documents found for year {}'.format(year))

    # Replace the categorical columns by integer codes for the one-hot step.
    for column in (' author', ' section'):
        df[column] = LabelEncoder().fit_transform(df[column])

    class TextSelector(BaseEstimator, TransformerMixin):
        """Select one column as a 1-D series (input for a text vectorizer)."""

        def __init__(self, key):
            self.key = key

        def fit(self, x, y=None):
            return self

        def transform(self, data_dict):
            return data_dict[self.key]

    class NumberSelector(BaseEstimator, TransformerMixin):
        """Select one column as a single-column 2-D frame."""

        def __init__(self, key):
            self.key = key

        def fit(self, x, y=None):
            return self

        def transform(self, data_dict):
            return data_dict[[self.key]]

    vectorizer = FeatureUnion(
        transformer_list=[
            ('title', Pipeline([
                ('selector', TextSelector(key='title')),
                ('tfidf', TfidfVectorizer(min_df=0.1, tokenizer=tokenize_and_stem, ngram_range=(1, 2))),
            ])),
            ('body', Pipeline([
                ('selector', TextSelector(key=' body')),
                ('tfidf', TfidfVectorizer(min_df=0.2, tokenizer=tokenize_and_stem, ngram_range=(1, 2))),
            ])),
            ('author', Pipeline([
                ('selector', NumberSelector(key=' author')),
                ('onehot', OneHotEncoder(categories='auto')),
            ])),
            ('section', Pipeline([
                ('selector', NumberSelector(key=' section')),
                ('onehot', OneHotEncoder(categories='auto')),
            ])),
            ('timestamp', Pipeline([
                ('selector', NumberSelector(key='timestamp')),
            ])),
        ],
        # weight components in FeatureUnion
        transformer_weights={
            'section': 3.0,
            'title': 1.0,
            'body': 1.0,
            'timestamp': 0.5,
            'author': 0.3,
        },
    )
    X = vectorizer.fit_transform(df)

    model = KMeans(n_clusters=n_clusters, init='k-means++', max_iter=100,
                   n_init=1, random_state=random_state)
    model.fit(X)
    df['cluster'] = model.labels_

    closest, _ = pairwise_distances_argmin_min(model.cluster_centers_, X)
    print('closest :', closest)

    # Distance from every document to every centre; computed once rather
    # than once per cluster.
    distances = model.transform(X)
    for c in range(n_clusters):
        print('cluster {}'.format(c))
        # Stable sort keeps ties in original row order.
        nearest = np.argsort(distances[:, c], kind='stable')[:top_n]
        for position in nearest:
            doc_id = int(position)
            print(doc_id, ', title :', df.iloc[doc_id]['title'])
if __name__ == "__main__":
    # Script entry point: run the clustering report for articles from 2017.
    TARGET_YEAR = 2017
    clustering(year=TARGET_YEAR)
cluster 0
6108 , title : [Graphic News] Salvage operation procedure
6219 , title : [Graphic News] Former President Park Geun-hye‘s grilling by the prosecution
6603 , title : [Graphic News] Four possible scenarios
3632 , title : [URGENT] Choi Soon-sil gets 3-year prison term for college entrance irregularities
4876 , title : Gwanghwamun rises as center of democracy
cluster 1
3829 , title : NK says int'l sanctions hinder delivery of aid to Pyongyang
3068 , title : No evidence of money from Kaesong complex being used for NK nukes: Seoul official
5704 , title : Growing signs of NK missile test deal blow to engagement approach
5924 , title : Seoul views inter-Korean dialogue differently from NK nuke talks: official
3605 , title : Coordination over Seoul, Washington's NK policy key task for allies
cluster 2
5145 , title : [Graphic News] Election in numbers
3970 , title : [News Focus] Moon's extra budget speech fails to win over opposition
3930 , title : Opposition cries foul on Kim appointment
8563 , title : Legal battles crowd presidential race
4813 , title : [Newsmaker] Moon Jae-in names special envoys, adds Europe to list
cluster 3
2040 , title : Talk of tactical nuclear weapons resurfaces
3866 , title : Allies to conduct military drills similar to last year‘s: military
5344 , title : Moon Jae-in urges ‘complete’ overhaul of military
3422 , title : S. Korea, US begin military drills amid N. Korea's threats
3789 , title : JCS chief nominee says no plan to scale back Korea-US military drill
cluster 4
6656 , title : China's top diplomat renews strong objection to THAAD deployment
8474 , title : Hwang urges Japan to show sincere remorse over shared history
8546 , title : [Newsmaker] Chinese envoy’s anti-THAAD campaign double-edged sword
8785 , title : Recalled Japanese envoy to return to Seoul
8601 , title : Japan envoy meets vice foreign minister amid girl statue controversy
cluster 5
3349 , title : Moon, Xi likely to meet in Germany this week to discuss THAAD controversy
5092 , title : Daunting diplomatic challenges with Trump, China await next S.Korean president
3397 , title : Moon's efforts for inter-Korean ties to gain pace with Trump's support
3350 , title : Russia hopes Korean president to visit the country in Sept.
5070 , title : THAAD deployment could raise cost-sharing questions: CRS report
cluster 6
6334 , title : Tillerson to visit DMZ as part of Korean tour
8940 , title : Russian envoy calls for restraint amid tensions on Korean Peninsula
8871 , title : Trump to host China's Xi at Florida resort next week
8703 , title : Secondary sanctions 'early topic' for Trump-Xi summit: senior White House official
4929 , title : Reopening of Kaesong complex requires UN sanctions-related review: official
cluster 7
2456 , title : [Graphic News] A day in Seoul in numbers
555 , title : Top 10 national news stories
3632 , title : [URGENT] Choi Soon-sil gets 3-year prison term for college entrance irregularities
663 , title : [Frome the Scene] Pilot ‘right to die’ program gets warm reception
1402 , title : [Feature] Gosiwon, modern time refuge for house poor
cluster 8
5440 , title : N. Korea may carry out more provocations despite UN resolutions: experts
5388 , title : N. Korea threatens 'physical' actions over new UN sanctions
3032 , title : N. Korea says another UNSC resolution will trigger corresponding measures
5959 , title : Japan says this is a time to raise pressure on North Korea
5315 , title : Trump, North Korea trade escalating threats of fire
cluster 9
259 , title : Former sex slave, ‘Dokdo Shrimp’ at Trump dinner spark new Korea-Japan row
162 , title : Korean FM to visit China to discuss preparations for summit
2314 , title : S. Korean lawmaker raises expectations that Moon, Abe will resume 'shuttle diplomacy'
130 , title : Korean, Philippine leaders agree to improve ties, better protect nationals
2503 , title : Turkish PM promises to support Korean companies as if they're domestic firms
cluster 0
title: Hwang urges Japan to show sincere remorse over shared history
cluster 1
title: S. Korea likely to propose talks to NK following Moon's peace gesture
cluster 2
title: Authorities deny allegation of mistreatment of jailed ex-president
cluster 3
title: Conservatives jostle for position to challenge frontrunner Moon
cluster 4
title: [Graphic News] Salvage operation procedure
cluster 5
title: Ruling party renews calls for main opposition to lift parliamentary boycott
cluster 6
title: Acting president renews vow to carry out THAAD deployment
cluster 7
title: Russian envoy calls for restraint amid tensions on Korean Peninsula
cluster 8
title: Allies to conduct military drills similar to last year‘s: military
cluster 9
title: PyeongChang Games may help mend soured Sino-Korean ties: experts