Last active
December 2, 2018 13:57
-
-
Save bowbowbow/16d1ec6778a35626867569cd1c285940 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import json, re, datetime | |
| import pandas as pd | |
| import numpy as np | |
| import nltk | |
| from nltk.stem.snowball import SnowballStemmer | |
# Shared NLP resources used by the tokenizer functions in this module:
# a Snowball stemmer for English and NLTK's English stop-word list.
stemmer = SnowballStemmer("english")
stopwords = nltk.corpus.stopwords.words('english')
| from sklearn.base import BaseEstimator, TransformerMixin | |
| from sklearn.pipeline import FeatureUnion, Pipeline | |
| from sklearn.preprocessing import OneHotEncoder, LabelEncoder | |
| from sklearn.feature_extraction.text import TfidfVectorizer | |
| from sklearn.metrics import pairwise_distances_argmin_min, pairwise | |
def data_load(path_template='./data/koreaherald_1517_{}.json', num_files=8):
    """Load the Korea Herald JSON shards into a single DataFrame.

    Each shard is a JSON object mapping a column name to a
    ``{doc_id: value}`` dict.  Two derived columns are added per document:

    * ``year`` -- the calendar year parsed from the ``' time'`` column.
    * ``timestamp`` -- ``(month * 30 + day) / 100.0``, i.e. roughly the
      day of the year scaled down by 100.

    Several source column names carry a leading space:
    [' author' ' body' ' description' ' section' ' time' 'title'].

    Args:
        path_template: ``str.format`` template for the shard paths; ``{}``
            is replaced with the shard number.
        num_files: Number of shards to read, numbered ``0 .. num_files - 1``.

    Returns:
        The shards concatenated with ``pd.concat`` (per-shard indices are
        kept as they are).
    """
    frames = []
    for shard in range(num_files):
        # JSON is UTF-8 by specification; do not depend on the locale's
        # default encoding, since the article text may be non-ASCII.
        with open(path_template.format(shard), 'r', encoding='utf-8') as f:
            data = json.load(f)

        years = {}
        timestamps = {}
        for doc_id in data[' body']:
            published = datetime.datetime.strptime(
                data[' time'][doc_id], '%Y-%m-%d %H:%M:%S')
            years[doc_id] = published.year
            timestamps[doc_id] = (published.month * 30 + published.day) / 100.0
        data['year'] = years
        data['timestamp'] = timestamps

        frames.append(pd.DataFrame.from_dict(data))
    return pd.concat(frames)
def chunking(text):
    """Extract ``NP`` chunks from ``text`` as underscore-joined strings.

    The text is split into sentences, each sentence is word-tokenized and
    reduced to purely alphabetic tokens, POS-tagged, and parsed with the
    grammar ``NP: {<DT>?<JJ>?<NN.*>*}``.  Stop words are not filtered here.

    Args:
        text: The raw text to chunk.

    Returns:
        A list with one string per ``NP`` subtree, in parse order; the
        words of each chunk are joined with ``_``.
    """
    np_parser = nltk.RegexpParser("NP: {<DT>?<JJ>?<NN.*>*}")
    phrases = []
    for sentence in nltk.sent_tokenize(text):
        alpha_tokens = [tok for tok in nltk.word_tokenize(sentence) if tok.isalpha()]
        parsed = np_parser.parse(nltk.pos_tag(alpha_tokens))
        for subtree in parsed.subtrees():
            if subtree.label() != "NP":
                continue
            # Each leaf is a (word, tag) pair; only the word is kept.
            phrases.append('_'.join(leaf[0] for leaf in subtree.leaves()))
    return phrases
def tokenize_and_stem(text):
    """Tokenize ``text`` and return the stems of its content words.

    The text is split into sentences and words with NLTK; tokens that are
    not purely alphabetic are dropped, English stop words are removed, and
    the remaining words are stemmed with the module-level Snowball stemmer.

    Args:
        text: The raw text to tokenize.

    Returns:
        A list of stems in document order.
    """
    # Set membership is O(1); the module-level stop-word collection is a list.
    stop_set = set(stopwords)
    stems = []
    for sent in nltk.sent_tokenize(text):
        for word in nltk.word_tokenize(sent):
            # The stop-word list is lowercase, so compare case-insensitively;
            # otherwise capitalized stop words such as "The" slip through.
            if word.isalpha() and word.lower() not in stop_set:
                stems.append(stemmer.stem(word))
    return stems
def _build_vectorizer():
    """Build the weighted ``FeatureUnion`` that featurizes article rows.

    The union combines TF-IDF features of the ``'title'`` and ``' body'``
    columns (tokenized with ``tokenize_and_stem``, unigrams and bigrams),
    one-hot encodings of the integer-coded ``' author'`` and ``' section'``
    columns, and the raw ``'timestamp'`` column.
    """

    class TextSelector(BaseEstimator, TransformerMixin):
        """Pick a single column as a 1-D Series (input for a text vectorizer)."""

        def __init__(self, key):
            self.key = key

        def fit(self, x, y=None):
            return self

        def transform(self, data_dict):
            return data_dict[self.key]

    class NumberSelector(BaseEstimator, TransformerMixin):
        """Pick a single column as a one-column, 2-D DataFrame."""

        def __init__(self, key):
            self.key = key

        def fit(self, x, y=None):
            return self

        def transform(self, data_dict):
            return data_dict[[self.key]]

    return FeatureUnion(
        transformer_list=[
            ('title', Pipeline([
                ('selector', TextSelector(key='title')),
                ('tfidf', TfidfVectorizer(min_df=0.1, tokenizer=tokenize_and_stem, ngram_range=(1, 2))),
            ])),
            ('body', Pipeline([
                ('selector', TextSelector(key=' body')),
                ('tfidf', TfidfVectorizer(min_df=0.2, tokenizer=tokenize_and_stem, ngram_range=(1, 2))),
            ])),
            ('author', Pipeline([
                ('selector', NumberSelector(key=' author')),
                ('onehot', OneHotEncoder(categories='auto')),
            ])),
            ('section', Pipeline([
                ('selector', NumberSelector(key=' section')),
                ('onehot', OneHotEncoder(categories='auto')),
            ])),
            ('timestamp', Pipeline([
                ('selector', NumberSelector(key='timestamp')),
            ])),
        ],
        # weight components in FeatureUnion
        transformer_weights={
            'section': 3.0,
            'title': 1.0,
            'body': 1.0,
            'timestamp': 0.5,
            'author': 0.3,
        },
    )


def clustering(year, n_clusters=10, top_n=5, random_state=None):
    """Cluster one year of articles with k-means and print representatives.

    Loads all articles with ``data_load()``, keeps those whose ``year``
    column equals ``year``, featurizes them, fits ``KMeans`` and prints,
    for every cluster, the ``top_n`` documents closest to its centre
    (positional row number and title).

    Args:
        year: Publication year to cluster.
        n_clusters: Number of k-means clusters.
        top_n: Number of closest documents printed per cluster.
        random_state: Passed to ``KMeans``; set it for reproducible runs.

    Returns:
        The filtered DataFrame with integer-coded ``' author'`` and
        ``' section'`` columns and an added ``'cluster'`` label column.

    Raises:
        ValueError: If no document exists for ``year``.
    """
    df = data_load()
    # Work on an explicit copy: the columns assigned below would otherwise
    # be written to a filtered slice (pandas SettingWithCopy warning).
    df = df[df.year == year].copy()
    if df.empty:
        raise ValueError('no documents found for year {}'.format(year))

    # Replace the categorical columns by integer codes for the one-hot step.
    for column in (' author', ' section'):
        df[column] = LabelEncoder().fit_transform(df[column])

    X = _build_vectorizer().fit_transform(df)

    from sklearn.cluster import KMeans
    model = KMeans(n_clusters=n_clusters, init='k-means++', max_iter=100,
                   n_init=1, random_state=random_state)
    model.fit(X)
    df['cluster'] = model.labels_

    closest, _ = pairwise_distances_argmin_min(model.cluster_centers_, X)
    print('closest :', closest)

    # Distance of every document to every centre; computed once rather than
    # once per cluster.
    distances = model.transform(X)
    titles = df['title']
    for c in range(n_clusters):
        print('cluster {}'.format(c))
        # A stable sort keeps the original row order among equal distances.
        for doc_id in np.argsort(distances[:, c], kind='stable')[:top_n]:
            doc_id = int(doc_id)
            print(doc_id, ', title :', titles.iloc[doc_id])
    return df
# Script entry point: when run directly, cluster the articles from 2017.
if __name__ == '__main__':
    clustering(year=2017)
Author
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
cluster 0
6108 , title : [Graphic News] Salvage operation procedure
6219 , title : [Graphic News] Former President Park Geun-hye‘s grilling by the prosecution
6603 , title : [Graphic News] Four possible scenarios
3632 , title : [URGENT] Choi Soon-sil gets 3-year prison term for college entrance irregularities
4876 , title : Gwanghwamun rises as center of democracy
cluster 1
3829 , title : NK says int'l sanctions hinder delivery of aid to Pyongyang
3068 , title : No evidence of money from Kaesong complex being used for NK nukes: Seoul official
5704 , title : Growing signs of NK missile test deal blow to engagement approach
5924 , title : Seoul views inter-Korean dialogue differently from NK nuke talks: official
3605 , title : Coordination over Seoul, Washington's NK policy key task for allies
cluster 2
5145 , title : [Graphic News] Election in numbers
3970 , title : [News Focus] Moon's extra budget speech fails to win over opposition
3930 , title : Opposition cries foul on Kim appointment
8563 , title : Legal battles crowd presidential race
4813 , title : [Newsmaker] Moon Jae-in names special envoys, adds Europe to list
cluster 3
2040 , title : Talk of tactical nuclear weapons resurfaces
3866 , title : Allies to conduct military drills similar to last year‘s: military
5344 , title : Moon Jae-in urges ‘complete’ overhaul of military
3422 , title : S. Korea, US begin military drills amid N. Korea's threats
3789 , title : JCS chief nominee says no plan to scale back Korea-US military drill
cluster 4
6656 , title : China's top diplomat renews strong objection to THAAD deployment
8474 , title : Hwang urges Japan to show sincere remorse over shared history
8546 , title : [Newsmaker] Chinese envoy’s anti-THAAD campaign double-edged sword
8785 , title : Recalled Japanese envoy to return to Seoul
8601 , title : Japan envoy meets vice foreign minister amid girl statue controversy
cluster 5
3349 , title : Moon, Xi likely to meet in Germany this week to discuss THAAD controversy
5092 , title : Daunting diplomatic challenges with Trump, China await next S.Korean president
3397 , title : Moon's efforts for inter-Korean ties to gain pace with Trump's support
3350 , title : Russia hopes Korean president to visit the country in Sept.
5070 , title : THAAD deployment could raise cost-sharing questions: CRS report
cluster 6
6334 , title : Tillerson to visit DMZ as part of Korean tour
8940 , title : Russian envoy calls for restraint amid tensions on Korean Peninsula
8871 , title : Trump to host China's Xi at Florida resort next week
8703 , title : Secondary sanctions 'early topic' for Trump-Xi summit: senior White House official
4929 , title : Reopening of Kaesong complex requires UN sanctions-related review: official
cluster 7
2456 , title : [Graphic News] A day in Seoul in numbers
555 , title : Top 10 national news stories
3632 , title : [URGENT] Choi Soon-sil gets 3-year prison term for college entrance irregularities
663 , title : [Frome the Scene] Pilot ‘right to die’ program gets warm reception
1402 , title : [Feature] Gosiwon, modern time refuge for house poor
cluster 8
5440 , title : N. Korea may carry out more provocations despite UN resolutions: experts
5388 , title : N. Korea threatens 'physical' actions over new UN sanctions
3032 , title : N. Korea says another UNSC resolution will trigger corresponding measures
5959 , title : Japan says this is a time to raise pressure on North Korea
5315 , title : Trump, North Korea trade escalating threats of fire
cluster 9
259 , title : Former sex slave, ‘Dokdo Shrimp’ at Trump dinner spark new Korea-Japan row
162 , title : Korean FM to visit China to discuss preparations for summit
2314 , title : S. Korean lawmaker raises expectations that Moon, Abe will resume 'shuttle diplomacy'
130 , title : Korean, Philippine leaders agree to improve ties, better protect nationals
2503 , title : Turkish PM promises to support Korean companies as if they're domestic firms