Skip to content

Instantly share code, notes, and snippets.

@bowbowbow
Last active December 2, 2018 13:57
Show Gist options
  • Select an option

  • Save bowbowbow/16d1ec6778a35626867569cd1c285940 to your computer and use it in GitHub Desktop.

Select an option

Save bowbowbow/16d1ec6778a35626867569cd1c285940 to your computer and use it in GitHub Desktop.
import json, re, datetime
import pandas as pd
import numpy as np
import nltk
from nltk.stem.snowball import SnowballStemmer
stopwords = nltk.corpus.stopwords.words('english')
stemmer = SnowballStemmer("english")
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import pairwise_distances_argmin_min, pairwise
def data_load(data_dir='./data', num_files=8):
    """Load the Korea Herald JSON shards into a single DataFrame.

    Each shard ``koreaherald_1517_<i>.json`` (``i`` in ``range(num_files)``)
    is read as a column-oriented mapping ``{column: {doc_id: value}}``. Two
    derived columns are added before the shard becomes a DataFrame:

    * ``year`` -- calendar year parsed from the ``' time'`` column.
    * ``timestamp`` -- ``(month * 30 + day) / 100.0``.

    Several source column names carry a leading space (``' author'``,
    ``' body'``, ``' description'``, ``' section'``, ``' time'``);
    ``'title'`` does not.

    Args:
        data_dir: Directory that contains the shard files.
        num_files: Number of consecutive shards to read, starting at 0.

    Returns:
        The row-wise concatenation of all shards. Row labels are the
        per-shard document ids; they are not renumbered.

    Raises:
        FileNotFoundError: If a shard file is missing.
        ValueError: If a ``' time'`` value does not match
            ``'%Y-%m-%d %H:%M:%S'``, or if no shard was read.
    """
    import os  # local import: only this loader needs it

    frames = []
    for i in range(num_files):
        path = os.path.join(data_dir, 'koreaherald_1517_{}.json'.format(i))
        # JSON is UTF-8 by specification; do not rely on the locale default.
        with open(path, 'r', encoding='utf-8') as f:
            data = json.load(f)

        years = {}
        timestamps = {}
        for doc_id in data[' body']:
            published = datetime.datetime.strptime(
                data[' time'][doc_id], '%Y-%m-%d %H:%M:%S')
            years[doc_id] = published.year
            timestamps[doc_id] = (published.month * 30 + published.day) / 100.0
        data['year'] = years
        data['timestamp'] = timestamps

        frames.append(pd.DataFrame.from_dict(data))
    return pd.concat(frames)
def chunking(text):
    """Extract noun-phrase chunks from *text* as underscore-joined strings.

    Each sentence is word-tokenized, non-alphabetic tokens are discarded,
    the rest are POS-tagged, and a regexp chunker groups an optional
    determiner, an optional adjective and any run of nouns into an ``NP``
    subtree. Stop words are not filtered here.

    Args:
        text: Raw document text.

    Returns:
        A list with one ``word_word_...`` string per ``NP`` subtree, in the
        order the subtrees are encountered.
    """
    np_parser = nltk.RegexpParser("NP: {<DT>?<JJ>?<NN.*>*}")
    phrases = []
    for sentence in nltk.sent_tokenize(text):
        tokens = [tok for tok in nltk.word_tokenize(sentence) if tok.isalpha()]
        parsed = np_parser.parse(nltk.pos_tag(tokens))
        # Every leaf is a (word, tag) pair; only the word goes into the chunk.
        phrases.extend(
            '_'.join(leaf[0] for leaf in subtree.leaves())
            for subtree in parsed.subtrees()
            if subtree.label() == "NP"
        )
    return phrases
def tokenize_and_stem(text):
    """Tokenize *text* and return the stems of its content words.

    The text is split into sentences and then into word tokens. Tokens that
    are not purely alphabetic are dropped, English stop words are removed,
    and the remaining tokens are stemmed with the module-level ``stemmer``.

    The stop-word test is case-insensitive: the stop-word list is lowercase,
    so capitalised stop words such as "The" would otherwise pass the filter.
    For input that is already lowercase the result is unchanged.

    Args:
        text: Raw document text.

    Returns:
        A list of stems in document order.
    """
    # Build a set once per call: O(1) membership instead of a list scan per token.
    stop_words = set(stopwords)
    stems = []
    for sent in nltk.sent_tokenize(text):
        for word in nltk.word_tokenize(sent):
            if word.isalpha() and word.lower() not in stop_words:
                stems.append(stemmer.stem(word))
    return stems
def clustering(year, n_clusters=10, top_n=5, random_state=None):
    """Cluster one year's articles with K-means and print representatives.

    The articles returned by ``data_load()`` are restricted to *year*,
    turned into one feature matrix (TF-IDF of title and body, one-hot
    author and section, raw ``timestamp``) and clustered with K-means.
    For every cluster the ``top_n`` documents with the smallest distance to
    that cluster's centre are printed; the ranking runs over all documents,
    not only the cluster's own members.

    Args:
        year: Calendar year to keep (compared against the ``year`` column).
        n_clusters: Number of K-means clusters.
        top_n: How many nearest documents to print per cluster.
        random_state: Seed forwarded to ``KMeans``; ``None`` keeps the
            original unseeded (non-reproducible) behaviour.

    Returns:
        None. Results are written to stdout.
    """
    from sklearn.cluster import KMeans

    class TextSelector(BaseEstimator, TransformerMixin):
        """Pipeline step that selects one column as a 1-D object."""

        def __init__(self, key):
            self.key = key

        def fit(self, x, y=None):
            return self

        def transform(self, data_dict):
            return data_dict[self.key]

    class NumberSelector(BaseEstimator, TransformerMixin):
        """Pipeline step that selects one column as a 2-D single-column frame."""

        def __init__(self, key):
            self.key = key

        def fit(self, x, y=None):
            return self

        def transform(self, data_dict):
            return data_dict[[self.key]]

    df = data_load()
    # Copy the filtered rows so the column assignments below act on an
    # independent frame rather than on a slice of the loaded data.
    df = df[df.year == year].copy()

    # Replace the categorical strings with integer codes for the one-hot steps.
    for column in (' author', ' section'):
        df[column] = LabelEncoder().fit_transform(df[column])

    vectorizer = FeatureUnion(
        transformer_list=[
            ('title', Pipeline([
                ('selector', TextSelector(key='title')),
                ('tfidf', TfidfVectorizer(min_df=0.1, tokenizer=tokenize_and_stem, ngram_range=(1, 2)))
            ])),
            ('body', Pipeline([
                ('selector', TextSelector(key=' body')),
                ('tfidf', TfidfVectorizer(min_df=0.2, tokenizer=tokenize_and_stem, ngram_range=(1, 2)))
            ])),
            ('author', Pipeline([
                ('selector', NumberSelector(key=' author')),
                ('onehot', OneHotEncoder(categories='auto'))
            ])),
            ('section', Pipeline([
                ('selector', NumberSelector(key=' section')),
                ('onehot', OneHotEncoder(categories='auto'))
            ])),
            ('timestamp', Pipeline([
                ('selector', NumberSelector(key='timestamp')),
            ])),
        ],
        # Relative weight of each component in the combined feature matrix.
        transformer_weights={
            'section': 3.0,
            'title': 1.0,
            'body': 1.0,
            'timestamp': 0.5,
            'author': 0.3,
        },
    )
    X = vectorizer.fit_transform(df)

    model = KMeans(n_clusters=n_clusters, init='k-means++', max_iter=100,
                   n_init=1, random_state=random_state)
    model.fit(X)
    df['cluster'] = model.labels_

    closest, _ = pairwise_distances_argmin_min(model.cluster_centers_, X)
    print('closest :', closest)

    # Distances to every centre are the same for each cluster, so compute
    # them once instead of once per loop iteration.
    distances = model.transform(X)
    titles = df['title']
    for c in range(n_clusters):
        print('cluster {}'.format(c))
        # A stable sort keeps the lower row position first on ties.
        nearest = np.argsort(distances[:, c], kind='stable')[:top_n]
        for doc_id in nearest:
            doc_id = int(doc_id)
            print(doc_id, ', title :', titles.iloc[doc_id])
# Script entry point: when executed directly, cluster the 2017 articles.
if __name__ == "__main__":
    clustering(year=2017)
@bowbowbow
Copy link
Author

cluster 0
title: Hwang urges Japan to show sincere remorse over shared history
cluster 1
title: S. Korea likely to propose talks to NK following Moon's peace gesture
cluster 2
title: Authorities deny allegation of mistreatment of jailed ex-president
cluster 3
title: Conservatives jostle for position to challenge frontrunner Moon
cluster 4
title: [Graphic News] Salvage operation procedure
cluster 5
title: Ruling party renews calls for main opposition to lift parliamentary boycott
cluster 6
title: Acting president renews vow to carry out THAAD deployment
cluster 7
title: Russian envoy calls for restraint amid tensions on Korean Peninsula
cluster 8
title: Allies to conduct military drills similar to last year‘s: military
cluster 9
title: PyeongChang Games may help mend soured Sino-Korean ties: experts

@bowbowbow
Copy link
Author

cluster 0
6108 , title : [Graphic News] Salvage operation procedure
6219 , title : [Graphic News] Former President Park Geun-hye‘s grilling by the prosecution
6603 , title : [Graphic News] Four possible scenarios
3632 , title : [URGENT] Choi Soon-sil gets 3-year prison term for college entrance irregularities
4876 , title : Gwanghwamun rises as center of democracy
cluster 1
3829 , title : NK says int'l sanctions hinder delivery of aid to Pyongyang
3068 , title : No evidence of money from Kaesong complex being used for NK nukes: Seoul official
5704 , title : Growing signs of NK missile test deal blow to engagement approach
5924 , title : Seoul views inter-Korean dialogue differently from NK nuke talks: official
3605 , title : Coordination over Seoul, Washington's NK policy key task for allies
cluster 2
5145 , title : [Graphic News] Election in numbers
3970 , title : [News Focus] Moon's extra budget speech fails to win over opposition
3930 , title : Opposition cries foul on Kim appointment
8563 , title : Legal battles crowd presidential race
4813 , title : [Newsmaker] Moon Jae-in names special envoys, adds Europe to list
cluster 3
2040 , title : Talk of tactical nuclear weapons resurfaces
3866 , title : Allies to conduct military drills similar to last year‘s: military
5344 , title : Moon Jae-in urges ‘complete’ overhaul of military
3422 , title : S. Korea, US begin military drills amid N. Korea's threats
3789 , title : JCS chief nominee says no plan to scale back Korea-US military drill
cluster 4
6656 , title : China's top diplomat renews strong objection to THAAD deployment
8474 , title : Hwang urges Japan to show sincere remorse over shared history
8546 , title : [Newsmaker] Chinese envoy’s anti-THAAD campaign double-edged sword
8785 , title : Recalled Japanese envoy to return to Seoul
8601 , title : Japan envoy meets vice foreign minister amid girl statue controversy
cluster 5
3349 , title : Moon, Xi likely to meet in Germany this week to discuss THAAD controversy
5092 , title : Daunting diplomatic challenges with Trump, China await next S.Korean president
3397 , title : Moon's efforts for inter-Korean ties to gain pace with Trump's support
3350 , title : Russia hopes Korean president to visit the country in Sept.
5070 , title : THAAD deployment could raise cost-sharing questions: CRS report
cluster 6
6334 , title : Tillerson to visit DMZ as part of Korean tour
8940 , title : Russian envoy calls for restraint amid tensions on Korean Peninsula
8871 , title : Trump to host China's Xi at Florida resort next week
8703 , title : Secondary sanctions 'early topic' for Trump-Xi summit: senior White House official
4929 , title : Reopening of Kaesong complex requires UN sanctions-related review: official
cluster 7
2456 , title : [Graphic News] A day in Seoul in numbers
555 , title : Top 10 national news stories
3632 , title : [URGENT] Choi Soon-sil gets 3-year prison term for college entrance irregularities
663 , title : [Frome the Scene] Pilot ‘right to die’ program gets warm reception
1402 , title : [Feature] Gosiwon, modern time refuge for house poor
cluster 8
5440 , title : N. Korea may carry out more provocations despite UN resolutions: experts
5388 , title : N. Korea threatens 'physical' actions over new UN sanctions
3032 , title : N. Korea says another UNSC resolution will trigger corresponding measures
5959 , title : Japan says this is a time to raise pressure on North Korea
5315 , title : Trump, North Korea trade escalating threats of fire
cluster 9
259 , title : Former sex slave, ‘Dokdo Shrimp’ at Trump dinner spark new Korea-Japan row
162 , title : Korean FM to visit China to discuss preparations for summit
2314 , title : S. Korean lawmaker raises expectations that Moon, Abe will resume 'shuttle diplomacy'
130 , title : Korean, Philippine leaders agree to improve ties, better protect nationals
2503 , title : Turkish PM promises to support Korean companies as if they're domestic firms

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment