Skip to content

Instantly share code, notes, and snippets.

@bowbowbow
Created December 2, 2018 14:31
Show Gist options
  • Select an option

  • Save bowbowbow/ac73a5e01f91a47600816d590504a2c1 to your computer and use it in GitHub Desktop.

Select an option

Save bowbowbow/ac73a5e01f91a47600816d590504a2c1 to your computer and use it in GitHub Desktop.
import json, re, datetime
import pandas as pd
import numpy as np
import nltk
from nltk.stem.snowball import SnowballStemmer
# English stop-word list consulted by tokenize_and_stem; read from the NLTK
# corpus once, when this module is imported.
stopwords = nltk.corpus.stopwords.words('english')
# Single English Snowball stemmer instance shared by every tokenize_and_stem call.
stemmer = SnowballStemmer("english")
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import pairwise_distances_argmin_min, pairwise
def data_load(data_dir='./data', n_files=8):
    """Load the Korea Herald JSON dumps into one DataFrame.

    Reads ``<data_dir>/koreaherald_1517_<i>.json`` for every ``i`` in
    ``range(n_files)``. Each file is a column-oriented mapping
    (``{column: {doc_id: value}}``); the source columns are
    ``' author'``, ``' body'``, ``' description'``, ``' section'``,
    ``' time'`` and ``'title'`` (note the leading spaces in most names).

    Two derived columns are added per document before the frame is built:

    * ``year``      -- the year parsed from ``' time'``.
    * ``timestamp`` -- ``(month * 30 + day) / 100.0``, a coarse
      position-within-the-year value.

    Args:
        data_dir: Directory that holds the JSON files.
        n_files: Number of consecutively numbered files to read, from 0.

    Returns:
        The per-file frames concatenated in file order. Row labels are the
        document ids of each file and are not renumbered.
    """
    frames = []
    for i in range(n_files):
        path = '{}/koreaherald_1517_{}.json'.format(data_dir, i)
        # JSON text is UTF-8; do not depend on the platform's default encoding.
        with open(path, 'r', encoding='utf-8') as f:
            data = json.load(f)

        data['year'] = {}
        data['timestamp'] = {}
        for doc_id in data[' body']:
            published = datetime.datetime.strptime(
                data[' time'][doc_id], '%Y-%m-%d %H:%M:%S')
            data['year'][doc_id] = published.year
            data['timestamp'][doc_id] = (
                published.month * 30 + published.day) / 100.0

        frames.append(pd.DataFrame.from_dict(data))
    return pd.concat(frames)
def chunking(text):
    """Extract noun-phrase chunks from *text*.

    Every sentence is word-tokenized, tokens that are not purely alphabetic
    are discarded, the rest are POS-tagged and parsed with the grammar
    ``NP: {<DT>?<JJ>?<NN.*>*}``. The words of each ``NP`` subtree are joined
    with ``'_'``.

    Returns:
        A list of underscore-joined chunk strings, in document order.
    """
    np_parser = nltk.RegexpParser("NP: {<DT>?<JJ>?<NN.*>*}")
    phrases = []
    for sentence in nltk.sent_tokenize(text):
        tokens = [tok for tok in nltk.word_tokenize(sentence) if tok.isalpha()]
        parsed = np_parser.parse(nltk.pos_tag(tokens))
        for subtree in parsed.subtrees():
            if subtree.label() != "NP":
                continue
            # Leaves are (word, tag) pairs; only the word goes into the chunk.
            phrases.append('_'.join(leaf[0] for leaf in subtree.leaves()))
    return phrases
def get_proppers(text):
    """Return the proper nouns of *text*.

    Sentences are word-tokenized, non-alphabetic tokens are dropped, and the
    remaining words are POS-tagged; words tagged ``NNP`` or ``NNPS`` are kept.

    Returns:
        A list of words in document order (duplicates preserved).
    """
    proper_tags = ('NNP', 'NNPS')
    found = []
    for sentence in nltk.sent_tokenize(text):
        tokens = [tok for tok in nltk.word_tokenize(sentence) if tok.isalpha()]
        found.extend(
            tok for tok, tag in nltk.pos_tag(tokens) if tag in proper_tags
        )
    return found
def tokenize_and_stem(text):
    """Tokenize *text*, drop stop words and return the stemmed tokens.

    Each sentence is word-tokenized; only purely alphabetic tokens that are
    not English stop words are kept, and each survivor is stemmed with the
    module-level Snowball ``stemmer``.

    The stop-word comparison is done on the lower-cased token. The stop-word
    list is lower-case and the Snowball stemmer lower-cases its output, so a
    case-sensitive check would let capitalised stop words ("The", "And")
    through as "the" / "and". For text that is already lower-case the result
    is unchanged.

    Returns:
        A list of stems in document order.
    """
    # Set membership instead of scanning the stop-word list for every token.
    stop_set = set(stopwords)
    stems = []
    for sent in nltk.sent_tokenize(text):
        for word in nltk.word_tokenize(sent):
            if word.isalpha() and word.lower() not in stop_set:
                stems.append(stemmer.stem(word))
    return stems
class TextSelector(BaseEstimator, TransformerMixin):
    """Pipeline step that selects one column as a 1-D object.

    ``transform`` returns ``data_dict[key]``, which is the input form a text
    vectorizer consumes.
    """

    def __init__(self, key):
        self.key = key

    def fit(self, x, y=None):
        # Stateless: nothing to learn.
        return self

    def transform(self, data_dict):
        return data_dict[self.key]


class NumberSelector(BaseEstimator, TransformerMixin):
    """Pipeline step that selects one column but keeps it 2-D.

    ``transform`` returns ``data_dict[[key]]`` (a single-column frame), the
    form encoders and FeatureUnion stacking operate on.
    """

    def __init__(self, key):
        self.key = key

    def fit(self, x, y=None):
        # Stateless: nothing to learn.
        return self

    def transform(self, data_dict):
        return data_dict[[self.key]]


def clustering(year, n_clusters=10, top_n=5, random_state=None):
    """Cluster one year of articles with k-means and print each cluster's
    most central titles.

    Features (weights in parentheses) are combined with a FeatureUnion:
    TF-IDF over proper nouns of the body (2.0), TF-IDF over stemmed title
    tokens (1.0), TF-IDF over stemmed body tokens (1.0), one-hot section
    (3.0), one-hot author (0.3) and the numeric ``timestamp`` column (0.5).

    Args:
        year: Only rows whose ``year`` column equals this value are used.
        n_clusters: Number of k-means clusters.
        top_n: How many of the closest documents to print per cluster.
        random_state: Forwarded to ``KMeans``; pass an int for repeatable runs.

    Returns:
        The filtered DataFrame with an added ``cluster`` label column.

    Raises:
        ValueError: If no document belongs to ``year``.
    """
    from sklearn.cluster import KMeans

    df = data_load()
    # Copy the filtered rows so the column assignments below write to an
    # independent frame rather than to a slice of the concatenated data.
    df = df[df.year == year].copy()
    if df.empty:
        raise ValueError('no documents found for year {}'.format(year))

    # Replace the categorical string columns by integer codes; the one-hot
    # encoders in the pipeline work on these codes.
    for column in (' author', ' section'):
        encoder = LabelEncoder()
        encoder.fit(df[column])
        df[column] = encoder.transform(df[column])

    vectorizer = FeatureUnion(
        transformer_list=[
            ('proppers', Pipeline([
                ('selector', TextSelector(key=' body')),
                # lowercase=False: TfidfVectorizer lower-cases the document
                # before it reaches a custom tokenizer by default, which
                # would strip the capitalisation get_proppers' POS tagger
                # needs to recognise proper nouns.
                ('tfidf', TfidfVectorizer(tokenizer=get_proppers,
                                          ngram_range=(1, 2),
                                          lowercase=False)),
            ])),
            # NOTE(review): min_df keeps only terms that occur in at least
            # that fraction of documents (10% of titles / 20% of bodies).
            # The author's follow-up note reads "fix max_df" -- confirm
            # whether max_df was the intended parameter here.
            ('title', Pipeline([
                ('selector', TextSelector(key='title')),
                ('tfidf', TfidfVectorizer(min_df=0.1,
                                          tokenizer=tokenize_and_stem,
                                          ngram_range=(1, 2))),
            ])),
            ('body', Pipeline([
                ('selector', TextSelector(key=' body')),
                ('tfidf', TfidfVectorizer(min_df=0.2,
                                          tokenizer=tokenize_and_stem,
                                          ngram_range=(1, 2))),
            ])),
            ('author', Pipeline([
                ('selector', NumberSelector(key=' author')),
                ('onehot', OneHotEncoder(categories='auto')),
            ])),
            ('section', Pipeline([
                ('selector', NumberSelector(key=' section')),
                ('onehot', OneHotEncoder(categories='auto')),
            ])),
            ('timestamp', Pipeline([
                ('selector', NumberSelector(key='timestamp')),
            ])),
        ],
        # weight components in FeatureUnion
        transformer_weights={
            'section': 3.0,
            'proppers': 2.0,
            'title': 1.0,
            'body': 1.0,
            'timestamp': 0.5,
            'author': 0.3,
        },
    )
    X = vectorizer.fit_transform(df)

    model = KMeans(n_clusters=n_clusters, init='k-means++', max_iter=100,
                   n_init=1, random_state=random_state)
    model.fit(X)
    df['cluster'] = model.labels_

    closest, _ = pairwise_distances_argmin_min(model.cluster_centers_, X)
    print('closest :', closest)

    # Distance of every document to every centre, computed once instead of
    # once per cluster.
    distances = model.transform(X)
    for c in range(n_clusters):
        print('cluster {}'.format(c))
        column = distances[:, c]
        # Positional row numbers ordered by increasing distance to centre c.
        nearest = sorted(range(len(column)), key=lambda i: column[i])[:top_n]
        for doc_id in nearest:
            print(doc_id, ', title :', df.iloc[doc_id]['title'])
    return df
# Script entry point: cluster the 2017 articles when run directly.
if "__main__" == __name__:
    clustering(year=2017)
@bowbowbow
Copy link
Author

bowbowbow commented Dec 2, 2018

fix max_df

2017
cluster 0
8447 , title : NK 84% likely to conduct nuclear or missile tests in next 30 days: CSIS
7921 , title : UN Security Council unanimously adopts statement condemning NK missile launch
8599 , title : Rocket engine test puts NK closer to launching ICBM, satellite: report
8577 , title : NK vows to take 'toughest' military actions as US sends aircraft carrier
8600 , title : N. Korea may conduct nuke test before launch of new gov't in Seoul: expert
cluster 1
6219 , title : [Graphic News] Former President Park Geun-hye‘s grilling by the prosecution
6108 , title : [Graphic News] Salvage operation procedure
6603 , title : [Graphic News] Four possible scenarios
9109 , title : 70% of Korea’s fine dust particles come from China: study
8540 , title : Sewol ferry moved on land, search for missing victims begins
cluster 2
5145 , title : [Graphic News] Election in numbers
7843 , title : Presidential front-runner Moon widens gap with runner-up: poll
8282 , title : Hwang calls for national unity amid NK threats
4521 , title : Cheong Wa Dae names two deputy advisers for national security
8350 , title : Ahn gives up parliamentary seat ahead of presidential election
cluster 3
2456 , title : [Graphic News] A day in Seoul in numbers
555 , title : Top 10 national news stories
459 , title : Prosecutors seek arrest warrants for two Park aides over bribery
886 , title : Former NIS official arrested in probe of political interference
839 , title : Prosecution requests arrest warrants for 3 ex-NIS officials
cluster 4
1435 , title : Top diplomats of S. Korea, US discuss Kim Jong-un's threat of highest-level action
1043 , title : Senior diplomats of S. Korea, US to discuss NK nuke issue
1007 , title : Senior diplomats of S. Korea, US to discuss NK nuke issue
1450 , title : Kim Jong-un warns Trump will pay dearly for threat
1598 , title : Tillerson: US seeks peaceful solution to NK issue
cluster 5
3043 , title : Moon's approval rating slightly slips but still at record high level
5437 , title : Moon's approval rating slips to 72.5%
3230 , title : Moon's rating further gains amid growing N. Korea tension
3827 , title : Conservative parties quicken preparations for leadership elections
3670 , title : Parties agree to normalize parliamentary sessions
cluster 6
4117 , title : Korea to take more responsibility for dementia patients
4218 , title : 4 in 10 older workers want post-retirement jobs: poll
4347 , title : Number of China-bound travelers drops 42% on THAAD spat: data
4428 , title : Suspected vessel hijacking false alarm, crew confirmed safe
4303 , title : Poll shows 60% of Koreans negative about China's influence
cluster 7
7810 , title : Nuclear envoys of S. Korea, US, Japan to meet next week to discuss NK issue
8415 , title : Top diplomats of Korea and Angola to meet next week to discuss cooperation
8225 , title : S. Korea's top diplomat to attend UNSC meeting on NK issue
8076 , title : Top diplomats of Korea, Tanzania to hold talks in Seoul next week
8245 , title : Vice unification minister meets with Japan's top envoy over NK issue
cluster 8
7812 , title : [Graphic News] US forces in South Korea
8084 , title : [Graphic News] Military strength of the two Koreas
5926 , title : US naval commander to get Korean name
3669 , title : Gen. Brooks reaffirms S. Korea-U.S. alliance will not waver in future
4378 , title : New S. Korean Air Force chief stresses readiness to counter NK threats
cluster 9
5145 , title : [Graphic News] Election in numbers
3980 , title : Korea calls for Japan to be careful in making comments on comfort women issue
3551 , title : Japanese diplomat under fire for calling sexual slavery victims 'paid prostitutes'
3523 , title : Korea denounces Japan diplomat's disparaging remarks on sex slavery victims
3939 , title : Korea concludes sex slavery victims still have individual rights to sue Japan despite gov't deal

2016
cluster 0
1840 , title : Hungry N. Korean soldiers committing various crimes: report
5621 , title : Bulgaria reaffirms support for denuclearization, vows to fully implement N. Korea sanctions
534 , title : NK resumes encrypted numbers broadcast after 2-week hiatus
293 , title : NK resumes encrypted numbers broadcast after 9-day break
479 , title : NK spy agency official defected to S. Korea last year: source
cluster 1
5171 , title : Labor union leader gets five-year jail term for violent rallies
4901 , title : Korea to toughen monitoring on Zika virus
4927 , title : Korea reports first Zika virus-infected patient
4679 , title : Court rules ban on voluntary prostitution constitutional
4712 , title : [EXCLUSIVE] Korea’s justice system fails foreign victims of rape
cluster 2
3173 , title : Defense ministry counters health, diplomatic concerns over THAAD deployment
5752 , title : Defense ministry demands 5.3% increase in defense budget for 2017
3295 , title : Korea to announce site for deploying THAAD: defense ministry
5626 , title : Senate defense budget bill fails to include amendment calling for THAAD deployment in S. Korea
3939 , title : Park strongly defends THAAD deployment decision
cluster 3
5693 , title : U.N. chief remains ahead of other presidential hopefuls: survey
101 , title : ‘Park should own up’
2368 , title : [Graphic News] What Park should do?
1988 , title : Cheong Wa Dae probes aide’s scandal
1760 , title : Saenuri starts leadership vote amid factional divide
cluster 4
5621 , title : Bulgaria reaffirms support for denuclearization, vows to fully implement N. Korea sanctions
1840 , title : Hungry N. Korean soldiers committing various crimes: report
5498 , title : U.S. 'strongly condemns' N. Korea's missile launches
5276 , title : N.K. leader given new state title at key parliamentary meeting
5500 , title : S. Korea condemns N. Korea's launches of 2 mid-range missiles
cluster 5
5728 , title : [HERALD INTERVIEW] ‘Long-term engagement key to investing in Iran’
5719 , title : 'Gateway to Korea' opens for EU firms
3063 , title : Azerbaijan celebrates independence, ties with Korea
5560 , title : Azerbaijan, Korea to cooperate on infrastructure, energy
5718 , title : ASEAN promotes ecotourism at seminar
cluster 6
5621 , title : Bulgaria reaffirms support for denuclearization, vows to fully implement N. Korea sanctions
1840 , title : Hungry N. Korean soldiers committing various crimes: report
7330 , title : N.K. leader orders more nuke tests, readiness for nuclear attacks
6582 , title : South Korea to unveil its own sanctions on North Korea this week
7016 , title : North Korea must pay 'necessary price' for nuke test, rocket launch: China FM
cluster 7
5731 , title : Chinese fishing boats leave neutral waters after crackdown
3059 , title : Trump renews calls for allies to pay up for U.S. protection
3424 , title : Republican lawmaker calls for Trump to refine Asian policy
3058 , title : Trump raps Obama for visiting Hiroshima visit without noting Japan's aggression
5867 , title : Financial institutions should choose either doing business with North Korea or U.S.: Royce
cluster 8
4251 , title : Only permanent residents to be able to apply for naturalization in S. Korea
2093 , title : Court again rejects Korean A-bomb victims' suit against gov't
3602 , title : Korea to control e-cigarettes in antismoking policy
3632 , title : No. of visitors on Jeju tops 5 mln
2118 , title : Survey shows signs of improving S. Korea-Japan relations
cluster 9
5171 , title : Labor union leader gets five-year jail term for violent rallies
699 , title : Senior prosecutor arrested over bribery allegations
696 , title : SNU professor gets 2-year jail term for fabricating Oxy reports
739 , title : Man gets one-year term for fake threat to blow up Incheon airport
966 , title : 8 Chinese tourists arrested for assaulting Korean restaurant owner

2015
cluster 0
2467 , title : Korean sexual slavery victim urges Obama to guide Abe onto right path
2386 , title : U.S. official due in Seoul over missile defense
2693 , title : World War II sex slave to sue Japan in U.S. court
2779 , title : S. Korea urges closer regional ties on energy security
2436 , title : China invites Koreas' militaries to parade marking end of WWII
cluster 1
4079 , title : All but one MERS-related patients move out of intensive care
1785 , title : S. Korea reports no additional MERS cases for 30th
4016 , title : MERS victims file lawsuits against gov't, hospitals
3343 , title : S. Korea‘s last MERS patient tests negative
3650 , title : Child abuse claims 12 lives, over 5,000 victims in H1: report
cluster 2
7136 , title : Opposition party unveils own proposal for pension reform
4397 , title : Park calls for compromise on labor, pension reforms
6670 , title : Ruling party chief willing to cooperate with corruption scandal probe
6991 , title : Park renews calls for reform of labor market, civil service pensions
6692 , title : Park orders thorough probe into bribery scandal
cluster 3
1540 , title : Exhibition to showcase Danish green solutions
1398 , title : Exhibition shows Danish green tech
1541 , title : Pakistan marks independence with eye on global affairs
1395 , title : Czech Republic, Korea fete silver jubilee through architecture
1397 , title : ASEAN, Korean students plant seeds of sustainable forestry
cluster 4
4451 , title : Ex-Navy chief arrested over alleged graft
4667 , title : Prosecutors raid POSCO E&C over alleged slush funds
7116 , title : Actor Lee Byung-hun's blackmailers get suspended jail terms
4392 , title : Families of ferry disaster victims call for vessel recovery
4563 , title : Subcontractors of POSCO E&C raided in slush fund probe
cluster 5
6224 , title : [Graphic News] Comparison of two Koreas’ military strength
2442 , title : U.S. Army holds public hearings on proposal to permanently deploy THAAD to Guam
2311 , title : Price, favorable terms to be basics of S. Korea's defense
2315 , title : Military officer accused of leaking military info to China
2152 , title : U.S. approves foreign military sale for S. Korea's KF-16 upgrade project for $2.5 bln
cluster 6
2948 , title : S. Korea, Japan to meet over wartime sex slaves
3004 , title : Japanese scholars urge Abe to offer apology for history
2753 , title : FM Yun set for fence-mending trip to Japan
3016 , title : S. Korea, Japan to hold talks on world heritage
2619 , title : Any solution to sex slave issue should satisfy victims: FM Yun
cluster 7
2952 , title : MERS feared to dent consumption, growth: foreign investors
2971 , title : Supporters of sexual minorities hold event in Seoul
3111 , title : Experts say S. Korea can contain MERS with quarantine
3096 , title : S. Korea forms MERS task force on national image
3472 , title : Unemployment among youth with no job experience hits 12-year high
cluster 8
2439 , title : Drought-hit North Korea seeks aid from ally Iran
2516 , title : N. Korea slams S. Korea over U.N. human rights office
2591 , title : U.S. hails opening of U.N. human rights office in Seoul
2797 , title : China urges easing of tension as N. Korea tests missiles
2732 , title : N.K. nuclear reactor running at low levels or not at all: 38 North
cluster 9
1222 , title : Park's approval rating edges down amid history textbook row
1186 , title : Park's approval rating edges down amid history textbook row
3770 , title : Park's approval rating edges down amid factional feud
2777 , title : Park's approval rating edges up as factional feud subsides
1356 , title : Park's approval rating edges up after U.S. visit

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment