Created
December 2, 2018 14:31
-
-
Save bowbowbow/ac73a5e01f91a47600816d590504a2c1 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import json, re, datetime | |
| import pandas as pd | |
| import numpy as np | |
| import nltk | |
| from nltk.stem.snowball import SnowballStemmer | |
# English stop-word list from the NLTK corpus; tokenize_and_stem() filters
# tokens against it. NOTE: needs the NLTK "stopwords" corpus to be installed
# (e.g. via nltk.download('stopwords')).
stopwords = nltk.corpus.stopwords.words('english')
# Snowball stemmer for English, shared by every tokenize_and_stem() call.
stemmer = SnowballStemmer("english")
| from sklearn.base import BaseEstimator, TransformerMixin | |
| from sklearn.pipeline import FeatureUnion, Pipeline | |
| from sklearn.preprocessing import OneHotEncoder, LabelEncoder | |
| from sklearn.feature_extraction.text import TfidfVectorizer | |
| from sklearn.metrics import pairwise_distances_argmin_min, pairwise | |
def data_load(data_dir='./data', num_files=8):
    """Load the Korea Herald JSON shards into a single DataFrame.

    Reads ``<data_dir>/koreaherald_1517_<i>.json`` for every ``i`` in
    ``range(num_files)``. Each shard is a column-oriented mapping
    (column name -> {doc_id -> value}). Note that most column names carry a
    leading space: ``' author'``, ``' body'``, ``' description'``,
    ``' section'``, ``' time'`` and ``'title'``.

    Two derived columns are added for every document found in ``' body'``:

    * ``year``      -- calendar year parsed from ``' time'``.
    * ``timestamp`` -- ``(month * 30 + day) / 100.0``, a coarse
      position-in-year feature (not an exact day-of-year).

    Args:
        data_dir: Directory containing the shard files.
        num_files: Number of consecutive shards (starting at 0) to read.

    Returns:
        The per-shard frames concatenated with ``pd.concat``; the original
        doc-id index of each shard is kept as-is.

    Raises:
        OSError: A shard file cannot be opened.
        KeyError: A shard lacks the ``' body'`` or ``' time'`` column.
        ValueError: A time string does not match ``%Y-%m-%d %H:%M:%S``, or
            ``num_files`` is 0 (nothing to concatenate).
    """
    frames = []
    for i in range(num_files):
        path = '{}/koreaherald_1517_{}.json'.format(data_dir, i)
        # JSON is UTF-8 by specification and the headlines contain non-ASCII
        # characters, so do not depend on the platform's default encoding.
        with open(path, 'r', encoding='utf-8') as f:
            data = json.load(f)

        years = {}
        timestamps = {}
        for doc_id in data[' body']:
            published = datetime.datetime.strptime(
                data[' time'][doc_id], '%Y-%m-%d %H:%M:%S')
            years[doc_id] = published.year
            timestamps[doc_id] = (published.month * 30 + published.day) / 100.0
        data['year'] = years
        data['timestamp'] = timestamps

        frames.append(pd.DataFrame.from_dict(data))
    return pd.concat(frames)
def chunking(text):
    """Return the noun-phrase chunks of *text* as underscore-joined strings.

    Each sentence is tokenized, reduced to purely alphabetic tokens,
    POS-tagged, and parsed with a one-rule grammar: optional determiner,
    optional adjective, then any number of noun tags. The words of every
    resulting ``NP`` subtree are joined with ``'_'``.

    NOTE(review): every element of the grammar is optional, so a lone
    determiner or adjective can apparently form a chunk on its own --
    confirm whether at least one noun should be required.
    """
    grammar_parser = nltk.RegexpParser("NP: {<DT>?<JJ>?<NN.*>*}")
    found = []
    for sentence in nltk.sent_tokenize(text):
        tokens = [tok for tok in nltk.word_tokenize(sentence) if tok.isalpha()]
        parsed = grammar_parser.parse(nltk.pos_tag(tokens))
        for subtree in parsed.subtrees():
            if subtree.label() != "NP":
                continue
            # Leaves are (word, tag) pairs; keep only the word.
            found.append('_'.join(leaf[0] for leaf in subtree.leaves()))
    return found
def get_proppers(text):
    """Return the words of *text* that NLTK tags as proper nouns.

    Sentences are tokenized, non-alphabetic tokens are dropped, and the
    remaining tokens are POS-tagged; words tagged ``NNP`` or ``NNPS`` are
    collected in document order (duplicates are kept).

    NOTE(review): the tagger presumably relies on capitalisation to spot
    proper nouns, so callers should pass original-case text -- confirm for
    any caller that lowercases its input first.
    """
    proper_tags = ('NNP', 'NNPS')
    collected = []
    for sentence in nltk.sent_tokenize(text):
        alpha_tokens = [tok for tok in nltk.word_tokenize(sentence) if tok.isalpha()]
        tagged = nltk.pos_tag(alpha_tokens)
        collected.extend(tok for tok, tag in tagged if tag in proper_tags)
    return collected
def tokenize_and_stem(text):
    """Tokenize *text*, drop stop words, and return the stemmed tokens.

    Sentences are split with NLTK, each sentence is word-tokenized, and only
    purely alphabetic tokens are kept. Tokens found in the module-level
    ``stopwords`` list are discarded; the rest are passed through the
    module-level Snowball ``stemmer``.

    Args:
        text: Raw document text.

    Returns:
        List of stems in document order (duplicates are kept).
    """
    # The stop-word list is lowercase, so compare case-insensitively;
    # otherwise capitalised stop words ("The", "In", ...) slip through.
    # A set also makes each membership test O(1) instead of a list scan.
    stop_set = set(stopwords)
    stems = []
    for sent in nltk.sent_tokenize(text):
        stems.extend(
            stemmer.stem(word)
            for word in nltk.word_tokenize(sent)
            if word.isalpha() and word.lower() not in stop_set
        )
    return stems
class TextSelector(BaseEstimator, TransformerMixin):
    """Pipeline step that selects one column as a 1-D sequence of texts."""

    def __init__(self, key):
        self.key = key

    def fit(self, x, y=None):
        # Stateless: nothing to learn.
        return self

    def transform(self, data_dict):
        return data_dict[self.key]


class NumberSelector(BaseEstimator, TransformerMixin):
    """Pipeline step that selects one column as a 2-D, single-column frame."""

    def __init__(self, key):
        self.key = key

    def fit(self, x, y=None):
        # Stateless: nothing to learn.
        return self

    def transform(self, data_dict):
        # Double brackets keep the result two-dimensional.
        return data_dict[[self.key]]


def _build_vectorizer():
    """Build the weighted FeatureUnion that turns article rows into features."""
    return FeatureUnion(
        transformer_list=[
            ('proppers', Pipeline([
                ('selector', TextSelector(key=' body')),
                # lowercase=False: TfidfVectorizer lowercases the text
                # *before* calling the tokenizer by default, which would
                # strip the capitalisation get_proppers depends on.
                ('tfidf', TfidfVectorizer(tokenizer=get_proppers,
                                          lowercase=False,
                                          ngram_range=(1, 2))),
            ])),
            ('title', Pipeline([
                ('selector', TextSelector(key='title')),
                ('tfidf', TfidfVectorizer(min_df=0.1,
                                          tokenizer=tokenize_and_stem,
                                          ngram_range=(1, 2))),
            ])),
            ('body', Pipeline([
                ('selector', TextSelector(key=' body')),
                ('tfidf', TfidfVectorizer(min_df=0.2,
                                          tokenizer=tokenize_and_stem,
                                          ngram_range=(1, 2))),
            ])),
            ('author', Pipeline([
                ('selector', NumberSelector(key=' author')),
                ('onehot', OneHotEncoder(categories='auto')),
            ])),
            ('section', Pipeline([
                ('selector', NumberSelector(key=' section')),
                ('onehot', OneHotEncoder(categories='auto')),
            ])),
            ('timestamp', Pipeline([
                ('selector', NumberSelector(key='timestamp')),
            ])),
        ],
        # weight components in FeatureUnion
        transformer_weights={
            'section': 3.0,
            'proppers': 2.0,
            'title': 1.0,
            'body': 1.0,
            'timestamp': 0.5,
            'author': 0.3,
        },
    )


def clustering(year, n_clusters=10, top_n=5, random_state=None):
    """Cluster one year of articles with k-means and print each cluster's core.

    Loads all articles via ``data_load()``, keeps those whose ``year`` column
    equals *year*, label-encodes the ``' author'`` and ``' section'``
    columns, vectorizes every row with the weighted FeatureUnion from
    ``_build_vectorizer()``, and fits k-means. It then prints the row
    closest to each centroid, followed, per cluster, by the ``top_n`` rows
    nearest to that cluster's centre with their titles.

    Args:
        year: Calendar year to cluster.
        n_clusters: Number of k-means clusters.
        top_n: How many nearest articles to print per cluster.
        random_state: Forwarded to ``KMeans``; set it for reproducible runs.

    Raises:
        ValueError: No article belongs to *year*.
    """
    from sklearn.cluster import KMeans

    df = data_load()
    # Copy the slice so the column assignments below act on an independent
    # frame instead of a view of the full data (SettingWithCopyWarning).
    df = df[df.year == year].copy()
    if df.empty:
        raise ValueError('no documents found for year {}'.format(year))

    for column in (' author', ' section'):
        df[column] = LabelEncoder().fit_transform(df[column])

    X = _build_vectorizer().fit_transform(df)

    model = KMeans(n_clusters=n_clusters, init='k-means++', max_iter=100,
                   n_init=1, random_state=random_state)
    model.fit(X)
    df['cluster'] = model.labels_

    closest, _ = pairwise_distances_argmin_min(model.cluster_centers_, X)
    print('closest :', closest)

    # Distance of every row to every centre, computed once rather than once
    # per cluster.
    distances = model.transform(X)
    for c in range(n_clusters):
        print('cluster {}'.format(c))
        # A stable sort keeps row order among equal distances.
        nearest = np.argsort(distances[:, c], kind='stable')[:top_n]
        for doc_id in nearest:
            # doc_id is a positional row number, hence iloc.
            print(doc_id, ', title :', df.iloc[doc_id]['title'])
# Script entry point: cluster the 2017 articles when the file is run directly.
if __name__ == '__main__':
    clustering(2017)
Author
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
fix max_df
2017
cluster 0
8447 , title : NK 84% likely to conduct nuclear or missile tests in next 30 days: CSIS
7921 , title : UN Security Council unanimously adopts statement condemning NK missile launch
8599 , title : Rocket engine test puts NK closer to launching ICBM, satellite: report
8577 , title : NK vows to take 'toughest' military actions as US sends aircraft carrier
8600 , title : N. Korea may conduct nuke test before launch of new gov't in Seoul: expert
cluster 1
6219 , title : [Graphic News] Former President Park Geun-hye‘s grilling by the prosecution
6108 , title : [Graphic News] Salvage operation procedure
6603 , title : [Graphic News] Four possible scenarios
9109 , title : 70% of Korea’s fine dust particles come from China: study
8540 , title : Sewol ferry moved on land, search for missing victims begins
cluster 2
5145 , title : [Graphic News] Election in numbers
7843 , title : Presidential front-runner Moon widens gap with runner-up: poll
8282 , title : Hwang calls for national unity amid NK threats
4521 , title : Cheong Wa Dae names two deputy advisers for national security
8350 , title : Ahn gives up parliamentary seat ahead of presidential election
cluster 3
2456 , title : [Graphic News] A day in Seoul in numbers
555 , title : Top 10 national news stories
459 , title : Prosecutors seek arrest warrants for two Park aides over bribery
886 , title : Former NIS official arrested in probe of political interference
839 , title : Prosecution requests arrest warrants for 3 ex-NIS officials
cluster 4
1435 , title : Top diplomats of S. Korea, US discuss Kim Jong-un's threat of highest-level action
1043 , title : Senior diplomats of S. Korea, US to discuss NK nuke issue
1007 , title : Senior diplomats of S. Korea, US to discuss NK nuke issue
1450 , title : Kim Jong-un warns Trump will pay dearly for threat
1598 , title : Tillerson: US seeks peaceful solution to NK issue
cluster 5
3043 , title : Moon's approval rating slightly slips but still at record high level
5437 , title : Moon's approval rating slips to 72.5%
3230 , title : Moon's rating further gains amid growing N. Korea tension
3827 , title : Conservative parties quicken preparations for leadership elections
3670 , title : Parties agree to normalize parliamentary sessions
cluster 6
4117 , title : Korea to take more responsibility for dementia patients
4218 , title : 4 in 10 older workers want post-retirement jobs: poll
4347 , title : Number of China-bound travelers drops 42% on THAAD spat: data
4428 , title : Suspected vessel hijacking false alarm, crew confirmed safe
4303 , title : Poll shows 60% of Koreans negative about China's influence
cluster 7
7810 , title : Nuclear envoys of S. Korea, US, Japan to meet next week to discuss NK issue
8415 , title : Top diplomats of Korea and Angola to meet next week to discuss cooperation
8225 , title : S. Korea's top diplomat to attend UNSC meeting on NK issue
8076 , title : Top diplomats of Korea, Tanzania to hold talks in Seoul next week
8245 , title : Vice unification minister meets with Japan's top envoy over NK issue
cluster 8
7812 , title : [Graphic News] US forces in South Korea
8084 , title : [Graphic News] Military strength of the two Koreas
5926 , title : US naval commander to get Korean name
3669 , title : Gen. Brooks reaffirms S. Korea-U.S. alliance will not waver in future
4378 , title : New S. Korean Air Force chief stresses readiness to counter NK threats
cluster 9
5145 , title : [Graphic News] Election in numbers
3980 , title : Korea calls for Japan to be careful in making comments on comfort women issue
3551 , title : Japanese diplomat under fire for calling sexual slavery victims 'paid prostitutes'
3523 , title : Korea denounces Japan diplomat's disparaging remarks on sex slavery victims
3939 , title : Korea concludes sex slavery victims still have individual rights to sue Japan despite gov't deal
2016
cluster 0
1840 , title : Hungry N. Korean soldiers committing various crimes: report
5621 , title : Bulgaria reaffirms support for denuclearization, vows to fully implement N. Korea sanctions
534 , title : NK resumes encrypted numbers broadcast after 2-week hiatus
293 , title : NK resumes encrypted numbers broadcast after 9-day break
479 , title : NK spy agency official defected to S. Korea last year: source
cluster 1
5171 , title : Labor union leader gets five-year jail term for violent rallies
4901 , title : Korea to toughen monitoring on Zika virus
4927 , title : Korea reports first Zika virus-infected patient
4679 , title : Court rules ban on voluntary prostitution constitutional
4712 , title : [EXCLUSIVE] Korea’s justice system fails foreign victims of rape
cluster 2
3173 , title : Defense ministry counters health, diplomatic concerns over THAAD deployment
5752 , title : Defense ministry demands 5.3% increase in defense budget for 2017
3295 , title : Korea to announce site for deploying THAAD: defense ministry
5626 , title : Senate defense budget bill fails to include amendment calling for THAAD deployment in S. Korea
3939 , title : Park strongly defends THAAD deployment decision
cluster 3
5693 , title : U.N. chief remains ahead of other presidential hopefuls: survey
101 , title : ‘Park should own up’
2368 , title : [Graphic News] What Park should do?
1988 , title : Cheong Wa Dae probes aide’s scandal
1760 , title : Saenuri starts leadership vote amid factional divide
cluster 4
5621 , title : Bulgaria reaffirms support for denuclearization, vows to fully implement N. Korea sanctions
1840 , title : Hungry N. Korean soldiers committing various crimes: report
5498 , title : U.S. 'strongly condemns' N. Korea's missile launches
5276 , title : N.K. leader given new state title at key parliamentary meeting
5500 , title : S. Korea condemns N. Korea's launches of 2 mid-range missiles
cluster 5
5728 , title : [HERALD INTERVIEW] ‘Long-term engagement key to investing in Iran’
5719 , title : 'Gateway to Korea' opens for EU firms
3063 , title : Azerbaijan celebrates independence, ties with Korea
5560 , title : Azerbaijan, Korea to cooperate on infrastructure, energy
5718 , title : ASEAN promotes ecotourism at seminar
cluster 6
5621 , title : Bulgaria reaffirms support for denuclearization, vows to fully implement N. Korea sanctions
1840 , title : Hungry N. Korean soldiers committing various crimes: report
7330 , title : N.K. leader orders more nuke tests, readiness for nuclear attacks
6582 , title : South Korea to unveil its own sanctions on North Korea this week
7016 , title : North Korea must pay 'necessary price' for nuke test, rocket launch: China FM
cluster 7
5731 , title : Chinese fishing boats leave neutral waters after crackdown
3059 , title : Trump renews calls for allies to pay up for U.S. protection
3424 , title : Republican lawmaker calls for Trump to refine Asian policy
3058 , title : Trump raps Obama for visiting Hiroshima visit without noting Japan's aggression
5867 , title : Financial institutions should choose either doing business with North Korea or U.S.: Royce
cluster 8
4251 , title : Only permanent residents to be able to apply for naturalization in S. Korea
2093 , title : Court again rejects Korean A-bomb victims' suit against gov't
3602 , title : Korea to control e-cigarettes in antismoking policy
3632 , title : No. of visitors on Jeju tops 5 mln
2118 , title : Survey shows signs of improving S. Korea-Japan relations
cluster 9
5171 , title : Labor union leader gets five-year jail term for violent rallies
699 , title : Senior prosecutor arrested over bribery allegations
696 , title : SNU professor gets 2-year jail term for fabricating Oxy reports
739 , title : Man gets one-year term for fake threat to blow up Incheon airport
966 , title : 8 Chinese tourists arrested for assaulting Korean restaurant owner
2015
cluster 0
2467 , title : Korean sexual slavery victim urges Obama to guide Abe onto right path
2386 , title : U.S. official due in Seoul over missile defense
2693 , title : World War II sex slave to sue Japan in U.S. court
2779 , title : S. Korea urges closer regional ties on energy security
2436 , title : China invites Koreas' militaries to parade marking end of WWII
cluster 1
4079 , title : All but one MERS-related patients move out of intensive care
1785 , title : S. Korea reports no additional MERS cases for 30th
4016 , title : MERS victims file lawsuits against gov't, hospitals
3343 , title : S. Korea‘s last MERS patient tests negative
3650 , title : Child abuse claims 12 lives, over 5,000 victims in H1: report
cluster 2
7136 , title : Opposition party unveils own proposal for pension reform
4397 , title : Park calls for compromise on labor, pension reforms
6670 , title : Ruling party chief willing to cooperate with corruption scandal probe
6991 , title : Park renews calls for reform of labor market, civil service pensions
6692 , title : Park orders thorough probe into bribery scandal
cluster 3
1540 , title : Exhibition to showcase Danish green solutions
1398 , title : Exhibition shows Danish green tech
1541 , title : Pakistan marks independence with eye on global affairs
1395 , title : Czech Republic, Korea fete silver jubilee through architecture
1397 , title : ASEAN, Korean students plant seeds of sustainable forestry
cluster 4
4451 , title : Ex-Navy chief arrested over alleged graft
4667 , title : Prosecutors raid POSCO E&C over alleged slush funds
7116 , title : Actor Lee Byung-hun's blackmailers get suspended jail terms
4392 , title : Families of ferry disaster victims call for vessel recovery
4563 , title : Subcontractors of POSCO E&C raided in slush fund probe
cluster 5
6224 , title : [Graphic News] Comparison of two Koreas’ military strength
2442 , title : U.S. Army holds public hearings on proposal to permanently deploy THAAD to Guam
2311 , title : Price, favorable terms to be basics of S. Korea's defense
2315 , title : Military officer accused of leaking military info to China
2152 , title : U.S. approves foreign military sale for S. Korea's KF-16 upgrade project for $2.5 bln
cluster 6
2948 , title : S. Korea, Japan to meet over wartime sex slaves
3004 , title : Japanese scholars urge Abe to offer apology for history
2753 , title : FM Yun set for fence-mending trip to Japan
3016 , title : S. Korea, Japan to hold talks on world heritage
2619 , title : Any solution to sex slave issue should satisfy victims: FM Yun
cluster 7
2952 , title : MERS feared to dent consumption, growth: foreign investors
2971 , title : Supporters of sexual minorities hold event in Seoul
3111 , title : Experts say S. Korea can contain MERS with quarantine
3096 , title : S. Korea forms MERS task force on national image
3472 , title : Unemployment among youth with no job experience hits 12-year high
cluster 8
2439 , title : Drought-hit North Korea seeks aid from ally Iran
2516 , title : N. Korea slams S. Korea over U.N. human rights office
2591 , title : U.S. hails opening of U.N. human rights office in Seoul
2797 , title : China urges easing of tension as N. Korea tests missiles
2732 , title : N.K. nuclear reactor running at low levels or not at all: 38 North
cluster 9
1222 , title : Park's approval rating edges down amid history textbook row
1186 , title : Park's approval rating edges down amid history textbook row
3770 , title : Park's approval rating edges down amid factional feud
2777 , title : Park's approval rating edges up as factional feud subsides
1356 , title : Park's approval rating edges up after U.S. visit