import json, re, datetime

import pandas as pd
import nltk
from nltk.stem.snowball import SnowballStemmer

stopwords = nltk.corpus.stopwords.words('english')
stemmer = SnowballStemmer("english")

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import pairwise_distances_argmin_min
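# Assumed local setup: the NLTK calls below need the 'punkt', 'stopwords',
# and 'averaged_perceptron_tagger' data packages, downloadable once with e.g.
# nltk.download('punkt'); nltk.download('stopwords'); nltk.download('averaged_perceptron_tagger')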
def data_load():
    """Load the eight koreaherald_1517_*.json dumps into one DataFrame with added 'year' and 'timestamp' columns."""
    frames = []
    for i in range(0, 8):
        with open('./data/koreaherald_1517_{}.json'.format(i), 'r') as f:
            data = json.load(f)
            data['year'] = dict()
            data['timestamp'] = dict()
            for doc_id in data[' body']:
                time = datetime.datetime.strptime(data[' time'][doc_id], '%Y-%m-%d %H:%M:%S')
                data['year'][doc_id] = time.year
                # coarse day-of-year feature (month * 30 + day)
                data['timestamp'][doc_id] = time.month * 30 + time.day
            df = pd.DataFrame.from_dict(data)
            # header: [' author' ' body' ' description' ' section' ' time' 'title']
            # print(df.columns.values)
            frames.append(df)
    return pd.concat(frames)
def chunking(text):
    """Extract noun-phrase chunks (DT? JJ? NN*) and join each chunk's words with underscores."""
    chunks = []
    parser = nltk.RegexpParser("NP: {<DT>?<JJ>?<NN.*>*}")
    for sent in nltk.sent_tokenize(text):
        words = [word for word in nltk.word_tokenize(sent) if word.isalpha()]
        # words = [word for word in words if word not in stopwords]
        tags = nltk.pos_tag(words)
        tree = parser.parse(tags)
        leaves = [s.leaves() for s in tree.subtrees() if s.label() == "NP"]
        for leaf in leaves:
            chunk = [word[0] for word in leaf]
            chunks.append('_'.join(chunk))
    return chunks
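# Illustrative call (a sketch; the exact chunks depend on the POS tagger's output):
#   chunking("The young president visited the old palace.")
#   might return noun-phrase chunks such as ['The_young_president', 'the_old_palace'].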
def tokenize_and_stem(text):
    """Tokenize, drop stopwords and non-alphabetic tokens, and Snowball-stem the rest."""
    stems = []
    for sent in nltk.sent_tokenize(text):
        words = [word for word in nltk.word_tokenize(sent) if word.isalpha()]
        words = [word for word in words if word not in stopwords]
        for word in words:
            stems.append(stemmer.stem(word))
    return stems
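# Illustrative call (a sketch; exact stems come from the Snowball stemmer):
#   tokenize_and_stem("Candidates focused on political attacks")
#   might return something like ['candid', 'focus', 'polit', 'attack'].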
def clustering(year):
    df = data_load()
    df = df[df.year == year]
    print(df[:5].to_string())

    # label-encode the categorical columns so they can be one-hot encoded below
    encoder = LabelEncoder()
    encoder.fit(df[' author'])
    df[' author'] = encoder.transform(df[' author'])
    encoder.fit(df[' section'])
    df[' section'] = encoder.transform(df[' section'])

    class TextSelector(BaseEstimator, TransformerMixin):
        """Select a single text column from the DataFrame."""

        def __init__(self, key):
            self.key = key

        def fit(self, x, y=None):
            return self

        def transform(self, data_dict):
            return data_dict[self.key]

    class NumberSelector(BaseEstimator, TransformerMixin):
        """Select a single numeric column as a 2-D (n_samples, 1) frame."""

        def __init__(self, key):
            self.key = key

        def fit(self, x, y=None):
            return self

        def transform(self, data_dict):
            return data_dict[[self.key]]
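    # Each FeatureUnion branch below uses one of these selectors to pull a single
    # column out of the input DataFrame (the job sklearn.compose.ColumnTransformer
    # does in newer scikit-learn code).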
    vectorizer = FeatureUnion(
        transformer_list=[
            ('title', Pipeline([
                ('selector', TextSelector(key='title')),
                ('tfidf', TfidfVectorizer(min_df=0.1, tokenizer=tokenize_and_stem, ngram_range=(1, 2)))
            ])),
            ('body', Pipeline([
                ('selector', TextSelector(key=' body')),
                ('tfidf', TfidfVectorizer(min_df=0.2, tokenizer=tokenize_and_stem, ngram_range=(1, 2)))
            ])),
            ('author', Pipeline([
                ('selector', NumberSelector(key=' author')),
                ('onehot', OneHotEncoder(categories='auto'))
            ])),
            ('section', Pipeline([
                ('selector', NumberSelector(key=' section')),
                ('onehot', OneHotEncoder(categories='auto'))
            ])),
            ('timestamp', Pipeline([
                ('selector', NumberSelector(key='timestamp')),
            ])),
        ],
        # weight components in FeatureUnion
        transformer_weights={
            'section': 1.0,
            'title': 0.6,
            'timestamp': 0.5,
            'body': 0.5,
            'author': 0.3,
        },
    )
    X = vectorizer.fit_transform(df)

    true_k = 10
    from sklearn.cluster import KMeans
    model = KMeans(n_clusters=true_k, init='k-means++', max_iter=100, n_init=1)
    model.fit(X)
    df['cluster'] = model.labels_

    # index of the document closest to each cluster centroid
    closest, _ = pairwise_distances_argmin_min(model.cluster_centers_, X)
    print('closest :', closest)

    from summa import keywords
    from summa.summarizer import summarize

    for c in range(true_k):
        print('cluster {}'.format(c))
        doc_id = closest[c]
        doc = df.iloc[doc_id]
        print('title:', doc['title'])
        chunking(doc[' body'])
    # for c in range(true_k):
    #     print('cluster {}: '.format(c))
    #     cdf = df[df.cluster == c]
    #
    #     chunks = []
    #
    #     from collections import Counter
    #     for index, row in cdf.iterrows():
    #         title = row['title']
    #         body = row[' body']
    #
    #         chunks += chunking(title)
    #         # chunks += chunking(body)
    #
    #     counts = Counter(chunks)
    #     print('counts :', counts)
if __name__ == "__main__":
    clustering(year=2017)
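Example output of clustering(year=2017): the title of the article closest to each of the 10 cluster centroids.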
cluster 0
title: ADHD drugs abused as study aids
cluster 1
title: [Graphic News] Former President Park Geun-hye‘s grilling by the prosecution
cluster 2
title: [News Analysis] Moon’s peace vision faces bleak outlook
cluster 3
title: Ex-UN chief expected to join political party before long
cluster 4
title: N. Korea media warns of 'catastrophe' over joint S. Korea-US drills
cluster 5
title: Presidential debate turns nasty as candidates focus on political assaults
cluster 6
title: Arrest warrant sought for former finance minister over NIS fund scandal
cluster 7
title: S. Korea's foreign minister urges N. Korea to respond to offer for talks
cluster 8
title: N. Korean man arrested in Kim Jong-nam murder case
cluster 9
title: Korean president highlights importance of Russia, ASEAN, EU