@bowbowbow
Created December 2, 2018 10:47
import datetime
import json

import nltk
import pandas as pd
from nltk.stem.snowball import SnowballStemmer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import pairwise_distances_argmin_min
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

stopwords = nltk.corpus.stopwords.words('english')
stemmer = SnowballStemmer("english")
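# NOTE: the NLTK calls below (sent_tokenize, word_tokenize, pos_tag, stopwords)
# assume these one-time resource downloads have already been run:
#   nltk.download('punkt')
#   nltk.download('stopwords')
#   nltk.download('averaged_perceptron_tagger')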
def data_load():
    """Load the eight Korea Herald JSON dumps into one DataFrame."""
    frames = []
    for i in range(8):
        with open('./data/koreaherald_1517_{}.json'.format(i), 'r') as f:
            data = json.load(f)
        # Column names in the source data carry a leading space, e.g. ' body'.
        data['year'] = dict()
        data['timestamp'] = dict()
        for doc_id in data[' body']:
            time = datetime.datetime.strptime(data[' time'][doc_id], '%Y-%m-%d %H:%M:%S')
            data['year'][doc_id] = time.year
            # Rough day-of-year: every month treated as 30 days.
            data['timestamp'][doc_id] = time.month * 30 + time.day
        df = pd.DataFrame.from_dict(data)
        # header: [' author' ' body' ' description' ' section' ' time' 'title']
        frames.append(df)
    return pd.concat(frames)
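# For reference, data_load assumes each JSON file is column-oriented, i.e. a
# dict mapping column name -> {doc_id: value}; the field values below are
# hypothetical, for illustration only:
#
#   {
#     " author":  {"0": "...", "1": "..."},
#     " body":    {"0": "Full article text ...", ...},
#     " time":    {"0": "2017-05-10 09:30:00", ...},
#     " section": {"0": "...", ...},
#     "title":    {"0": "...", ...}
#   }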
def chunking(text):
    """Extract noun-phrase chunks and join each chunk's words with underscores."""
    chunks = []
    # Grammar: an NP is an optional determiner, optional adjective, then nouns.
    parser = nltk.RegexpParser("NP: {<DT>?<JJ>?<NN.*>*}")
    for sent in nltk.sent_tokenize(text):
        words = [word for word in nltk.word_tokenize(sent) if word.isalpha()]
        # words = [word for word in words if word not in stopwords]
        tags = nltk.pos_tag(words)
        tree = parser.parse(tags)
        leaves = [s.leaves() for s in tree.subtrees() if s.label() == "NP"]
        for leaf in leaves:
            chunk = [word[0] for word in leaf]
            chunks.append('_'.join(chunk))
    return chunks
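# Example (exact output depends on the POS tagger's tags):
#   chunking("The former president faced a long trial.")
#   -> ['The_former_president', 'a_long_trial']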
def tokenize_and_stem(text):
    """Tokenize, drop non-alphabetic tokens and stopwords, then Snowball-stem."""
    stems = []
    for sent in nltk.sent_tokenize(text):
        words = [word for word in nltk.word_tokenize(sent) if word.isalpha()]
        words = [word for word in words if word not in stopwords]
        stems.extend(stemmer.stem(word) for word in words)
    return stems
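# Example (TfidfVectorizer lowercases its input before calling this tokenizer):
#   tokenize_and_stem("candidates focused on political assaults")
#   -> ['candid', 'focus', 'polit', 'assault']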
def clustering(year):
    df = data_load()
    df = df[df.year == year].copy()  # copy so new columns can be added safely
    print(df[:5].to_string())

    # Encode author and section as integer labels.
    encoder = LabelEncoder()
    df[' author'] = encoder.fit_transform(df[' author'])
    df[' section'] = encoder.fit_transform(df[' section'])
    class TextSelector(BaseEstimator, TransformerMixin):
        """Select one column as a 1-D Series (what TfidfVectorizer expects)."""
        def __init__(self, key):
            self.key = key

        def fit(self, x, y=None):
            return self

        def transform(self, data_dict):
            return data_dict[self.key]

    class NumberSelector(BaseEstimator, TransformerMixin):
        """Select one column as a 2-D DataFrame (what OneHotEncoder expects)."""
        def __init__(self, key):
            self.key = key

        def fit(self, x, y=None):
            return self

        def transform(self, data_dict):
            return data_dict[[self.key]]
    vectorizer = FeatureUnion(
        transformer_list=[
            ('title', Pipeline([
                ('selector', TextSelector(key='title')),
                ('tfidf', TfidfVectorizer(min_df=0.1, tokenizer=tokenize_and_stem, ngram_range=(1, 2)))
            ])),
            ('body', Pipeline([
                ('selector', TextSelector(key=' body')),
                ('tfidf', TfidfVectorizer(min_df=0.2, tokenizer=tokenize_and_stem, ngram_range=(1, 2)))
            ])),
            ('author', Pipeline([
                ('selector', NumberSelector(key=' author')),
                ('onehot', OneHotEncoder(categories='auto'))
            ])),
            ('section', Pipeline([
                ('selector', NumberSelector(key=' section')),
                ('onehot', OneHotEncoder(categories='auto'))
            ])),
            ('timestamp', Pipeline([
                ('selector', NumberSelector(key='timestamp')),
            ])),
        ],
        # weight components in FeatureUnion
        transformer_weights={
            'section': 1.0,
            'title': 0.6,
            'timestamp': 0.5,
            'body': 0.5,
            'author': 0.3,
        },
    )
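    # FeatureUnion horizontally stacks the five feature blocks and multiplies
    # each block by its weight; note the timestamp block passes through
    # unscaled, so its raw magnitude also influences the distance metric.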
    X = vectorizer.fit_transform(df)

    # Cluster the combined features with k-means.
    true_k = 10
    model = KMeans(n_clusters=true_k, init='k-means++', max_iter=100, n_init=1)
    model.fit(X)
    df['cluster'] = model.labels_

    # For each cluster, find the document closest to its centroid.
    closest, _ = pairwise_distances_argmin_min(model.cluster_centers_, X)
    print('closest :', closest)

    # from summa import keywords              # unused in this version
    # from summa.summarizer import summarize  # unused in this version

    # Print the representative document of each cluster.
    for c in range(true_k):
        print('cluster {}'.format(c))
        doc_id = closest[c]
        doc = df.iloc[doc_id]
        print('title:', doc['title'])
        print('chunks:', chunking(doc[' body']))
    # Alternative: count noun-phrase chunks per cluster.
    # for c in range(true_k):
    #     print('cluster {}: '.format(c))
    #     cdf = df[df.cluster == c]
    #
    #     chunks = []
    #
    #     from collections import Counter
    #     for index, row in cdf.iterrows():
    #         title = row['title']
    #         body = row[' body']
    #
    #         chunks += chunking(title)
    #         # chunks += chunking(body)
    #
    #     counts = Counter(chunks)
    #     print('counts :', counts)
if __name__ == "__main__":
    clustering(year=2017)
@bowbowbow (Author) commented:
cluster 0
title: ADHD drugs abused as study aids
cluster 1
title: [Graphic News] Former President Park Geun-hye‘s grilling by the prosecution
cluster 2
title: [News Analysis] Moon’s peace vision faces bleak outlook
cluster 3
title: Ex-UN chief expected to join political party before long
cluster 4
title: N. Korea media warns of 'catastrophe' over joint S. Korea-US drills
cluster 5
title: Presidential debate turns nasty as candidates focus on political assaults
cluster 6
title: Arrest warrant sought for former finance minister over NIS fund scandal
cluster 7
title: S. Korea's foreign minister urges N. Korea to respond to offer for talks
cluster 8
title: N. Korean man arrested in Kim Jong-nam murder case
cluster 9
title: Korean president highlights importance of Russia, ASEAN, EU
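Since KMeans runs with n_init=1 and no fixed seed, the representative titles above will vary from run to run. A minimal sketch of a reproducible configuration (the random_state value here is arbitrary):

model = KMeans(n_clusters=true_k, init='k-means++', max_iter=100,
               n_init=1, random_state=42)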
