import json, datetime

import pandas as pd
import nltk
from nltk.stem.snowball import SnowballStemmer

from collections import Counter
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.cluster import KMeans
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer

stopwords = nltk.corpus.stopwords.words('english')
stemmer = SnowballStemmer("english")
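# NOTE: the tokenizers and tagger below rely on NLTK data packages; if they
# are missing, a one-time download is needed (assuming a default NLTK install):
#   nltk.download('punkt')                        # sent_tokenize / word_tokenize
#   nltk.download('stopwords')                    # nltk.corpus.stopwords
#   nltk.download('averaged_perceptron_tagger')   # nltk.pos_tag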
def data_load():
    """Load the eight Korea Herald dump files and concatenate them."""
    frames = []
    for i in range(0, 8):
        with open('./data/koreaherald_1517_{}.json'.format(i), 'r') as f:
            data = json.load(f)

        # Derive a year column and a coarse day-of-year timestamp from ' time'.
        data['year'] = dict()
        data['timestamp'] = dict()
        for doc_id in data[' body']:  # column names in the dump carry a leading space
            time = datetime.datetime.strptime(data[' time'][doc_id], '%Y-%m-%d %H:%M:%S')
            data['year'][doc_id] = time.year
            # Rough day-of-year (30-day months); only used as a relative ordering.
            data['timestamp'][doc_id] = time.month * 30 + time.day

        df = pd.DataFrame.from_dict(data)
        # header: [' author' ' body' ' description' ' section' ' time' 'title']
        # print(df.columns.values)
        frames.append(df)
    return pd.concat(frames)
def chunking(text):
    """Extract noun-phrase chunks from text, joined with underscores."""
    chunks = []
    # Shallow NP grammar: optional determiner, optional adjective, noun run.
    parser = nltk.RegexpParser("NP: {<DT>?<JJ>?<NN.*>*}")
    for sent in nltk.sent_tokenize(text):
        words = [word for word in nltk.word_tokenize(sent) if word.isalpha()]
        # words = [word for word in words if word not in stopwords]
        tags = nltk.pos_tag(words)
        tree = parser.parse(tags)
        leaves = [s.leaves() for s in tree.subtrees() if s.label() == "NP"]
        for leaf in leaves:
            chunk = [word[0] for word in leaf]
            chunks.append('_'.join(chunk))
    return chunks
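# e.g. chunking("The nuclear test site was destroyed.") would yield something
# like ['The_nuclear_test_site'] (exact output depends on the POS tagger).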
def tokenize_and_stem(text):
    """Tokenize text, drop stopwords and non-alphabetic tokens, then stem."""
    stems = []
    for sent in nltk.sent_tokenize(text):
        words = [word for word in nltk.word_tokenize(sent) if word.isalpha()]
        words = [word for word in words if word not in stopwords]
        stems.extend(stemmer.stem(word) for word in words)
    return stems
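# e.g. tokenize_and_stem("Leaders are meeting") returns roughly ['leader', 'meet'].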
def clustering(year):
    df = data_load()
    df = df[df.year == year].copy()  # .copy() avoids SettingWithCopyWarning below
    print(df[:5].to_string())

    # Encode the categorical author/section columns as integer ids.
    encoder = LabelEncoder()
    df[' author'] = encoder.fit_transform(df[' author'])
    df[' section'] = encoder.fit_transform(df[' section'])
    class TextSelector(BaseEstimator, TransformerMixin):
        """Select a single text column from the DataFrame (1-d, for TF-IDF)."""
        def __init__(self, key):
            self.key = key

        def fit(self, x, y=None):
            return self

        def transform(self, data_dict):
            return data_dict[self.key]

    class NumberSelector(BaseEstimator, TransformerMixin):
        """Select a single numeric column, kept 2-d for downstream encoders."""
        def __init__(self, key):
            self.key = key

        def fit(self, x, y=None):
            return self

        def transform(self, data_dict):
            return data_dict[[self.key]]
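    # (On scikit-learn >= 0.20, sklearn.compose.ColumnTransformer covers this
    # selector-per-column pattern without custom transformer classes.)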
    vectorizer = FeatureUnion(
        transformer_list=[
            ('title', Pipeline([
                ('selector', TextSelector(key='title')),
                ('tfidf', TfidfVectorizer(tokenizer=tokenize_and_stem, ngram_range=(1, 2)))
            ])),
            # ('body', Pipeline([
            #     ('selector', TextSelector(key=' body')),
            #     ('tfidf', TfidfVectorizer(tokenizer=tokenize_and_stem, ngram_range=(1, 2)))
            # ])),
            ('author', Pipeline([
                ('selector', NumberSelector(key=' author')),
                ('onehot', OneHotEncoder(categories='auto'))
            ])),
            ('section', Pipeline([
                ('selector', NumberSelector(key=' section')),
                ('onehot', OneHotEncoder(categories='auto'))
            ])),
            ('timestamp', Pipeline([
                ('selector', NumberSelector(key='timestamp')),
            ])),
        ],
        # weight components in FeatureUnion
        transformer_weights={
            # 'timestamp': 1.0,
            'section': 0.8,
            'title': 0.6,
            # 'body': 0.3,
            'author': 0.3,
        },
    )
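    # Note: the timestamp feature is passed through unscaled (values up to ~390)
    # while the TF-IDF and one-hot features live in [0, 1], so without scaling
    # it tends to dominate the Euclidean distances k-means uses.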
    X = vectorizer.fit_transform(df)
    # print(vectorizer.transformer_list[0][1].named_steps['tfidf'].get_feature_names())

    true_k = 15
    model = KMeans(n_clusters=true_k, init='k-means++', max_iter=100, n_init=1)
    model.fit(X)

    # Assign each article to its cluster (X is already the transformed matrix,
    # so there is no need to re-run vectorizer.transform on df).
    df['cluster'] = model.predict(X)
    print(df[:5].to_string())
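    # true_k = 15 is a manual choice; one quick sanity check (not in the
    # original gist) is the silhouette score:
    #   from sklearn.metrics import silhouette_score
    #   print(silhouette_score(X, df['cluster']))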
    # Summarize each cluster by the noun-phrase chunks found in its titles.
    for c in range(true_k):
        print('cluster {}: '.format(c))
        cdf = df[df.cluster == c]
        chunks = []
        for index, row in cdf.iterrows():
            title = row['title']
            body = row[' body']
            chunks += chunking(title)
            # chunks += chunking(body)
        counts = Counter(chunks)
        print('counts :', counts)
if __name__ == "__main__":
    clustering(year=2017)
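    # The filenames suggest a 2015-2017 dump, so (assuming those years are
    # present in the data) other years can be clustered the same way:
    # clustering(year=2015)
    # clustering(year=2016)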