This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Select the clustering estimator: MiniBatchKMeans when `minibatch` is
# enabled, otherwise the full-batch KMeans. Both ask for `true_k` clusters,
# use k-means++ seeding and perform a single initialisation (n_init=1).
minibatch = False
true_k = 4
if minibatch:
    km = MiniBatchKMeans(n_clusters=true_k, init='k-means++', n_init=1,
                         init_size=1000, batch_size=1000, verbose=False)
else:
    km = KMeans(n_clusters=true_k, init='k-means++', max_iter=100, n_init=1,
                verbose=False)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Turn the raw documents in `dataset.data` into a TF-IDF feature matrix `X`
# and report how long the fit/transform step took.
print("Extracting features from the training dataset using a sparse vectorizer")
t0 = time()
vectorizer = TfidfVectorizer(max_df=0.5, max_features=10000,
                             min_df=2, stop_words='english',
                             use_idf=True)
X = vectorizer.fit_transform(dataset.data)
print("done in %fs" % (time() - t0))
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Load some categories from the training set
categories = [
    'alt.atheism',
    'talk.religion.misc',
    'comp.graphics',
    'sci.space',
]
# Uncomment the following to do the analysis on all the categories
# categories = None
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import warnings

# Silence warnings before the remaining imports run, so that anything they
# emit at import time is suppressed as well.
warnings.simplefilter("ignore")

import os

import boto3
import sagemaker
import numpy as np

# Placeholder: replace with the name of your own bucket before running.
bucket = '<your-bucket>'
# Prefix paired with `bucket` for this example.
prefix = 'topic-kmeans'
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Fit a LabelEncoder on the day-name labels, then print every fitted class
# next to its position in `encoder.classes_`.
label_kategori = ['senin', 'selasa', 'rabu', 'kamis', 'jumat', 'sabtu', 'minggu']
encoder = preprocessing.LabelEncoder()
encoder.fit(label_kategori)
print("\nLabel mapping:")
for i, item in enumerate(encoder.classes_):
    print(item, '>', i)
# output:
# Label mapping:
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Normalize the sample array with the L1 norm and with the L2 norm, then
# print both results.
# NOTE(review): `input` is the sample array, not the built-in function of the
# same name -- it must be defined before this snippet runs.
data_normalized_l1 = preprocessing.normalize(input, norm='l1')
data_normalized_l2 = preprocessing.normalize(input, norm='l2')
print("\nL1 normalized data:\n", data_normalized_l1)
print("\nL2 normalized data:\n", data_normalized_l2)
# L1 normalized data:
# [[ 0.61538462 0.15384615 0.23076923]
# [ 0.33333333 0.58333333 0.08333333]
# [ 0.52941176 0.11764706 0.35294118]]
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Rescale the sample array with a MinMaxScaler configured for the range
# (0, 1).
# NOTE(review): `input` is the sample array, not the built-in function of the
# same name -- it must be defined before this snippet runs.
data_scaler_minmax = preprocessing.MinMaxScaler(feature_range=(0, 1))
data_scaled_minmax = data_scaler_minmax.fit_transform(input)
# Bare expression: shows the result when run as the last line of a notebook
# cell; it has no effect in a plain script.
data_scaled_minmax
# output:
# array([[ 0.8, 0. , 0.4],
# [ 0. , 1. , 0. ],
# [ 1. , 0. , 1. ]])
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Before mean removal: overall mean of the array and per-column standard
# deviation.
# NOTE(review): `input` is the sample array, not the built-in function of the
# same name -- it must be defined before this snippet runs.
print("Mean = ", input.mean(), "\n", "Std deviation = ", input.std(axis=0))
# output:
# Mean = 4.66666666667
# Std deviation = [ 2.1602469 2.3570226 2.05480467]

# After mean removal
data_scaled = preprocessing.scale(input)
# NOTE(review): the label says "Mean" but the whole scaled array is printed;
# possibly `data_scaled.mean(axis=0)` was intended -- confirm with the author.
print("Mean = ", data_scaled)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Sample 3x3 array used by the preprocessing examples.
# NOTE(review): the name `input` shadows the built-in function; it is kept
# because the other snippets refer to the array by this name.
input = np.array([[8, 2, 3],
                  [4, 7, 1],
                  [9, 2, 6]])
# Binarize the array with a threshold of 5.
data_biner = preprocessing.Binarizer(threshold=5).transform(input)
# output:
# [[1 0 0]
# [0 1 0]
# [1 0 1]]
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import numpy as np
from sklearn import preprocessing
Newer | Older