Andreas Chandra andreaschandra

💼

Works from anywhere

Research in NLP & NLU | Member of Jakarta AI Research | Jakarta, Indonesia

andreaschandra / create model

Last active May 14, 2018 08:31

	kmeans_model = 'topic-kmeans'

	sm = boto3.client('sagemaker')

	containers = {'us-west-2': '174872318107.dkr.ecr.us-west-2.amazonaws.com/kmeans:latest'}

	create_model_response = sm.create_model( ModelName=kmeans_model,
	ExecutionRoleArn=role,
	PrimaryContainer={
	'Image': containers[boto3.Session().region_name],

andreaschandra / gist:f7140f27a7dfd0a34b9d6b232618292b

Created May 14, 2018 07:27

Clustering model

	# Pick the clustering estimator. Both variants are built for `true_k`
	# clusters with 'k-means++' initialisation and a single init run (n_init=1);
	# `minibatch` switches between MiniBatchKMeans and plain KMeans.
	minibatch = False
	true_k = 4

	if minibatch:
	    km = MiniBatchKMeans(n_clusters=true_k, init='k-means++', n_init=1,
	                         init_size=1000, batch_size=1000, verbose=False)
	else:
	    km = KMeans(n_clusters=true_k, init='k-means++', max_iter=100, n_init=1,
	                verbose=False)

andreaschandra / Feature Extraction

Created May 14, 2018 07:26

	# Build a TF-IDF matrix `X` from `dataset.data` and report the elapsed time.
	print("Extracting features from the training dataset using a sparse vectorizer")
	t0 = time()

	# Vectorizer settings: English stop words, IDF weighting enabled, at most
	# 10000 features, document-frequency bounds max_df=0.5 and min_df=2.
	vectorizer = TfidfVectorizer(
	    stop_words='english',
	    use_idf=True,
	    max_features=10000,
	    max_df=0.5,
	    min_df=2,
	)
	X = vectorizer.fit_transform(dataset.data)

	print("done in %fs" % (time() - t0))

andreaschandra / gist:ce8acf6f5d91b77c008a40a3b3c6e916

Created May 14, 2018 07:25

load data

	# Subset of categories to load from the training set.
	categories = ['alt.atheism', 'talk.religion.misc',
	              'comp.graphics', 'sci.space']
	# To run the analysis on every category instead, uncomment the line below.
	# categories = None

andreaschandra / gist:268c632cf81d3c6592b566c01ffd4654

Last active May 14, 2018 10:48

import libraries

	# Ignore all warnings; this is set before the remaining imports run so that
	# anything they emit at import time is filtered as well.
	import warnings
	warnings.simplefilter("ignore")

	import os

	import boto3
	import numpy as np
	import sagemaker

	# Placeholder value: replace with the name of your own bucket before running.
	bucket = '<your-bucket>'
	prefix = 'topic-kmeans'

andreaschandra / label_encoding.py

Last active April 2, 2018 15:43

	# Fit a LabelEncoder on the seven category labels.
	label_kategori = ['senin', 'selasa', 'rabu', 'kamis', 'jumat', 'sabtu', 'minggu']
	encoder = preprocessing.LabelEncoder()
	encoder.fit(label_kategori)

	# Print each learned class next to its position in `encoder.classes_`.
	print("\nLabel mapping:")
	for i, item in enumerate(encoder.classes_):
	    print(item, '>', i)

	output:
	Label mapping:

andreaschandra / normalization.py

Created April 2, 2018 15:40

	# Normalize `input` twice with preprocessing.normalize, once with norm='l1'
	# and once with norm='l2', then print both results.
	# NOTE(review): `input` is expected to be defined earlier by the caller and
	# shadows the builtin of the same name.
	data_normalized_l1 = preprocessing.normalize(input, norm='l1')
	data_normalized_l2 = preprocessing.normalize(input, norm='l2')
	print("\nL1 normalized data:\n", data_normalized_l1)
	print("\nL2 normalized data:\n", data_normalized_l2)

	L1 normalized data:
	[[ 0.61538462 0.15384615 0.23076923]
	[ 0.33333333 0.58333333 0.08333333]
	[ 0.52941176 0.11764706 0.35294118]]

andreaschandra / scaling.py

Created April 2, 2018 15:39

	# Fit a MinMaxScaler configured with feature_range=(0, 1) on `input` and
	# transform it in one step.
	data_scaler_minmax = preprocessing.MinMaxScaler(feature_range=(0, 1))
	data_scaled_minmax = data_scaler_minmax.fit_transform(input)
	# Bare expression: has no effect in a script; presumably here so that a
	# notebook/REPL displays the scaled array.
	data_scaled_minmax

	output:
	array([[ 0.8, 0. , 0.4],
	[ 0. , 1. , 0. ],
	[ 1. , 0. , 1. ]])

andreaschandra / meanremoval.py

Created April 2, 2018 15:38

	# Before mean removal
	# Note: mean() is called without an axis (one value over all elements),
	# while std() is computed with axis = 0.
	print("Mean = ", input.mean(), "\n", "Std deviation = ", input.std(axis = 0))

	output:
	Mean = 4.66666666667
	Std deviation = [ 2.1602469 2.3570226 2.05480467]

	# After mean removal
	data_scaled = preprocessing.scale(input)
	# Print the per-column mean of the scaled data so the printed value matches
	# its "Mean" label.
	print("Mean = ", data_scaled.mean(axis=0))

andreaschandra / biner.py

Created April 2, 2018 15:36

	# Sample 3x3 integer matrix.
	# NOTE: the name `input` shadows the Python builtin; it is kept because the
	# other preprocessing snippets refer to it by this name.
	input = np.array([
	    [8, 2, 3],
	    [4, 7, 1],
	    [9, 2, 6],
	])

	# Binarize the matrix with a threshold of 5.
	data_biner = preprocessing.Binarizer(threshold=5).transform(input)

	output:
	[[1 0 0]
	[0 1 0]
	[1 0 1]]