import os
import math
import pickle
import shutil
import joblib
import warnings
import datetime
import numpy as np
import pandas as pd
import tensorflow as tf
import keras
from sklearn import cluster
from functools import reduce
from functools import partial
from joblib import dump, load
warnings.filterwarnings("ignore")
from tqdm.autonotebook import tqdm
import tensorflow_probability as tfp
from sklearn.preprocessing import OneHotEncoder
from sklearn.exceptions import ConvergenceWarning
from sklearn.utils._testing import ignore_warnings
from sklearn.mixture import BayesianGaussianMixture
from tqdm.contrib.concurrent import process_map, thread_map
from tensorflow.keras.layers import *
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
def pkl_dump(obj, path):
    with open(path, 'wb') as f: pickle.dump(obj, f, protocol=4)
def pkl_load(path):
    with open(path, 'rb') as f: return pickle.load(f)
class ConditionalGenerator:
    # Samples the conditional vectors used for training-by-sampling: pick a
    # discrete column, then an option within it according to its (optionally
    # log-scaled) empirical frequency.

    @classmethod
    def from_dict(cls, in_dict):
        new_instance = ConditionalGenerator()
        new_instance.__dict__ = in_dict
        return new_instance

    def __init__(self, data=None, output_info=None, log_frequency=None):
        if data is None or output_info is None or log_frequency is None:
            return
        self._model = []
        start = 0
        skip = False
        max_interval = 0
        counter = 0
        for item in output_info:
            if item[1] == 'tanh':
                start += item[0]
                skip = True
                continue
            if item[1] == 'softmax':
                if skip:
                    skip = False
                    start += item[0]
                    continue
                end = start + item[0]
                max_interval = max(max_interval, end - start)
                counter += 1
                self._model.append(np.argmax(data[:, start:end], axis=-1))
                start = end
            else:
                assert 0
        assert start == data.shape[1]
        self._interval = []
        self._n_col = 0
        self.n_opt = 0
        skip = False
        start = 0
        self._p = np.zeros((counter, max_interval))
        for item in output_info:
            if item[1] == 'tanh':
                skip = True
                start += item[0]
                continue
            if item[1] == 'softmax':
                if skip:
                    start += item[0]
                    skip = False
                    continue
                end = start + item[0]
                tmp = np.sum(data[:, start:end], axis=0)
                if log_frequency:
                    tmp = np.log(tmp + 1)
                tmp = tmp / np.sum(tmp)
                self._p[self._n_col, :item[0]] = tmp
                self._interval.append((self.n_opt, item[0]))
                self.n_opt += item[0]
                self._n_col += 1
                start = end
            else:
                assert 0
        self._interval = np.asarray(self._interval)

    def _random_choice_prob_index(self, idx):
        prob = self._p[idx]
        rand = np.expand_dims(np.random.rand(prob.shape[0]), axis=1)
        return (prob.cumsum(axis=1) > rand).argmax(axis=1)

    def sample(self, batch_size):
        if self._n_col == 0: return None
        col_idx = np.random.choice(np.arange(self._n_col), batch_size)
        cond = np.zeros((batch_size, self.n_opt), dtype='float32')
        mask = np.zeros((batch_size, self._n_col), dtype='float32')
        mask[np.arange(batch_size), col_idx] = 1
        opt_idx = self._random_choice_prob_index(col_idx)
        opt = self._interval[col_idx, 0] + opt_idx
        cond[np.arange(batch_size), opt] = 1
        return cond, mask, col_idx, opt_idx

    def sample_zero(self, batch_size):
        if self._n_col == 0: return None
        vec = np.zeros((batch_size, self.n_opt), dtype='float32')
        idx = np.random.choice(np.arange(self._n_col), batch_size)
        for i in range(batch_size):
            col = idx[i]
            pick = int(np.random.choice(self._model[col]))
            vec[i, pick + self._interval[col, 0]] = 1
        return vec
class ResidualLayer(Layer):
    def __init__(self, input_dim, output_dim):
        super(ResidualLayer, self).__init__()
        self._output_dim = output_dim
        self._fc = Dense(
            self._output_dim, input_dim=(input_dim,),
            kernel_initializer=partial(init_bounded, dim=input_dim),
            bias_initializer=partial(init_bounded, dim=input_dim))
        self._bn = BatchNormalization(epsilon=1e-5, momentum=0.9)
        self._relu = ReLU()

    def call(self, inputs, **kwargs):
        outputs = self._fc(inputs, **kwargs)
        outputs = self._bn(outputs, **kwargs)
        outputs = self._relu(outputs, **kwargs)
        return tf.concat([outputs, inputs], 1)
class GenActivation(Layer):
    def __init__(self, input_dim, output_dim, transformer_info, tau):
        super(GenActivation, self).__init__()
        self._output_dim = output_dim
        self._transformer_info = transformer_info
        self._tau = tau
        self._fc = Dense(
            output_dim, input_dim=(input_dim,),
            kernel_initializer=partial(init_bounded, dim=input_dim),
            bias_initializer=partial(init_bounded, dim=input_dim))

    def call(self, inputs, **kwargs):
        outputs = self._fc(inputs, **kwargs)
        data_t = tf.zeros(tf.shape(outputs))
        for idx in self._transformer_info:
            act = tf.where(
                idx[2] == 0,
                tf.math.tanh(outputs[:, idx[0]:idx[1]]),
                self._gumbel_softmax(outputs[:, idx[0]:idx[1]], tau=self._tau))
            data_t = tf.concat([data_t[:, :idx[0]], act, data_t[:, idx[1]:]], 1)
        return outputs, data_t

    @tf.function(experimental_relax_shapes=True)
    def _gumbel_softmax(self, logits, tau=1.0, hard=False, dim=-1):
        gumbel_dist = tfp.distributions.Gumbel(loc=0, scale=1)
        gumbels = gumbel_dist.sample(tf.shape(logits))
        gumbels = (logits + gumbels) / tau
        output = tf.nn.softmax(gumbels, dim)
        if hard:
            index = tf.math.reduce_max(output, 1, keepdims=True)
            output_hard = tf.cast(tf.equal(output, index), output.dtype)
            output = tf.stop_gradient(output_hard - output) + output
        return output
class Generator(Model):
    def __init__(self, input_dim, gen_dims, data_dim, transformer_info, tau):
        super(Generator, self).__init__()
        self._input_dim = input_dim
        self._model = list()
        dim = input_dim
        for layer_dim in list(gen_dims):
            self._model += [ResidualLayer(dim, layer_dim)]
            dim += layer_dim
        self._model += [GenActivation(dim, data_dim, transformer_info, tau)]

    def call(self, inputs, **kwargs):
        outputs = inputs
        for layer in self._model:
            outputs = layer(outputs, **kwargs)
        return outputs
class OHE(OneHotEncoder):
    def __eq__(self, other):
        try:
            np.testing.assert_equal(self.__dict__, other.__dict__)
            return True
        except AssertionError:
            return False
class BGM(BayesianGaussianMixture):
    def __eq__(self, other):
        try:
            np.testing.assert_equal(self.__dict__, other.__dict__)
            return True
        except AssertionError: return False

    def _initialize_parameters(self, X, random_state):
        n_samples, _ = X.shape
        if self.init_params == 'kmeans':
            resp = np.zeros((n_samples, self.n_components))
            try:
                # Prefer the GPU-accelerated KMeans from cuML when it is installed.
                from cuml.cluster import KMeans
                label = KMeans(n_clusters=self.n_components, n_init=1).fit(X).labels_
            except Exception:
                label = cluster.KMeans(n_clusters=self.n_components, n_init=1, random_state=random_state).fit(X).labels_
            resp[np.arange(n_samples), label] = 1
        elif self.init_params == 'random':
            resp = random_state.rand(n_samples, self.n_components)
            resp /= resp.sum(axis=1)[:, np.newaxis]
        else: raise ValueError("Unimplemented initialization method '%s'" % self.init_params)
        self._initialize(X, resp)
class Sampler(object):
    def __init__(self, data, output_info):
        super(Sampler, self).__init__()
        self.data = data
        self.model = []
        self.n = len(data)
        st = 0
        skip = False
        for item in output_info:
            if item[1] == 'tanh':
                st += item[0]
                skip = True
            elif item[1] == 'softmax':
                if skip:
                    skip = False
                    st += item[0]
                    continue
                ed = st + item[0]
                tmp = []
                for j in range(item[0]):
                    tmp.append(np.nonzero(data[:, st + j])[0])
                self.model.append(tmp)
                st = ed
            else:
                assert 0
        assert st == data.shape[1]

    def sample(self, n, col, opt):
        if col is None:
            idx = np.random.choice(np.arange(self.n), n)
            return self.data[idx]
        idx = []
        for c, o in zip(col, opt):
            idx.append(np.random.choice(self.model[c][o]))
        return self.data[idx]
class DataTransformer(object):
    # Encodes continuous columns with a Bayesian-GMM mode-specific normalization
    # (one 'tanh' value plus a 'softmax' one-hot over the retained modes) and
    # discrete columns with a plain one-hot encoding. output_info holds one
    # (dimension, activation, is_continuous) tuple per output block.

    @classmethod
    def from_dict(cls, in_dict):
        new_instance = DataTransformer()
        new_instance.__dict__ = in_dict
        return new_instance

    def __init__(self, n_clusters=10, epsilon=0.005, parallel_preprocess=None):
        self._parallel_preprocess = parallel_preprocess
        self._n_clusters = n_clusters
        self._epsilon = epsilon
        self._is_dataframe = None
        self._meta = None
        self._dtypes = None
        self.output_info = None
        self.output_dimensions = None
        self.output_tensor = None
        self.cond_tensor = None

    def generate_tensors(self):
        if self.output_info is None: raise AttributeError("Output info still not available")
        output_info = []
        cond_info = []
        i = 0
        st_idx = 0
        st_c = 0
        for item in self.output_info:
            ed_idx = st_idx + item[0]
            if not item[2]:  # Categorical block: add a condition entry for each of its options.
                ed_c = st_c + item[0]
                cond_info.append(tf.constant([st_idx, ed_idx, st_c, ed_c, i], dtype=tf.int32))
                st_c = ed_c
                i += 1
            output_info.append(tf.constant([st_idx, ed_idx, int(item[1] == 'softmax')], dtype=tf.int32))
            st_idx = ed_idx
        self.output_tensor = output_info
        self.cond_tensor = cond_info

    @ignore_warnings(category=ConvergenceWarning)
    def _fit_continuous(self, column, data):
        vgm = BGM(
            self._n_clusters,
            weight_concentration_prior_type='dirichlet_process',
            weight_concentration_prior=0.001,
            n_init=1
        )
        vgm.fit(data)
        components = vgm.weights_ > self._epsilon
        num_components = components.sum()
        return {
            'name': column,
            'model': vgm,
            'components': components,
            'output_info': [(1, 'tanh', 1), (num_components, 'softmax', 1)],
            'output_dimensions': 1 + num_components,
        }

    def _fit_discrete(self, column, data):
        ohe = OHE(sparse=False)
        ohe.fit(data)
        categories = len(ohe.categories_[0])
        return {
            'name': column,
            'encoder': ohe,
            'output_info': [(categories, 'softmax', 0)],
            'output_dimensions': categories
        }

    def fit(self, data, discrete_columns=tuple()):
        self.output_info = []
        self.output_dimensions = 0
        if not isinstance(data, pd.DataFrame):
            self._is_dataframe = False
            data = pd.DataFrame(data)
        else:
            self._is_dataframe = True
        self._dtypes = data.infer_objects().dtypes
        self._meta = []
        if self._parallel_preprocess:
            args = []
            for column in data.columns: args.append((self._n_clusters, self._epsilon, data, column, discrete_columns))
            res = process_map(fit_component, args, max_workers=2)
            for out_info, out_dim, mets in res:
                self.output_info += out_info
                self.output_dimensions += out_dim
                self._meta.append(mets)
        else:
            for column in tqdm(data.columns, desc='fitting preprocessing', leave=False):
                column_data = data[[column]].values
                if column in discrete_columns:
                    meta = self._fit_discrete(column, column_data)
                else:
                    meta = self._fit_continuous(column, column_data)
                self.output_info += meta['output_info']
                self.output_dimensions += meta['output_dimensions']
                self._meta.append(meta)
    def _transform_continuous(self, column_meta, data):
        components = column_meta['components']
        model = column_meta['model']
        means = model.means_.reshape((1, self._n_clusters))
        stds = np.sqrt(model.covariances_).reshape((1, self._n_clusters))
        features = (data - means) / (4 * stds)
        probs = model.predict_proba(data)
        n_opts = components.sum()
        features = features[:, components]
        probs = probs[:, components]
        opt_sel = np.zeros(len(data), dtype='int')
        for i in range(len(data)):
            norm_probs = probs[i] + 1e-6
            norm_probs = norm_probs / norm_probs.sum()
            opt_sel[i] = np.random.choice(np.arange(n_opts), p=norm_probs)
        idx = np.arange(len(features))
        features = features[idx, opt_sel].reshape([-1, 1])
        features = np.clip(features, -.99, .99)
        probs_onehot = np.zeros_like(probs)
        probs_onehot[np.arange(len(probs)), opt_sel] = 1
        return [features, probs_onehot]

    def _transform_discrete(self, column_meta, data):
        encoder = column_meta['encoder']
        return encoder.transform(data)

    def transform(self, data):
        if not isinstance(data, pd.DataFrame):
            data = pd.DataFrame(data)
        values = []
        if self._parallel_preprocess:
            args = []
            for meta in tqdm(self._meta):
                column_data = data[[meta['name']]].values
                args.append((self, meta, column_data))
            rets = process_map(_transform, args)
            for ret in rets:
                values += ret
        else:
            for meta in tqdm(self._meta):
                column_data = data[[meta['name']]].values
                if 'model' in meta:
                    values += self._transform_continuous(meta, column_data)
                else:
                    values.append(self._transform_discrete(meta, column_data))
        return np.concatenate(values, axis=1).astype(float)
    def _inverse_transform_continuous(self, meta, data, sigma=None):
        model = meta['model']
        components = meta['components']
        mean = data[:, 0]
        variance = data[:, 1:]
        if sigma is not None:
            mean = np.random.normal(mean, sigma)
        mean = np.clip(mean, -1, 1)
        v_t = np.ones((len(data), self._n_clusters)) * -100
        v_t[:, components] = variance
        variance = v_t
        means = model.means_.reshape([-1])
        stds = np.sqrt(model.covariances_).reshape([-1])
        p_argmax = np.argmax(variance, axis=1)
        std_t = stds[p_argmax]
        mean_t = means[p_argmax]
        column = mean * 4 * std_t + mean_t
        return column

    def _inverse_transform_discrete(self, meta, data):
        encoder = meta['encoder']
        return encoder.inverse_transform(data)

    def inverse_transform(self, data, sigmas=None):
        start = 0
        output = []
        column_names = []
        for meta in self._meta:
            dimensions = meta['output_dimensions']
            columns_data = data[:, start:start + dimensions]
            if 'model' in meta:
                sigma = sigmas[start] if sigmas is not None else None
                inverted = self._inverse_transform_continuous(meta, columns_data, sigma)
            else:
                inverted = self._inverse_transform_discrete(meta, columns_data)
            output.append(inverted)
            column_names.append(meta['name'])
            start += dimensions
        output = np.column_stack(output)
        output = pd.DataFrame(output, columns=column_names).astype(self._dtypes)
        if not self._is_dataframe:
            output = output.values
        return output
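# Example of the round trip this class provides (hypothetical DataFrame `df` and
# column name 'city' used only for illustration):
#   dt = DataTransformer(n_clusters=10)
#   dt.fit(df, discrete_columns=('city',))
#   encoded = dt.transform(df)            # ndarray of shape (len(df), dt.output_dimensions)
#   decoded = dt.inverse_transform(encoded)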
def _fit_discrete(column, data):
    ohe = OHE(sparse=False)
    ohe.fit(data)
    categories = len(ohe.categories_[0])
    return {
        'name': column,
        'encoder': ohe,
        'output_info': [(categories, 'softmax', 0)],
        'output_dimensions': categories
    }

def _fit_continuous(_n_clusters, _epsilon, column, data):
    vgm = BGM(
        _n_clusters,
        weight_concentration_prior_type='dirichlet_process',
        weight_concentration_prior=0.001,
        n_init=1
    )
    vgm.fit(data)
    components = vgm.weights_ > _epsilon
    num_components = components.sum()
    return {
        'name': column,
        'model': vgm,
        'components': components,
        'output_info': [(1, 'tanh', 1), (num_components, 'softmax', 1)],
        'output_dimensions': 1 + num_components,
    }

def fit_component(args):
    _n_clusters, _epsilon, data, column, discrete_columns = args
    column_data = data[[column]].values
    if column in discrete_columns:
        meta = _fit_discrete(column, column_data)
    else:
        meta = _fit_continuous(_n_clusters, _epsilon, column, column_data)
    return meta['output_info'], meta['output_dimensions'], meta

def _transform(args):
    self, meta, column_data = args
    if 'model' in meta:
        return self._transform_continuous(meta, column_data)
    else:
        return [self._transform_discrete(meta, column_data)]
class TabularGAN(object):
    def __init__(self, file_path=None, log_dir=None, z_dim=128, pac=10, gen_dim=(256, 256), crt_dim=(256, 256), l2_scale=1e-6, batch_size=500, gp_lambda=10.0, tau=0.2, parallel=False, transformer=None):
        if file_path is not None: self.load(file_path)
        else:
            if log_dir is not None and not os.path.exists(log_dir): raise IsADirectoryError("Log directory does not exist.")
            if batch_size % 2 != 0 or batch_size % pac != 0: raise ValueError("batch_size needs to be an even value divisible by pac.")
            self._parallel_preprocess = parallel
            self._log_dir = log_dir
            self._z_dim = z_dim
            self._pac = pac
            self._pac_dim = None
            self._l2_scale = l2_scale
            self._batch_size = batch_size
            self._gp_lambda = gp_lambda
            self._tau = tau
            self._gen_dim = tuple(gen_dim)
            self._crt_dim = tuple(crt_dim)
            self._g_opt = Adam(learning_rate=2e-4, beta_1=0.5, beta_2=0.9, epsilon=1e-08)
            self._c_opt = Adam(learning_rate=2e-4, beta_1=0.5, beta_2=0.9, epsilon=1e-08)
            self._transformer = DataTransformer(parallel_preprocess=self._parallel_preprocess) if transformer is None else transformer
            self._need_transform = transformer is None
            self._data_sampler = None
            self._cond_generator = None
            self._generator = None
            self._critic = None
            self.fitted = False
    def fit(self, train_data, categorical_cols=tuple(), epochs=300, log_frequency=True):
        if categorical_cols is None: categorical_cols = [i for i in list(set(train_data.columns) - set(train_data._get_numeric_data().columns))]
        if self._need_transform: self._transformer.fit(train_data, categorical_cols)
        # pkl_dump(self._transformer, 'ctgan_dump')
        train_data = self._transformer.transform(train_data)
        self._transformer.generate_tensors()
        self._data_sampler = DataSampler(train_data, self._transformer.output_info)
        data_dim = self._transformer.output_dimensions
        self._cond_generator = ConditionalGenerator(train_data, self._transformer.output_info, log_frequency)
        self._generator = Generator(self._z_dim + self._cond_generator.n_opt, self._gen_dim, data_dim, self._transformer.output_tensor, self._tau)
        self._critic = Critic(data_dim + self._cond_generator.n_opt, self._crt_dim, self._pac)
        metrics = {'g_loss': tf.metrics.Mean(), 'cond_loss': tf.metrics.Mean(), 'c_loss': tf.metrics.Mean(), 'gp': tf.metrics.Mean()}
        if self._log_dir is not None:
            current_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
            train_log_dir = self._log_dir + '/gradient_tape/' + current_time + '/train'
            train_summary_writer = tf.summary.create_file_writer(train_log_dir)
        self._generator.build((self._batch_size, self._generator._input_dim))
        self._critic.build((self._batch_size, self._critic._input_dim))
        steps_per_epoch = max(len(train_data) // self._batch_size, 1)
        for epoch in range(epochs):
            p_bar = ProgressBar(len(train_data), self._batch_size, epoch, epochs, metrics)
            for _ in range(steps_per_epoch):
                c_loss, g_p = self._train_c()
                metrics['c_loss'](c_loss)
                metrics['gp'](g_p)
                g_loss, cond_loss = self._train_g()
                metrics['g_loss'](g_loss)
                metrics['cond_loss'](cond_loss)
                p_bar._update(metrics)
            if self._log_dir is not None:
                with train_summary_writer.as_default():
                    for met in metrics:
                        tf.summary.scalar(met, metrics[met].result(), step=epoch)
                        metrics[met].reset_states()
            # p_bar.close()
            del p_bar
        self.fitted = True
    @tf.function
    def train_c_step(self, fake_cat, real_cat):
        with tf.GradientTape() as tape:
            y_fake = self._critic(fake_cat, training=True)
            y_real = self._critic(real_cat, training=True)
            g_p = gradient_penalty(partial(self._critic, training=True), real_cat, fake_cat, self._pac, self._gp_lambda)
            loss = -(tf.reduce_mean(y_real) - tf.reduce_mean(y_fake))
            c_loss = loss + g_p
        grad = tape.gradient(c_loss, self._critic.trainable_variables)
        self._c_opt.apply_gradients(zip(grad, self._critic.trainable_variables))
        return loss, g_p

    def _train_c(self):
        fake_z = tf.random.normal([self._batch_size, self._z_dim])
        cond_vec = self._cond_generator.sample(self._batch_size)
        if cond_vec is None:
            _, _, col_idx, opt_idx = None, None, None, None
            real = self._data_sampler.sample(self._batch_size, col_idx, opt_idx)
        else:
            cond, _, col_idx, opt_idx = cond_vec
            cond = tf.convert_to_tensor(cond)
            fake_z = tf.concat([fake_z, cond], 1)
            perm = np.arange(self._batch_size)
            np.random.shuffle(perm)
            real = self._data_sampler.sample(self._batch_size, col_idx[perm], opt_idx[perm])
            cond_perm = tf.gather(cond, perm)
        fake, fake_act = self._generator(fake_z, training=True)
        real = tf.convert_to_tensor(real.astype('float32'))
        if cond_vec is not None:
            fake_cat = tf.concat([fake_act, cond], 1)
            real_cat = tf.concat([real, cond_perm], 1)
        else:
            # Feed the activated generator output so fake samples live in the same space as real data.
            fake_cat = fake_act
            real_cat = real
        return self.train_c_step(fake_cat, real_cat)
    @tf.function
    def train_g_step(self, fake_z):
        with tf.GradientTape() as tape:
            _, fake_act = self._generator(fake_z, training=True)
            y_fake = self._critic(fake_act, training=True)
            g_loss = -tf.reduce_mean(y_fake)
        weights = self._generator.trainable_variables
        grad = tape.gradient(g_loss, weights)
        grad = [grad[i] + self._l2_scale * weights[i] for i in range(len(grad))]
        self._g_opt.apply_gradients(zip(grad, self._generator.trainable_variables))
        return g_loss, tf.constant(0, dtype=tf.float32)

    @tf.function
    def train_g_cond_step(self, fake_z, cond, mask, cond_info):
        with tf.GradientTape() as tape:
            fake, fake_act = self._generator(fake_z, training=True)
            y_fake = self._critic(tf.concat([fake_act, cond], 1), training=True)
            cond_loss = conditional_loss(cond_info, fake, cond, mask)
            g_loss = -tf.reduce_mean(y_fake) + cond_loss
        weights = self._generator.trainable_variables
        grad = tape.gradient(g_loss, weights)
        grad = [grad[i] + self._l2_scale * weights[i] for i in range(len(grad))]
        self._g_opt.apply_gradients(zip(grad, self._generator.trainable_variables))
        return g_loss, cond_loss

    def _train_g(self):
        fake_z = tf.random.normal([self._batch_size, self._z_dim])
        cond_vec = self._cond_generator.sample(self._batch_size)
        if cond_vec is None: return self.train_g_step(fake_z)
        cond, mask, _, _ = cond_vec
        cond = tf.convert_to_tensor(cond, name="c1")
        mask = tf.convert_to_tensor(mask, name="m1")
        fake_z = tf.concat([fake_z, cond], 1, name="fake_z")
        return self.train_g_cond_step(fake_z, cond, mask, self._transformer.cond_tensor)
    def predict(self, n_samples=1000):
        if n_samples <= 0: raise ValueError("Invalid number of samples.")
        steps = n_samples // self._batch_size + 1
        data = []
        for _ in tf.range(steps):
            fake_z = tf.random.normal([self._batch_size, self._z_dim])
            cond_vec = self._cond_generator.sample_zero(self._batch_size)
            if cond_vec is not None:
                cond = tf.constant(cond_vec)
                fake_z = tf.concat([fake_z, cond], 1)
            fake = self._generator(fake_z)[1]
            data.append(fake.numpy())
        data = np.concatenate(data, 0)
        data = data[:n_samples]
        return self._transformer.inverse_transform(data, None)
    def save(self, file_path, overwrite=False):
        if file_path is None or len(file_path) == 0: raise NameError("Invalid file_path.")
        dir_name = os.path.dirname(file_path)
        if len(dir_name) and not os.path.exists(dir_name): raise NotADirectoryError("The file directory does not exist.")
        if not overwrite and os.path.exists(file_path): raise FileExistsError("File already exists. If you wish to replace it, use overwrite=True")
        class_dict = {k: v for k, v in self.__dict__.items() if type(v) in [int, float, tuple]}
        class_dict['_cond_generator'] = self._cond_generator.__dict__
        class_dict['_transformer'] = self._transformer.__dict__
        class_dict['_gen_weights'] = self._generator.get_weights()
        joblib.dump(class_dict, file_path)
        del class_dict

    def load(self, file_path):
        if file_path is None or len(file_path) == 0: raise NameError("Invalid file_path.")
        if not os.path.exists(file_path): raise FileNotFoundError("The provided file_path does not exist.")
        class_dict = joblib.load(file_path)
        if class_dict is None: raise AttributeError
        for key, value in class_dict.items():
            if type(value) in [int, float, tuple]: setattr(self, key, value)
        self._transformer = DataTransformer.from_dict(class_dict['_transformer'])
        self._cond_generator = ConditionalGenerator.from_dict(class_dict['_cond_generator'])
        self._generator = Generator(self._z_dim + self._cond_generator.n_opt, self._gen_dim, self._transformer.output_dimensions, self._transformer.output_tensor, self._tau)
        self._generator.build((self._batch_size, self._generator._input_dim))
        self._generator.set_weights(class_dict['_gen_weights'])
    def transform(self, X, augmentation_proba=0.2, concat=False):
        X_aug = X.copy()
        if not self.fitted: self.fit(X_aug)
        auged_data = self.predict(n_samples=int(len(X_aug) * augmentation_proba))
        # Pick as many row positions as synthetic rows were generated, then overwrite them in place.
        aug_indices = np.random.choice(np.arange(len(X_aug)), len(auged_data), replace=False)
        if concat: X_aug = pd.concat([X_aug, auged_data])
        elif isinstance(X_aug, pd.DataFrame): X_aug.iloc[aug_indices] = np.asarray(auged_data)
        else: X_aug[aug_indices] = np.asarray(auged_data)
        return X_aug

    def augment(self, X, augmentation_proba=0.2): return self.transform(X, augmentation_proba=augmentation_proba)
class DataSampler(object):
    def __init__(self, data, output_info):
        super(DataSampler, self).__init__()
        self._data = data
        self._model = []
        self._n = len(data)
        st_idx = 0
        skip = False
        for item in output_info:
            if item[1] == 'tanh':
                st_idx += item[0]
                skip = True
            elif item[1] == 'softmax':
                if skip:
                    skip = False
                    st_idx += item[0]
                    continue
                ed_idx = st_idx + item[0]
                tmp = []
                for j in range(item[0]):
                    tmp.append(np.nonzero(data[:, st_idx + j])[0])
                self._model.append(tmp)
                st_idx = ed_idx
            else:
                assert 0
        assert st_idx == data.shape[1]

    def sample(self, n_samples, col_idx, opt_idx):
        if col_idx is None:
            idx = np.random.choice(np.arange(self._n), n_samples)
            return self._data[idx]
        idx = []
        for col, opt in zip(col_idx, opt_idx):
            idx.append(np.random.choice(self._model[col][opt]))
        return self._data[idx]
def init_bounded(shape, **kwargs):
    if 'dim' not in kwargs: raise AttributeError('dim not passed as input')
    if 'dtype' not in kwargs: raise AttributeError('dtype not passed as input')
    dim = kwargs['dim']
    d_type = kwargs['dtype']
    bound = 1 / math.sqrt(dim)
    return tf.random.uniform(shape, minval=-bound, maxval=bound, dtype=d_type)
class Critic(Model):
    def __init__(self, input_dim, dis_dims, pac):
        super(Critic, self).__init__()
        self._pac = pac
        self._input_dim = input_dim
        self._model = [self._reshape_func]
        dim = input_dim * self._pac
        for layer_dim in list(dis_dims):
            self._model += [
                Dense(layer_dim, input_dim=(dim,),
                      kernel_initializer=partial(init_bounded, dim=dim),
                      bias_initializer=partial(init_bounded, dim=dim)),
                LeakyReLU(0.2),
                Dropout(0.5)]
            dim = layer_dim
        layer_dim = 1
        self._model += [Dense(layer_dim, input_dim=(dim,),
                              kernel_initializer=partial(init_bounded, dim=dim),
                              bias_initializer=partial(init_bounded, dim=dim))]

    def _reshape_func(self, inputs, **kwargs):
        dims = inputs.get_shape().as_list()
        return tf.reshape(inputs, [-1, dims[1] * self._pac])

    def call(self, inputs, **kwargs):
        outputs = inputs
        for layer in self._model:
            outputs = layer(outputs, **kwargs)
        return outputs
class ProgressBar(keras.utils.generic_utils.Progbar):
    @classmethod
    def _get_terminal_width(cls):
        width = shutil.get_terminal_size(fallback=(200, 24))[0]
        return width if width != 0 else 120

    def __init__(self, total_samples, batch_size, epoch, num_epochs, metrics):
        self._seen = 0
        self.log_values = []
        postfix = {m: f'{0:6.3f}' for m in metrics}
        postfix[1] = 1
        str_format = '{n_fmt}/{total_fmt} |{bar}| {rate_fmt} ' \
                     'ETA: {remaining} Elapsed Time: {elapsed} ' + \
                     reduce(lambda x, y: x + y,
                            ["%s:{postfix[%s]} " % (m, m) for m in metrics],
                            "")
        super(ProgressBar, self).__init__(
            target=(total_samples // batch_size) * batch_size)
        self._batch_size = batch_size

    def _update(self, metrics):
        new_metrics = {}
        for met in metrics: new_metrics[met] = metrics[met].result()
        for k in new_metrics:
            self.log_values.append((k, new_metrics[k]))
        self._seen += self._batch_size
        super(ProgressBar, self).update(self._seen, self.log_values)
def gradient_penalty(func, real, fake, pac=10, gp_lambda=10.0):
    # WGAN-GP penalty: interpolate between real and fake samples (one shared alpha
    # per pac group) and push the critic's gradient norm towards 1.
    alpha = tf.random.uniform([real.shape[0] // pac, 1, 1], 0., 1.)
    alpha = tf.tile(alpha, tf.constant([1, pac, real.shape[1]], tf.int32))
    alpha = tf.reshape(alpha, [-1, real.shape[1]])
    interpolates = alpha * real + ((1 - alpha) * fake)
    with tf.GradientTape() as tape:
        tape.watch(interpolates)
        pred = func(interpolates)
    grad = tape.gradient(pred, [interpolates])[0]
    grad = tf.reshape(grad, tf.constant([-1, pac * real.shape[1]], tf.int32))
    slopes = tf.math.reduce_euclidean_norm(grad, axis=1)
    return tf.reduce_mean((slopes - 1.) ** 2) * gp_lambda
def conditional_loss(cond_info, data, cond, mask):
    # Cross-entropy between each sampled condition and the generator's logits for
    # that categorical block, masked so only the conditioned column contributes.
    shape = tf.shape(mask)
    c_loss = tf.zeros(shape)
    for item in cond_info:
        data_logsoftmax = data[:, item[0]:item[1]]
        cond_vec = tf.math.argmax(cond[:, item[2]:item[3]], 1)
        loss = tf.reshape(tf.nn.sparse_softmax_cross_entropy_with_logits(cond_vec, data_logsoftmax), [-1, 1])
        c_loss = tf.concat([c_loss[:, :item[-1]], loss, c_loss[:, item[-1] + 1:]], 1)
    return tf.reduce_sum(c_loss * mask) / tf.cast(shape[0], dtype=tf.float32)
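
# ---------------------------------------------------------------------------
# Minimal usage sketch. The toy DataFrame, column names, and output path below
# are made-up placeholders that only illustrate the intended call sequence
# (fit -> predict -> save / augment) of the classes defined above.
# ---------------------------------------------------------------------------
if __name__ == '__main__':
    # Toy table: two numeric columns and one categorical column (placeholder data).
    rng = np.random.default_rng(0)
    df = pd.DataFrame({
        'age': rng.normal(40, 10, size=1000),
        'income': rng.lognormal(10, 1, size=1000),
        'city': rng.choice(['a', 'b', 'c'], size=1000),
    })
    gan = TabularGAN(batch_size=500, pac=10)
    gan.fit(df, categorical_cols=('city',), epochs=5)    # a few epochs just to exercise the code
    synthetic = gan.predict(n_samples=100)                # synthetic rows in the original schema
    print(synthetic.head())
    gan.save('tabular_gan.joblib', overwrite=True)        # placeholder output path
    augmented = gan.augment(df, augmentation_proba=0.2)   # ~20% of rows replaced with synthetic ones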