Data Augmentation using cGAN

The idea is to generate new and realistic fetures based on labels. GANs are excellent at generating realistic data. We can condition this generation by using Conditional Generative Adversarial Networks

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
latent_dim = 100 # dimension of the latent space
n_samples = 1000 # size of our dataset
n_classes = 3
n_features = 2 # we use 2 features since we'd like to visualize them

We start by creating random clusters of points, n_classes, with features, n_features. We make use of make_blobs from scikit learn that generates gaussian blobs

from sklearn.datasets import make_blobs

X, y = make_blobs(n_samples=n_samples, centers=n_classes, n_features=n_features, random_state=123)

print('Size of our dataset:', len(X))
print('Number of features:', X.shape[1])
print('Classes:', set(y))
Size of our dataset: 1000
Number of features: 2
Classes: {0, 1, 2}

Following we normalize our features to help with the learning

from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler(feature_range=(-1, 1))

scaled_X = scaler.fit_transform(X)
fig, ax = plt.subplots(figsize=(15, 4))
legend = []

for i in range(n_classes):
    plt.scatter(scaled_X[:, 0][np.where(y==i)], scaled_X[:, 1][np.where(y==i)], )
    legend.append('Class %d' % i)

ax.set_xlabel('Feature 1')
ax.set_ylabel('Feature 2')
# Core layers
from keras.layers \
    import Activation, Dropout, Flatten, Dense, Input, LeakyReLU

# Normalization layers
from keras.layers import BatchNormalization

# Merge layers
from keras.layers import concatenate, multiply

# Embedding Layers
from keras.layers import Embedding

# Keras models
from keras.models import Model, Sequential

# Keras optimizers
from keras.optimizers import Adam, RMSprop, SGD
def build_discriminator(optimizer=Adam(0.0002, 0.5)):
    Defines and compiles discriminator model.
    This architecture has been inspired by:
    and adapted for this problem.
        optimizer=Adam(0.0002, 0.5) - recommended values
    features = Input(shape=(n_features,))
    label = Input(shape=(1,), dtype='int32')
    # Using an Embedding layer is recommended by the papers
    label_embedding = Flatten()(Embedding(n_classes, n_features)(label))
    # We condition the discrimination of generated features 
    inputs = multiply([features, label_embedding])
    x = Dense(512)(inputs)
    x = LeakyReLU(alpha=0.2)(x)
    x = Dense(512)(x)
    x = LeakyReLU(alpha=0.2)(x)
    x = Dropout(0.4)(x)
    x = Dense(512)(x)
    x = LeakyReLU(alpha=0.2)(x)
    x = Dropout(0.4)(x)
    valid = Dense(1, activation='sigmoid')(x)
    model = Model([features, label], valid)
    model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])

    return model
def build_generator():
    Defines the generator model.
    This architecture has been inspired by:
    and adapted for this problem.
    noise = Input(shape=(latent_dim,))
    label = Input(shape=(1,), dtype='int32')
    # Using an Embedding layer is recommended by the papers
    label_embedding = Flatten()(Embedding(n_classes, latent_dim)(label))
    # We condition the generation of features
    inputs = multiply([noise, label_embedding])
    x = Dense(256)(inputs)
    x = LeakyReLU(alpha=0.2)(x)
    x = BatchNormalization(momentum=0.8)(x)
    x = Dense(512)(x)
    x = LeakyReLU(alpha=0.2)(x)
    x = BatchNormalization(momentum=0.8)(x)
    x = Dense(1024)(x)
    x = LeakyReLU(alpha=0.2)(x)
    x = BatchNormalization(momentum=0.8)(x)
    features = Dense(n_features, activation='tanh')(x)
    model = Model([noise, label], features)

    return model
def build_gan(generator, discriminator, optimizer=Adam(0.0002, 0.5)):
    Defines and compiles GAN model. It bassically chains Generator
    and Discriminator in an assembly-line sort of way where the input is
    the Generator's input. The Generator's output is the input of the Discriminator,
    which outputs the output of the whole GAN.
        optimizer=Adam(0.0002, 0.5) - recommended values
    noise = Input(shape=(latent_dim,))
    label = Input(shape=(1,))
    features = generator([noise, label])
    valid = discriminator([features, label])
    # We freeze the discriminator's layers since we're only 
    # interested in the generator and its learning
    discriminator.trainable = False
    model = Model([noise, label], valid)
    model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])
    return model
discriminator = build_discriminator()
Layer (type)                    Output Shape         Param #     Connected to                     
input_2 (InputLayer)            (None, 1)            0                                            
embedding_1 (Embedding)         (None, 1, 2)         6           input_2[0][0]                    
input_1 (InputLayer)            (None, 2)            0                                            
flatten_1 (Flatten)             (None, 2)            0           embedding_1[0][0]                
multiply_1 (Multiply)           (None, 2)            0           input_1[0][0]                    
dense_1 (Dense)                 (None, 512)          1536        multiply_1[0][0]                 
leaky_re_lu_1 (LeakyReLU)       (None, 512)          0           dense_1[0][0]                    
dense_2 (Dense)                 (None, 512)          262656      leaky_re_lu_1[0][0]              
leaky_re_lu_2 (LeakyReLU)       (None, 512)          0           dense_2[0][0]                    
dropout_1 (Dropout)             (None, 512)          0           leaky_re_lu_2[0][0]              
dense_3 (Dense)                 (None, 512)          262656      dropout_1[0][0]                  
leaky_re_lu_3 (LeakyReLU)       (None, 512)          0           dense_3[0][0]                    
dropout_2 (Dropout)             (None, 512)          0           leaky_re_lu_3[0][0]              
dense_4 (Dense)                 (None, 1)            513         dropout_2[0][0]                  
Total params: 527,367
Trainable params: 527,367
Non-trainable params: 0
generator = build_generator()
Layer (type)                    Output Shape         Param #     Connected to                     
input_4 (InputLayer)            (None, 1)            0                                            
embedding_2 (Embedding)         (None, 1, 100)       300         input_4[0][0]                    
input_3 (InputLayer)            (None, 100)          0                                            
flatten_2 (Flatten)             (None, 100)          0           embedding_2[0][0]                
multiply_2 (Multiply)           (None, 100)          0           input_3[0][0]                    
dense_5 (Dense)                 (None, 256)          25856       multiply_2[0][0]                 
leaky_re_lu_4 (LeakyReLU)       (None, 256)          0           dense_5[0][0]                    
batch_normalization_1 (BatchNor (None, 256)          1024        leaky_re_lu_4[0][0]              
dense_6 (Dense)                 (None, 512)          131584      batch_normalization_1[0][0]      
leaky_re_lu_5 (LeakyReLU)       (None, 512)          0           dense_6[0][0]                    
batch_normalization_2 (BatchNor (None, 512)          2048        leaky_re_lu_5[0][0]              
dense_7 (Dense)                 (None, 1024)         525312      batch_normalization_2[0][0]      
leaky_re_lu_6 (LeakyReLU)       (None, 1024)         0           dense_7[0][0]                    
batch_normalization_3 (BatchNor (None, 1024)         4096        leaky_re_lu_6[0][0]              
dense_8 (Dense)                 (None, 2)            2050        batch_normalization_3[0][0]      
Total params: 692,270
Trainable params: 688,686
Non-trainable params: 3,584
gan = build_gan(generator, discriminator)
Layer (type)                    Output Shape         Param #     Connected to                     
input_5 (InputLayer)            (None, 100)          0                                            
input_6 (InputLayer)            (None, 1)            0                                            
model_2 (Model)                 (None, 2)            692270      input_5[0][0]                    
model_1 (Model)                 (None, 1)            527367      model_2[1][0]                    
Total params: 1,219,637
Trainable params: 688,686
Non-trainable params: 530,951
def get_random_batch(X, y, batch_size):
    Will return random batches of size batch_size
        X: numpy array - features
        y: numpy array - classes
        batch_size: Int
    idx = np.random.randint(0, len(X))
    X_batch = X[idx:idx+batch_size]
    y_batch = y[idx:idx+batch_size]
    return X_batch, y_batch
def train_gan(gan, generator, discriminator, 
              X, y, 
              n_epochs=1000, batch_size=32, 
              hist_every=10, log_every=100):
    Trains discriminator and generator (last one through the GAN) 
    separately in batches of size batch_size. The training goes as follow:
        1. Discriminator is trained with real features from our training data
        2. Discriminator is trained with fake features generated by the Generator
        3. GAN is trained, which will only change the Generator's weights.
        gan: GAN model
        generator: Generator model
        discriminator: Discriminator model
        X: numpy array - features
        y: numpy array - classes
        n_epochs: Int
        batch_size: Int
        hist_every: Int - will save the training loss and accuracy every hist_every epochs
        log_every: Int - will output the loss and accuracy every log_every epochs
        loss_real_hist: List of Floats
        acc_real_hist: List of Floats
        loss_fake_hist: List of Floats
        acc_fake_hist: List of Floats
        loss_gan_hist: List of Floats
        acc_gan_hist: List of Floats
    half_batch = int(batch_size / 2)
    acc_real_hist = []
    acc_fake_hist = []
    acc_gan_hist = []
    loss_real_hist = []
    loss_fake_hist = []
    loss_gan_hist = []
    for epoch in range(n_epochs):
        X_batch, labels = get_random_batch(X, y, batch_size)
        # train with real values
        y_real = np.ones((X_batch.shape[0], 1))
        loss_real, acc_real = discriminator.train_on_batch([X_batch, labels], y_real)
        # train with fake values
        noise = np.random.uniform(0, 1, (labels.shape[0], latent_dim))
        X_fake = generator.predict([noise, labels])
        y_fake = np.zeros((X_fake.shape[0], 1))
        loss_fake, acc_fake = discriminator.train_on_batch([X_fake, labels], y_fake)
        y_gan = np.ones((labels.shape[0], 1))
        loss_gan, acc_gan = gan.train_on_batch([noise, labels], y_gan)
        if (epoch+1) % hist_every == 0:

        if (epoch+1) % log_every == 0:
            lr = 'loss real: {:.3f}'.format(loss_real)
            ar = 'acc real: {:.3f}'.format(acc_real)
            lf = 'loss fake: {:.3f}'.format(loss_fake)
            af = 'acc fake: {:.3f}'.format(acc_fake)
            lg = 'loss gan: {:.3f}'.format(loss_gan)
            ag = 'acc gan: {:.3f}'.format(acc_gan)

            print('{}, {} | {}, {} | {}, {}'.format(lr, ar, lf, af, lg, ag))
    return loss_real_hist, acc_real_hist, loss_fake_hist, acc_fake_hist, loss_gan_hist, acc_gan_hist
loss_real_hist, acc_real_hist, \
loss_fake_hist, acc_fake_hist, \
loss_gan_hist, acc_gan_hist = train_gan(gan, generator, discriminator, scaled_X, y)
loss real: 0.371, acc real: 1.000 | loss fake: 0.238, acc fake: 1.000 | loss gan: 1.901, acc gan: 0.000
loss real: 0.080, acc real: 0.969 | loss fake: 0.021, acc fake: 1.000 | loss gan: 5.036, acc gan: 0.094
loss real: 0.701, acc real: 0.781 | loss fake: 0.832, acc fake: 0.344 | loss gan: 1.857, acc gan: 0.469
loss real: 0.704, acc real: 0.406 | loss fake: 0.727, acc fake: 0.250 | loss gan: 0.715, acc gan: 0.281
loss real: 0.704, acc real: 0.438 | loss fake: 0.702, acc fake: 0.406 | loss gan: 0.694, acc gan: 0.438
loss real: 0.702, acc real: 0.375 | loss fake: 0.703, acc fake: 0.438 | loss gan: 0.701, acc gan: 0.438
loss real: 0.695, acc real: 0.438 | loss fake: 0.698, acc fake: 0.438 | loss gan: 0.702, acc gan: 0.406
loss real: 0.713, acc real: 0.219 | loss fake: 0.685, acc fake: 0.531 | loss gan: 0.699, acc gan: 0.375
loss real: 0.703, acc real: 0.281 | loss fake: 0.697, acc fake: 0.562 | loss gan: 0.702, acc gan: 0.312
loss real: 0.687, acc real: 0.667 | loss fake: 0.715, acc fake: 0.185 | loss gan: 0.676, acc gan: 0.704
ax, fig = plt.subplots(figsize=(15, 6))
plt.title('Training loss over time')
plt.legend(['Loss real', 'Loss fake', 'Loss GAN'])
ax, fig = plt.subplots(figsize=(15, 6))
plt.title('Training accuracy over time')
plt.legend(['Acc real', 'Acc fake', 'Acc GAN'])
def generate_samples(class_for, n_samples=20):
    Generates new random but very realistic features using
    a trained generator model
        class_for: Int - features for this class
        n_samples: Int - how many samples to generate
    noise = np.random.uniform(0, 1, (n_samples, latent_dim))
    label = np.full((n_samples,), fill_value=class_for)
    return generator.predict([noise, label])

Let's generate new features for class 0

In [19]:
features_class_0 = generate_samples(0)
def visualize_fake_features(fake_features, figsize=(15, 6), color='r'):
    ax, fig = plt.subplots(figsize=figsize)
    # Let's plot our dataset to compare
    for i in range(n_classes):
        plt.scatter(scaled_X[:, 0][np.where(y==i)], scaled_X[:, 1][np.where(y==i)])

    plt.scatter(fake_features[:, 0], fake_features[:, 1], c=color)
    plt.title('Real and fake features')
    plt.legend(['Class 0', 'Class 1', 'Class 2', 'Fake'])
New features for class 1

features_class_1 = generate_samples(1)

New features for class 2

features_class_2 = generate_samples(2)
