# music_trans_8k.py

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import numpy as np
import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()
from tensorflow.python.framework import ops
ops.reset_default_graph()

import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
import librosa
import wave
import pylab


from os import listdir
from os.path import isfile, join


# Number of iterations of the inner training loop per epoch; every iteration
# runs one optimisation step for each instrument.
STEPS_PER_EPOCH = 63

# hyper-parameters
# MU is the mu-law companding parameter and also the number of quantisation
# bins (one-hot depth of the decoder input, channel count of the decoder logits).
MU = 256
# LATENT_DIM is the channel count of the encoder output (the latent code).
LATENT_DIM = 64
# POOL_SIZE is both the window and the stride of the average pooling applied
# to the encoder output along the time axis.
POOL_SIZE = 400

'''
INSTRUMENTS FILE LOAD
'''
#inst_paths can be altered according to where you store your audio; the audio files are all 44.1kHz wav files
# One directory per instrument; the position in this list is the instrument's
# integer label used throughout the rest of the script.
inst_paths = ['./raw_1/pia',
              './raw_1/cla',
              './raw_1/gac']

# inst_files_list[k] holds the paths of the wav files found directly inside
# inst_paths[k] (sub-directories are not searched).
inst_files_list = []
for inst_path in inst_paths:
    # listdir() returns entries in arbitrary order, so sort them to make the
    # dataset order reproducible. Match the '.wav' extension (with the dot) so
    # that names merely ending in the letters "wav" are not picked up.
    inst_files = [join(inst_path, f) for f in sorted(listdir(inst_path))
                  if f.endswith('.wav') and isfile(join(inst_path, f))]
    inst_files_list.append(inst_files)

'''
INSTRUMENTS AUDIO LOAD
'''
#The length of data we choose, for 8kHz file, it is about 1.25 seconds
T = 10000

# inst_waves_list[k] is a float32 array of shape (num_clips, T) holding the
# normalised 8 kHz clips of instrument k.
inst_waves_list = []
# You can change 44100 to the sampling rate you use, we resample the audio data to 8k for data pre-processing
inst_index = 0
for inst_files in inst_files_list:

    waves = []
    for inst_file in inst_files:
        # `sr` is passed by keyword: it is a keyword-only argument in
        # librosa >= 0.10 and the keyword form also works on older releases.
        audio_44k, sr = librosa.load(inst_file, sr=44100)
        audio_8k = librosa.resample(y=audio_44k, orig_sr=sr, target_sr=8000)
        # Only the first T samples of every file are used.
        wave0 = audio_8k[0:T]
        # Skip files that are too short to fill the whole T-sample window.
        if len(wave0) != T:
            continue
        # Normalise by the largest sample value. A clip whose peak is not
        # strictly positive (or is NaN) cannot be normalised this way: the
        # division would produce inf/NaN, so such a clip is skipped.
        peak = np.max(wave0)
        if not peak > 0:
            continue
        waves.append(np.nan_to_num(np.float32(wave0 / peak)))

    # np.stack() on an empty list fails with an unhelpful message; report
    # which instrument has no usable audio instead.
    if not waves:
        raise ValueError('No usable audio clip (at least %d samples at 8 kHz) '
                         'found for instrument index %d' % (T, inst_index))
    waves = np.stack(waves)
    inst_waves_list.append(waves)
    inst_index += 1

for waves in inst_waves_list:
    print(waves.shape)

# Number of instruments (domains); also the number of decoders that are built.
INSTRUMENTS_NUM = len(inst_waves_list)
'''
DEFINE WAVENET FUNCTIONS
'''
#The layers we use; they are almost the same as in the original code
def mulaw(x, MU):
    """Apply mu-law companding to the tensor `x`.

    Computes sign(x) * log(1 + MU * |x|) / log(1 + MU) element-wise.
    """
    magnitude = tf.abs(x)
    compressed = tf.sign(x) * tf.log(1. + MU * magnitude)
    return compressed / tf.log(1. + MU)

def inv_mulaw(x, MU):
    """Invert mu-law companding on the tensor `x`.

    Computes sign(x) * (1 / MU) * ((1 + MU) ** |x| - 1) element-wise.
    """
    expanded = tf.pow(1. + MU, tf.abs(x)) - 1.
    return tf.sign(x) * (1. / MU) * expanded
    
def naive_wavenet(inputs, condition, layers, h_filters, out_filters, name='naive_wavenet', reuse=False):
    """Build a stack of gated, dilated 1-D convolutions inside variable scope `name`.

    Args:
        inputs: rank-3 tensor; axis 1 is the axis that gets left-padded and
            convolved over, the last axis holds the channels.
        condition: optional tensor added (after a dense projection to
            `h_filters` units) to the filter and gate pre-activations of
            every layer, or None for an unconditioned stack.
        layers: number of gated layers; layer `k` uses dilation 2 ** k.
        h_filters: channel count of the hidden convolutions and of the
            residual/skip projections.
        out_filters: unit count of the two final dense layers.
        name: variable scope that holds all variables of this stack.
        reuse: forwarded to tf.variable_scope so an already-built stack can
            be rebuilt on other inputs with the same variables.

    Returns:
        (dilation_sum, outputs): `dilation_sum` is 1 plus the sum of all
        layer dilations; `outputs` is the result of the final linear dense
        layer with `out_filters` units.

    NOTE(review): none of the layers is given an explicit name, so
    TensorFlow presumably derives the variable names from the order in which
    the layers are created inside the scope. Reordering the calls below
    would then break checkpoint restore and reuse=True -- confirm before
    refactoring.
    """
    with tf.variable_scope(name, reuse=reuse):
        
        # Left-pad axis 1 by one step so the kernel-size-2 convolution below
        # (default 'valid' padding) returns as many steps as `inputs` has.
        outputs = tf.pad(inputs, [[0, 0], [1, 0], [0, 0]])
        outputs = tf.layers.conv1d(inputs=outputs, filters=h_filters, 
                                       kernel_size=2, dilation_rate=1, use_bias=False)
        # Starts at 1 to account for the initial convolution above.
        dilation_sum = 1
        skips = []
        #for _ in range(2):
        for layer in range(layers):
            dilation = 2 ** layer
            dilation_sum += dilation
            # Left-pad by the dilation so the dilated convolutions keep the length.
            layer_outputs = tf.pad(outputs, [[0, 0], [dilation, 0], [0, 0]])
            filter_outputs = tf.layers.conv1d(inputs=layer_outputs, filters=h_filters, 
                                       kernel_size=2, dilation_rate=dilation, use_bias=False)
            gate_outputs = tf.layers.conv1d(inputs=layer_outputs, filters=h_filters, 
                                       kernel_size=2, dilation_rate=dilation, use_bias=False)
            if condition is not None:
                # Separate projections of the condition for the filter and the gate.
                filter_condition = tf.layers.dense(condition, h_filters)
                gate_condition = tf.layers.dense(condition, h_filters)
            else:
                filter_condition = 0
                gate_condition = 0

            # Gated activation unit: tanh(filter) * sigmoid(gate).
            layer_outputs = tf.nn.tanh(filter_outputs + filter_condition) * \
                            tf.nn.sigmoid(gate_outputs + gate_condition)

            # Residual connection feeding the next layer.
            residual = tf.layers.dense(layer_outputs, h_filters)
            outputs += residual

            # Skip connection collected for the output head.
            skip = tf.layers.dense(layer_outputs, h_filters)
            skips.append(skip)

        # Output head: sum of all skips -> ReLU -> dense(ReLU) -> dense(linear).
        outputs = tf.nn.relu(sum(skips))
        outputs = tf.layers.dense(outputs, out_filters, activation=tf.nn.relu)
        outputs = tf.layers.dense(outputs, out_filters, activation=None)

    return dilation_sum, outputs

def downsample(inputs, pool_size, channel):
    """Average-pool `inputs` with window and stride both equal to `pool_size`.

    `channel` is accepted for symmetry with `upsample` but is not used.

    Returns:
        (pad_size, outputs): `pad_size` is always 0 because no padding is
        applied; `outputs` is the pooled tensor.
    """
    pooled = tf.layers.average_pooling1d(inputs=inputs,
                                         pool_size=pool_size,
                                         strides=pool_size)
    return 0, pooled

def upsample(inputs, output_size, channel):
    """Stretch `inputs` along axis 1 to `output_size` steps by nearest-neighbour resizing.

    The tensor is temporarily given an extra axis of size 1 so that the image
    resize op can be used; `channel` restores the static size of the last
    axis afterwards.
    """
    as_image = tf.expand_dims(inputs, axis=1)
    stretched = tf.image.resize_nearest_neighbor(as_image, [1, output_size])
    stretched = tf.squeeze(stretched, axis=1)
    # Re-state the shape with an explicit channel count on the last axis.
    dynamic_shape = tf.shape(stretched)
    stretched = tf.reshape(stretched, [dynamic_shape[0], dynamic_shape[1], channel])
    return stretched[:, -output_size:]

#We name the variables in the domain confusion layer. Very important!
def domain_confusion(inputs, layers, domain_num, h_filters):
    """Build the domain (instrument) classifier under the "discriminator" scope.

    Args:
        inputs: tensor fed to the first convolution (the latent sequence).
        layers: number of kernel-size-2, non-dilated ELU convolutions.
        domain_num: number of domains; unit count of both dense layers.
        h_filters: channel count of the convolutions.

    Returns:
        Per-step logits whose last axis has `domain_num` entries.

    All variables live in the "discriminator" scope so that they can be
    selected by scope name when building the discriminator's optimizer.
    """
    with tf.variable_scope("discriminator"):
        outputs = inputs
        # The convolutions are deliberately not dilated (dilation_rate=1). The
        # previous, never-used `dilation = 2 ** layers` local was removed
        # because it suggested otherwise.
        for _ in range(layers):
            outputs = tf.layers.conv1d(inputs=outputs, filters=h_filters, kernel_size=2,
                                       dilation_rate=1, activation=tf.nn.elu)

        outputs = tf.layers.dense(outputs, domain_num, activation=tf.nn.tanh)
        outputs = tf.layers.dense(outputs, domain_num)
        return outputs

class FlipGradientBuilder(object):
    """Callable that creates gradient-reversal layers.

    Each call returns an identity of its input whose gradient is negated and
    scaled by `l`. Every call registers its gradient function under a fresh
    name built from a running call counter.
    """

    def __init__(self):
        # Number of reversal layers built so far; used to build unique names.
        self.num_calls = 0

    def __call__(self, x, l=1.0):
        grad_name = "flipGradient%d" % self.num_calls

        def _reversed_gradient(op, grad):
            # Backward pass: negate the incoming gradient and scale it by l.
            return [tf.negative(grad) * l]

        ops.RegisterGradient(grad_name)(_reversed_gradient)

        # Forward pass: a plain identity whose gradient is overridden.
        graph = tf.get_default_graph()
        with graph.gradient_override_map({"Identity": grad_name}):
            flipped = tf.identity(x)

        self.num_calls += 1
        return flipped


# Module-level builder instance used when drawing the graph.
flip_gradient1 = FlipGradientBuilder()

#We make several important changes in how the graph is drawn

'''

DRAW GRAPH

'''

tf.reset_default_graph()

'''

INPUT LAYER

'''
# wave input: float waveforms fed as a 2-D [batch, time] array
x_holder = tf.placeholder(dtype=tf.float32, shape=[None, None])
x_mulaw = mulaw(x_holder, MU)
# Quantise the companded signal into MU integer bins, clipped to [0, MU - 1].
x_onehot_index = tf.clip_by_value(tf.cast((x_mulaw + 1.) * 0.5 * MU, tf.int32), 0, MU - 1)
x_onehot = tf.one_hot(x_onehot_index, depth=MU)

# label input: scalar instrument index shared by the whole batch
label_holder = tf.placeholder(dtype=tf.int32, shape=())

'''

ENCODER LAYER

'''

# encode
_, latents = naive_wavenet(inputs=tf.expand_dims(x_holder, axis=-1), condition=None,
                           layers=9, h_filters=64, out_filters=LATENT_DIM, name='wavenet_encoder')

# downsample
_, down_latents = downsample(latents, POOL_SIZE, LATENT_DIM)

# upsample back to the input length so the latents can condition the decoders
up_latents = upsample(down_latents, tf.shape(x_holder)[1], LATENT_DIM)

'''

DOMAIN CONFUSION LAYER

'''

# Gradient reversal is deliberately NOT applied here. The generator objective
# built further down subtracts the domain-confusion loss
# (decode_losses - 0.02 * domain_confusion_loss). Routing the latents through
# flip_gradient1 as well negated that term a second time, so the encoder was
# trained to make the instruments easier to tell apart, which is the opposite
# of domain confusion. The name is kept so that code referring to it still works.
flipped_down_latents = down_latents

# domain predict: per-step logits averaged over time into one logit vector per clip
label_predicts = domain_confusion(flipped_down_latents, 3, INSTRUMENTS_NUM, 128)
label_predicts = tf.reduce_mean(label_predicts, axis=1)
label_predicts_prob = tf.nn.softmax(label_predicts)
# Repeat the scalar label once per batch element.
label_tiled = tf.tile(tf.expand_dims(label_holder, axis=0), [tf.shape(label_predicts)[0]])

# loss (built once; it is used both by the discriminator step below and by
# the generator objective)
domain_confusion_loss = tf.losses.sparse_softmax_cross_entropy(labels=label_tiled, logits=label_predicts)


#Change 1: According to Facebook's paper, we believe the domain confusion layer should be trained separately
#Only the discriminator will be trained in this step
D_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, 'discriminator')
train_step_0 = tf.train.AdamOptimizer(1e-3).minimize(domain_confusion_loss, var_list = D_vars)

D_extra_step = tf.get_collection(tf.GraphKeys.UPDATE_OPS, 'discriminator')

'''

DECODER LAYER for training

'''
decode_losses = []
samples_list = []
for instrument_index in range(INSTRUMENTS_NUM):
    # decode: one decoder per instrument, all conditioned on the same
    # upsampled latents and fed the one-hot quantised input waveform
    dilation_sum, outputs = naive_wavenet(inputs=x_onehot, condition=up_latents,
                                          layers=9, h_filters=64, out_filters=MU,
                                          name='wavenet_decoder_' + str(instrument_index))
    outputs_probs = tf.nn.softmax(outputs)

    # sample from outputs: draw a bin index per step and map it back to a waveform value
    dist = tf.distributions.Categorical(probs=outputs_probs)
    samples = inv_mulaw(tf.cast(dist.sample(), tf.float32) / MU * 2. - 1., MU)
    samples_list.append(samples)

    # loss: the logits at step t are scored against the quantised sample at
    # step t + 1 (next-sample prediction); the first `dilation_sum` steps are
    # left out of the loss.
    decode_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=x_onehot_index[:, dilation_sum + 1:],
                                                                 logits=outputs[:, dilation_sum:-1])
    decode_loss = tf.reduce_mean(decode_loss)
    decode_losses.append(decode_loss)

# Mask out every decoder except the one of the fed instrument. The mean runs
# over all INSTRUMENTS_NUM entries, so the result is that decoder's loss
# divided by INSTRUMENTS_NUM.
decode_losses = tf.stack(decode_losses, axis=0) * tf.one_hot(label_holder, depth=INSTRUMENTS_NUM)
decode_losses = tf.reduce_mean(decode_losses)

#The regularization term should be negative.
#We only train the generator (encoder and decoder) in this step
# NOTE: subtracting the discriminator loss is only adversarial if no
# gradient-reversal layer sits between the encoder and the discriminator.
loss = decode_losses - 0.02*domain_confusion_loss
G_vars_0 = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, 'wavenet_encoder')
# The scope filter is a prefix match, so this collects the variables of every
# 'wavenet_decoder_<k>' scope.
G_vars_1 = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, 'wavenet_decoder')

# var_list must be one flat list of variables, so the two lists are
# concatenated instead of being nested inside another list.
train_step = tf.train.AdamOptimizer(1e-3).minimize(loss, var_list=G_vars_0 + G_vars_1)
G_extra_step_0 = tf.get_collection(tf.GraphKeys.UPDATE_OPS, 'wavenet_encoder')
G_extra_step_1 = tf.get_collection(tf.GraphKeys.UPDATE_OPS, 'wavenet_decoder')

'''

DECODER LAYER for inference

'''

# input for decoder
# input for decoder: latents are fed directly here instead of being taken
# from the encoder; the last axis has LATENT_DIM channels
latents_holder = tf.placeholder(dtype=tf.float32, shape=[None, None, LATENT_DIM])

# inference_sample_list[k] is the sample drawn by the decoder of instrument k
inference_sample_list = []

for instrument_index in range(INSTRUMENTS_NUM):
    # decode: rebuild each training decoder with reuse=True so that it shares
    # the variables of the 'wavenet_decoder_<k>' scope created for training
    _, outputs = naive_wavenet(inputs=x_onehot, condition=latents_holder, 
                               layers=9, h_filters=64, out_filters=MU, 
                               name='wavenet_decoder_' + str(instrument_index), reuse=True)
    outputs_probs = tf.nn.softmax(outputs)

    # sample from outputs: only the distribution of the last time step is
    # sampled, giving one new value per batch element
    dist = tf.distributions.Categorical(probs=outputs_probs[:, -1])
    sample = inv_mulaw(tf.cast(dist.sample(), tf.float32) / MU * 2. - 1., MU)
    inference_sample_list.append(sample)

'''

SESSION CREATE

'''
'''tf.device('/gpu:0')
config = tf.ConfigProto(
         device_count = {'GPU': 0}
     )
sess = tf.Session(config=config)
'''
# Create the session with device-placement logging enabled, then initialise
# every variable before the checkpoint values are loaded over them.
session_config = tf.ConfigProto(log_device_placement=True)
sess = tf.Session(config=session_config)
sess.run(tf.global_variables_initializer())

# Restore variables from disk.
saver = tf.train.Saver()
saver.restore(sess, "./music_trans_8k/model225.ckpt")
print("Model restored.")

print('Tensorflow graph created.')

'''

TRAINING

'''

from IPython.display import clear_output

### data augmentation
def pitch_shift(inputs, start_index, end_index, n_steps):
    """Pitch-shift the slice inputs[start_index:end_index] and splice it back.

    Args:
        inputs: 1-D waveform array, treated as sampled at 8000 Hz.
        start_index: first sample of the segment to shift.
        end_index: one past the last sample of the segment to shift.
        n_steps: shift amount forwarded to librosa.effects.pitch_shift.

    Returns:
        A new array made of the untouched head, the shifted segment and the
        untouched tail, concatenated along axis 0.
    """
    # sr and n_steps are passed by keyword: they are keyword-only arguments in
    # librosa >= 0.10, and the keyword form also works on older releases.
    shifted = librosa.effects.pitch_shift(inputs[start_index:end_index], sr=8000, n_steps=n_steps)
    outputs = np.concatenate([inputs[:start_index], shifted, inputs[end_index:]], axis=0)
    return outputs

def wave_augmentation(inputs):
    """Return `inputs` with one randomly chosen segment slightly pitch-shifted.

    The segment length is drawn from [2000, 4000), its start from
    [0, len(inputs) - length), and the shift from [-0.5, 0.5) steps.
    """
    segment_len = np.random.randint(2000, 4000, 1)[0]
    segment_start = np.random.randint(0, len(inputs) - segment_len, 1)[0]
    segment_end = segment_start + segment_len
    shift_steps = float(np.random.ranf(1)[0] - 0.5)
    return pitch_shift(inputs, segment_start, segment_end, shift_steps)

# NOTE(review): the epoch counter starts at 10 although the session restores
# "model225.ckpt" -- confirm that this starting value is intended.
_epoch = 10
# Exponential moving average of the generator loss, for logging only.
average_loss = 0
# Number of clips drawn (with replacement) per instrument and step.
batch_size = 12

while _epoch < 1000:

    ### TRAINING
    for i in range(STEPS_PER_EPOCH):
        for instrument_index in range(INSTRUMENTS_NUM):
            instrument_waves = inst_waves_list[instrument_index]
            indexes = np.random.randint(0, instrument_waves.shape[0], batch_size)
            # Augment every drawn clip independently, in draw order.
            augmented = np.stack([wave_augmentation(_wave)
                                  for _wave in instrument_waves[indexes]], axis=0)

            # One discriminator step and one generator step in the same run call.
            feed = {x_holder: augmented, label_holder: instrument_index}
            _, _, _loss = sess.run([train_step_0, train_step, loss], feed_dict=feed)

            average_loss = 0.99 * average_loss + 0.01 * _loss
            print('step : ',i ,'instrument : ', instrument_index, 'loss : ', _loss, 'average_loss : ', average_loss)

    _epoch += 1
    # Write a checkpoint every 25 epochs.
    if _epoch % 25 == 0:
        save_path = saver.save(sess, "music_trans_8k/model" + str(_epoch) + ".ckpt")
        print("Model saved in path: %s" % save_path)