Real-time inference with simultaneous essentia-tensorflow classifiers¶
We rely on the soundcard package to capture the system's audio loopback. If it is not available, it can be installed with pip (the PyPI name is soundcard, imported below as soundcard):
!pip install soundcard
%matplotlib nbagg
import numpy as np
from scipy.special import softmax
import matplotlib.pyplot as plt
import soundcard as sc
from struct import unpack
from IPython import display
from essentia.streaming import *
from essentia import Pool, run, array, reset
For this demo, we will use four of our transfer learning classifiers: danceability, voice_instrumental, mood_aggressive, and mood_happy. These and more models can be downloaded from the Essentia models site.
!wget https://essentia.upf.edu/models/classifiers/danceability/danceability-musicnn-msd-1.pb
!wget https://essentia.upf.edu/models/classifiers/voice_instrumental/voice_instrumental-musicnn-msd-1.pb
!wget https://essentia.upf.edu/models/classifiers/mood_aggressive/mood_aggressive-musicnn-msd-1.pb
!wget https://essentia.upf.edu/models/classifiers/mood_happy/mood_happy-musicnn-msd-1.pb
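Optionally, we can verify that all four graph files arrived intact before wiring up the network. This check is a convenience addition, not part of the original demo:
import os

# a quick sanity check: every model file should exist and be non-empty
for filename in ['danceability-musicnn-msd-1.pb',
                 'voice_instrumental-musicnn-msd-1.pb',
                 'mood_aggressive-musicnn-msd-1.pb',
                 'mood_happy-musicnn-msd-1.pb']:
    assert os.path.isfile(filename) and os.path.getsize(filename) > 0, \
        f'download failed or incomplete: {filename}'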
Parameters optimized for real-time inference¶
# model parameters
inputLayer = 'model/Placeholder'
outputLayer = 'model/Sigmoid'
labels = ['danceability', 'voice_instrumental', 'aggressiveness', 'happiness']
nLabels = len(labels)
sampleRate = 16000
frameSize = 512
hopSize = 256
numberBands = 96

# analysis parameters
patchSize = 64    # mel-spectrogram frames fed to the models at once
displaySize = 10  # number of patches kept on screen

bufferSize = patchSize * hopSize  # audio samples required to fill one patch
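As a quick check of what these settings imply for responsiveness: each prediction consumes one patch of patchSize mel frames, i.e. patchSize * hopSize audio samples, so at 16 kHz the activations update roughly once per second.
# one patch = 64 hops of 256 samples = 16384 samples
patchDuration = bufferSize / sampleRate  # 16384 / 16000 = 1.024 s
print(f'predictions update every {patchDuration:.3f} seconds')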
Instantiate and connect the algorithms¶
buffer = np.zeros(bufferSize, dtype='float32')
vimp = VectorInput(buffer)
fc = FrameCutter(frameSize=frameSize, hopSize=hopSize)
tim = TensorflowInputMusiCNN()
vtt = VectorRealToTensor(shape=[1, 1, patchSize, numberBands],
                         lastPatchMode='discard')
ttp = TensorToPool(namespace=inputLayer)
tfp_danceability = TensorflowPredict(graphFilename='danceability-musicnn-msd-1.pb',
                                     inputs=[inputLayer],
                                     outputs=[outputLayer])
tfp_voice_instrumental = TensorflowPredict(graphFilename='voice_instrumental-musicnn-msd-1.pb',
                                           inputs=[inputLayer],
                                           outputs=[outputLayer])
tfp_aggressive = TensorflowPredict(graphFilename='mood_aggressive-musicnn-msd-1.pb',
                                   inputs=[inputLayer],
                                   outputs=[outputLayer])
tfp_happy = TensorflowPredict(graphFilename='mood_happy-musicnn-msd-1.pb',
                              inputs=[inputLayer],
                              outputs=[outputLayer])
ptt_danceability = PoolToTensor(namespace=outputLayer)
ptt_voice_instrumental = PoolToTensor(namespace=outputLayer)
ptt_aggressive = PoolToTensor(namespace=outputLayer)
ptt_happy = PoolToTensor(namespace=outputLayer)
ttv_danceability = TensorToVectorReal()
ttv_voice_instrumental = TensorToVectorReal()
ttv_aggressive = TensorToVectorReal()
ttv_happy = TensorToVectorReal()
pool = Pool()
vimp.data >> fc.signal
fc.frame >> tim.frame
tim.bands >> vtt.frame
tim.bands >> (pool, 'melbands')
vtt.tensor >> ttp.tensor
ttp.pool >> tfp_danceability.poolIn
ttp.pool >> tfp_voice_instrumental.poolIn
ttp.pool >> tfp_aggressive.poolIn
ttp.pool >> tfp_happy.poolIn
tfp_danceability.poolOut >> ptt_danceability.pool
tfp_voice_instrumental.poolOut >> ptt_voice_instrumental.pool
tfp_aggressive.poolOut >> ptt_aggressive.pool
tfp_happy.poolOut >> ptt_happy.pool
ptt_danceability.tensor >> ttv_danceability.tensor
ptt_voice_instrumental.tensor >> ttv_voice_instrumental.tensor
ptt_aggressive.tensor >> ttv_aggressive.tensor
ptt_happy.tensor >> ttv_happy.tensor
ttv_danceability.frame >> (pool, 'danceability')
ttv_voice_instrumental.frame >> (pool, 'voice_instrumental')
ttv_aggressive.frame >> (pool, 'aggressive')
ttv_happy.frame >> (pool, 'happy')
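Before going live, it can be reassuring to run the network once on the still-silent buffer and confirm that every model loads and writes one activation vector into the pool. This dry run is an optional addition, not part of the original flow:
# optional dry run on the zero-filled buffer to verify the whole network
reset(vimp)
run(vimp)
for key in ['danceability', 'voice_instrumental', 'aggressive', 'happy']:
    print(key, pool[key].shape)
pool.clear()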
Callback to update the plots¶
def callback(data):
    # update the audio buffer with the newly captured samples
    buffer[:] = data.flatten()

    # generate predictions
    reset(vimp)
    run(vimp)

    # update mel and activation buffers
    melBuffer[:] = np.roll(melBuffer, -patchSize, axis=1)
    melBuffer[:, -patchSize:] = pool['melbands'][-patchSize:, :].T
    img_mel.set_data(melBuffer)

    actBuffer[:] = np.roll(actBuffer, -1, axis=1)
    actBuffer[:, -1] = [pool['danceability'][-1, 0],
                        pool['voice_instrumental'][-1, 1],  # second output class
                        pool['aggressive'][-1, 0],
                        pool['happy'][-1, 0]]
    img_act.set_data(actBuffer)

    # update plots
    f.canvas.draw()
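Raw activations can flicker from one patch to the next. If that makes the plot hard to read, an exponential moving average is an easy optional smoothing step; alpha and the smoothed helper below are hypothetical additions, not part of the original callback:
alpha = 0.6  # hypothetical smoothing factor: higher reacts faster

def smoothed(key, index=0, state={}):
    # blend the newest activation with the previous smoothed value
    new = pool[key][-1, index]
    state[key] = alpha * new + (1 - alpha) * state.get(key, new)
    return state[key]
Inside the callback, pool['danceability'][-1, 0] would then become smoothed('danceability'), and pool['voice_instrumental'][-1, 1] becomes smoothed('voice_instrumental', 1).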
Start processing the loopback stream¶
# initialize plot buffers
melBuffer = np.zeros([numberBands, patchSize * displaySize])
actBuffer = np.zeros([nLabels, displaySize])
# reset storage
pool.clear()
# initialize plots
f, ax = plt.subplots(1, 2, figsize=[9.6, 7])
f.canvas.draw()
ax[0].set_title('Mel Bands')
img_mel = ax[0].imshow(melBuffer, aspect='auto',
                       origin='lower', vmin=0, vmax=6)
ax[0].set_xticks([])
ax[1].set_title('Activations')
img_act = ax[1].matshow(actBuffer, aspect='auto', vmin=0, vmax=1)
ax[1].set_xticks([])
ax[1].yaxis.set_ticks_position('right')
plt.yticks(np.arange(nLabels), labels, fontsize=8)
f.colorbar(img_act, ax=ax[1], orientation='horizontal')
# capture and process the speaker's loopback
with sc.all_microphones(include_loopback=True)[0].recorder(samplerate=sampleRate) as mic:
    while True:
        callback(mic.record(numframes=bufferSize).mean(axis=1))
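The loop above runs until the kernel is interrupted. A slightly friendlier variant catches KeyboardInterrupt so the recorder context closes cleanly when you hit Interrupt in the notebook; this wrapper is an optional addition:
# variant of the capture loop that stops cleanly on 'Interrupt Kernel'
with sc.all_microphones(include_loopback=True)[0].recorder(samplerate=sampleRate) as mic:
    try:
        while True:
            callback(mic.record(numframes=bufferSize).mean(axis=1))
    except KeyboardInterrupt:
        print('stopped')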