Skip to content

Instantly share code, notes, and snippets.

@xaviliz
Last active March 30, 2025 19:00
Show Gist options
  • Select an option
  • Save xaviliz/775d34afaa2a5746c71310d9556606a8 to your computer and use it in GitHub Desktop.
Some examples of how to infer audio with the transfer learning-based arousal-valence models trained with DEAM dataset in Essentia.
# An example to infer arousal valence models with audio files in Essentia using the
# `deam-effnet-discogs-1.pb` model (DEAM dataset, Effnet-Discogs embeddings).
import json
from essentia import Pool
from essentia.standard import (
MonoLoader,
TensorflowPredict,
TensorflowPredictEffnetDiscogs,
TensorflowPredictMusiCNN,
TensorflowPredictVGGish,
)
# Input audio file to analyze.
audio_path = "0rVJLlfkMJh6YOWAYmD0v3.mp3"

# Pretrained model files: the Effnet-Discogs embeddings extractor and the
# arousal/valence regression head trained on the DEAM dataset.
av_model_path = "deam-effnet-discogs-1.pb"
embeddings_model_path = "effnet-discogs-1.pb"

# Decode the audio as a mono signal. The embedding models used below expect
# audio at a 16 kHz sample rate, so resample while loading.
audio = MonoLoader(filename=audio_path, sampleRate=16000)()
# Parameters for the Effnet-Discogs embeddings model. It requires a patch size
# and patch hop size different from MusiCNN.
# (For configuring MusiCNN and VGGish embeddings, see:
# https://essentia.upf.edu/tutorial_tensorflow_auto-tagging_classification_embeddings.html)
#
# patch_size is the number of mel-spectrogram frames the model consumes to
# produce one embedding vector. Effnet needs 128 frames, and each frame is
# extracted with a 256-sample hop size, so its receptive field is
# 128 * 256 / 16000 ≈ 2 seconds (MusiCNN uses 187 frames, ≈ 3 seconds).
patch_size = 128
# patch_hop_size is the number of feature frames between successive embedding
# vectors, i.e. the time step of the analysis; it can vary per model.
# Effnet hops 64 frames, close to 1 second (64 * 256 / 16000), whereas
# MusiCNN hops 1.5 seconds (93 * 256 / 16000 ≈ 1.5).
patch_hop_size = patch_size // 2

# Although Effnet-Discogs was published recently in
# https://essentia.upf.edu/models/music-style-classification/discogs-effnet/
# the deam-effnet-discogs-1 model was trained with a previous version.
# NOTE: the embedding size and the layer names differ among versions.
input_layer = "melspectrogram"
output_layer = "onnx_tf_prefix_BatchNormalization_496/add_1"

# Essentia recently added a dedicated algorithm for the Effnet model
# (TensorflowPredictEffnetDiscogs -
# https://github.com/MTG/essentia/commit/919819a4708aca8f7b7156126ecc5c2a095fc9de).
# Alternatively, this model can also be inferred with TensorflowPredictMusiCNN,
# as shown in https://gist.github.com/xaviliz/f5271c894725af9aaf57cf532f97ea13
embeddings_model = TensorflowPredictEffnetDiscogs(
    graphFilename=embeddings_model_path,
    input=input_layer,
    output=output_layer,
    patchSize=patch_size,
    patchHopSize=patch_hop_size,
)
# Compute embeddings (one vector per analyzed patch).
embeddings = embeddings_model(audio)

# Now load the arousal-valence model and run inference with TensorflowPredict().
# Its input/output layer names are available in the JSON file with the model
# metadata. Use a context manager so the file handle is closed deterministically
# (the original `json.load(open(...))` leaked the handle).
with open("deam-effnet-discogs-1.json", "r") as metadata_file:
    metadata = json.load(metadata_file)
input_layer = metadata["schema"]["inputs"][0]["name"]
output_layer = metadata["schema"]["outputs"][0]["name"]

# Instantiate the arousal-valence model
# (again, we can instantiate once and then compute on many different inputs).
av_model = TensorflowPredict(
    graphFilename=av_model_path,
    inputs=[input_layer],
    outputs=[output_layer],
)

# Run inference. TensorflowPredict works with an input Essentia Pool.
# Typically you won't use TensorflowPredict directly with many of our models,
# as we provide wrappers (for example, TensorflowPredictMusiCNN). However this
# is not the case for the new arousal-valence models, so the input features
# are prepared manually; the reshape makes a 4D tensor with one embedding
# vector per batch item.
feature = embeddings.reshape(-1, 1, 1, embeddings.shape[1])
pool = Pool()
pool.set(input_layer, feature)
predictions = av_model(pool)[output_layer].squeeze()

# Average the per-patch predictions to get a single arousal-valence
# representation for the entire song.
print(f"prediction: {predictions.mean(axis=0)}")  # [valence, arousal]
# Second example: infer arousal-valence with the `deam-musicnn-msd-1.pb`
# model (DEAM dataset, MusiCNN-MSD embeddings).
# Model files for the embeddings extractor and the arousal/valence head.
av_model_path = "deam-musicnn-msd-1.pb"
embeddings_model_path = "msd-musicnn-1.pb"

# Layer names for the MusiCNN-MSD embeddings model.
input_layer = "model/Placeholder"
output_layer = "model/dense/BiasAdd"

# Instantiate the embeddings model
# (we can instantiate once and then compute on many different inputs).
embeddings_model = TensorflowPredictMusiCNN(
    graphFilename=embeddings_model_path,
    input=input_layer,
    output=output_layer,
)
# Compute embeddings (one vector per analyzed patch).
embeddings = embeddings_model(audio)

# Load the arousal-valence model's input/output layer names from its metadata
# JSON. Use a context manager so the file handle is closed deterministically
# (the original `json.load(open(...))` leaked the handle).
with open("deam-musicnn-msd-1.json", "r") as metadata_file:
    metadata = json.load(metadata_file)
input_layer = metadata["schema"]["inputs"][0]["name"]
output_layer = metadata["schema"]["outputs"][0]["name"]

# Instantiate the arousal-valence model
# (again, we can instantiate once and then compute on many different inputs).
av_model = TensorflowPredict(
    graphFilename=av_model_path,
    inputs=[input_layer],
    outputs=[output_layer],
)

# Run inference. TensorflowPredict works with an input Essentia Pool; since
# there is no wrapper for the arousal-valence models, prepare the input
# features manually as a 4D tensor with one embedding vector per batch item.
feature = embeddings.reshape(-1, 1, 1, embeddings.shape[1])
pool = Pool()
pool.set(input_layer, feature)
predictions = av_model(pool)[output_layer].squeeze()

# Average the per-patch predictions to get a single arousal-valence
# representation for the entire song.
print(f"prediction: {predictions.mean(axis=0)}")  # [valence, arousal]
# Third example: infer arousal-valence with the `deam-vggish-audioset-1.pb`
# model (DEAM dataset, VGGish-AudioSet embeddings).
# NOTE: the original comment said "Effnet-Discogs embeddings" here, which was
# a copy-paste error — this section uses VGGish.
# Model files for the embeddings extractor and the arousal/valence head.
av_model_path = "deam-vggish-audioset-1.pb"
embeddings_model_path = "audioset-vggish-3.pb"

# Parameters for the VGGish embeddings model. It works in the time domain, so
# it doesn't need patch_size and patch_hop_size — only the output layer name.
# (For configuring MusiCNN and VGGish embeddings, see:
# https://essentia.upf.edu/tutorial_tensorflow_auto-tagging_classification_embeddings.html)
output_layer = "model/vggish/embeddings"

# Instantiate the embeddings model
# (we can instantiate once and then compute on many different inputs).
embeddings_model = TensorflowPredictVGGish(
    graphFilename=embeddings_model_path,
    output=output_layer,
)
# Compute embeddings (one vector per analyzed patch).
embeddings = embeddings_model(audio)

# Load the arousal-valence model's input/output layer names from its metadata
# JSON. Use a context manager so the file handle is closed deterministically
# (the original `json.load(open(...))` leaked the handle).
with open("deam-vggish-audioset-1.json", "r") as metadata_file:
    metadata = json.load(metadata_file)
input_layer = metadata["schema"]["inputs"][0]["name"]
output_layer = metadata["schema"]["outputs"][0]["name"]

# Instantiate the arousal-valence model
# (again, we can instantiate once and then compute on many different inputs).
av_model = TensorflowPredict(
    graphFilename=av_model_path,
    inputs=[input_layer],
    outputs=[output_layer],
)

# Run inference. TensorflowPredict works with an input Essentia Pool; since
# there is no wrapper for the arousal-valence models, prepare the input
# features manually. Unlike the original, create a fresh Pool here: the
# earlier examples left their own input keys in the shared pool, so reusing
# it would carry stale data from the previous model.
feature = embeddings.reshape(-1, 1, 1, embeddings.shape[1])
pool = Pool()
pool.set(input_layer, feature)
predictions = av_model(pool)[output_layer].squeeze()

# Average the per-patch predictions to get a single arousal-valence
# representation for the entire song.
print(f"prediction: {predictions.mean(axis=0)}")  # [valence, arousal]
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment