Skip to content

Instantly share code, notes, and snippets.

@xaviliz
Last active March 30, 2025 19:00
Show Gist options
  • Select an option
  • Save xaviliz/775d34afaa2a5746c71310d9556606a8 to your computer and use it in GitHub Desktop.
Some examples of how to infer audio with the transfer learning-based arousal-valence models trained with DEAM dataset in Essentia.
# An example to infer arousal valence models with audio files in Essentia using the
# `deam-effnet-discogs-1.pb` model (DEAM dataset, Effnet-Discogs embeddings).
import json
from essentia import Pool
from essentia.standard import (
MonoLoader,
TensorflowPredict,
TensorflowPredictEffnetDiscogs,
TensorflowPredictMusiCNN,
TensorflowPredictVGGish,
)
# Input audio file to analyze.
audio_path = "0rVJLlfkMJh6YOWAYmD0v3.mp3"

# Pretrained model files: the Effnet-Discogs embeddings extractor and the
# arousal/valence regression head trained on the DEAM dataset.
av_model_path = "deam-effnet-discogs-1.pb"
embeddings_model_path = "effnet-discogs-1.pb"

# Decode the audio as a mono signal. The embedding models used below expect
# audio at a 16 kHz sample rate, so resample while loading.
audio = MonoLoader(filename=audio_path, sampleRate=16000)()
# Parameters for the Effnet-Discogs embeddings model. It requires a patch size
# and patch hop size different from MusiCNN.
# (For configuring MusiCNN and VGGish embeddings, see:
# https://essentia.upf.edu/tutorial_tensorflow_auto-tagging_classification_embeddings.html)
#
# patch_size is the number of mel-spectrogram frames the model consumes to
# produce one embedding vector. Effnet needs 128 frames, and each frame is
# extracted with a 256-sample hop size, so its receptive field is
# 128 * 256 / 16000 ≈ 2 seconds (MusiCNN uses 187 frames, ≈ 3 seconds).
patch_size = 128
# patch_hop_size is the number of feature frames between successive embedding
# vectors, i.e. the time step of the analysis; it can vary per model.
# Effnet hops 64 frames, close to 1 second (64 * 256 / 16000), whereas
# MusiCNN hops 1.5 seconds (93 * 256 / 16000 ≈ 1.5).
patch_hop_size = patch_size // 2

# Although Effnet-Discogs was published recently in
# https://essentia.upf.edu/models/music-style-classification/discogs-effnet/
# the deam-effnet-discogs-1 model was trained with a previous version.
# NOTE: the embedding size and the layer names differ among versions.
input_layer = "melspectrogram"
output_layer = "onnx_tf_prefix_BatchNormalization_496/add_1"

# Essentia recently added a dedicated algorithm for the Effnet model
# (TensorflowPredictEffnetDiscogs -
# https://github.com/MTG/essentia/commit/919819a4708aca8f7b7156126ecc5c2a095fc9de).
# Alternatively, this model can also be inferred with TensorflowPredictMusiCNN,
# as shown in https://gist.github.com/xaviliz/f5271c894725af9aaf57cf532f97ea13
embeddings_model = TensorflowPredictEffnetDiscogs(
    graphFilename=embeddings_model_path,
    input=input_layer,
    output=output_layer,
    patchSize=patch_size,
    patchHopSize=patch_hop_size,
)
# Compute embeddings (one vector per analyzed patch).
embeddings = embeddings_model(audio)

# Now load the arousal-valence model and run inference with TensorflowPredict().
# Its input/output layer names are available in the JSON file with the model
# metadata. Use a context manager so the file handle is closed deterministically
# (the original `json.load(open(...))` leaked the handle).
with open("deam-effnet-discogs-1.json", "r") as metadata_file:
    metadata = json.load(metadata_file)
input_layer = metadata["schema"]["inputs"][0]["name"]
output_layer = metadata["schema"]["outputs"][0]["name"]

# Instantiate the arousal-valence model
# (again, we can instantiate once and then compute on many different inputs).
av_model = TensorflowPredict(
    graphFilename=av_model_path,
    inputs=[input_layer],
    outputs=[output_layer],
)

# Run inference. TensorflowPredict works with an input Essentia Pool.
# Typically you won't use TensorflowPredict directly with many of our models,
# as we provide wrappers (for example, TensorflowPredictMusiCNN). However this
# is not the case for the new arousal-valence models, so the input features
# are prepared manually; the reshape makes a 4D tensor with one embedding
# vector per batch item.
feature = embeddings.reshape(-1, 1, 1, embeddings.shape[1])
pool = Pool()
pool.set(input_layer, feature)
predictions = av_model(pool)[output_layer].squeeze()

# Average the per-patch predictions to get a single arousal-valence
# representation for the entire song.
print(f"prediction: {predictions.mean(axis=0)}")  # [valence, arousal]
# Second example: infer arousal-valence with the `deam-musicnn-msd-1.pb`
# model (DEAM dataset, MusiCNN-MSD embeddings).
# Model files for the embeddings extractor and the arousal/valence head.
av_model_path = "deam-musicnn-msd-1.pb"
embeddings_model_path = "msd-musicnn-1.pb"

# Layer names for the MusiCNN-MSD embeddings model.
input_layer = "model/Placeholder"
output_layer = "model/dense/BiasAdd"

# Instantiate the embeddings model
# (we can instantiate once and then compute on many different inputs).
embeddings_model = TensorflowPredictMusiCNN(
    graphFilename=embeddings_model_path,
    input=input_layer,
    output=output_layer,
)
# Compute embeddings (one vector per analyzed patch).
embeddings = embeddings_model(audio)

# Load the arousal-valence model's input/output layer names from its metadata
# JSON. Use a context manager so the file handle is closed deterministically
# (the original `json.load(open(...))` leaked the handle).
with open("deam-musicnn-msd-1.json", "r") as metadata_file:
    metadata = json.load(metadata_file)
input_layer = metadata["schema"]["inputs"][0]["name"]
output_layer = metadata["schema"]["outputs"][0]["name"]

# Instantiate the arousal-valence model
# (again, we can instantiate once and then compute on many different inputs).
av_model = TensorflowPredict(
    graphFilename=av_model_path,
    inputs=[input_layer],
    outputs=[output_layer],
)

# Run inference. TensorflowPredict works with an input Essentia Pool; since
# there is no wrapper for the arousal-valence models, prepare the input
# features manually as a 4D tensor with one embedding vector per batch item.
feature = embeddings.reshape(-1, 1, 1, embeddings.shape[1])
pool = Pool()
pool.set(input_layer, feature)
predictions = av_model(pool)[output_layer].squeeze()

# Average the per-patch predictions to get a single arousal-valence
# representation for the entire song.
print(f"prediction: {predictions.mean(axis=0)}")  # [valence, arousal]
# Third example: infer arousal-valence with the `deam-vggish-audioset-1.pb`
# model (DEAM dataset, VGGish-AudioSet embeddings).
# NOTE: the original comment said "Effnet-Discogs embeddings" here, which was
# a copy-paste error — this section uses VGGish.
# Model files for the embeddings extractor and the arousal/valence head.
av_model_path = "deam-vggish-audioset-1.pb"
embeddings_model_path = "audioset-vggish-3.pb"

# Parameters for the VGGish embeddings model. It works in the time domain, so
# it doesn't need patch_size and patch_hop_size — only the output layer name.
# (For configuring MusiCNN and VGGish embeddings, see:
# https://essentia.upf.edu/tutorial_tensorflow_auto-tagging_classification_embeddings.html)
output_layer = "model/vggish/embeddings"

# Instantiate the embeddings model
# (we can instantiate once and then compute on many different inputs).
embeddings_model = TensorflowPredictVGGish(
    graphFilename=embeddings_model_path,
    output=output_layer,
)
# Compute embeddings (one vector per analyzed patch).
embeddings = embeddings_model(audio)

# Load the arousal-valence model's input/output layer names from its metadata
# JSON. Use a context manager so the file handle is closed deterministically
# (the original `json.load(open(...))` leaked the handle).
with open("deam-vggish-audioset-1.json", "r") as metadata_file:
    metadata = json.load(metadata_file)
input_layer = metadata["schema"]["inputs"][0]["name"]
output_layer = metadata["schema"]["outputs"][0]["name"]

# Instantiate the arousal-valence model
# (again, we can instantiate once and then compute on many different inputs).
av_model = TensorflowPredict(
    graphFilename=av_model_path,
    inputs=[input_layer],
    outputs=[output_layer],
)

# Run inference. TensorflowPredict works with an input Essentia Pool; since
# there is no wrapper for the arousal-valence models, prepare the input
# features manually. Unlike the original, create a fresh Pool here: the
# earlier examples left their own input keys in the shared pool, so reusing
# it would carry stale data from the previous model.
feature = embeddings.reshape(-1, 1, 1, embeddings.shape[1])
pool = Pool()
pool.set(input_layer, feature)
predictions = av_model(pool)[output_layer].squeeze()

# Average the per-patch predictions to get a single arousal-valence
# representation for the entire song.
print(f"prediction: {predictions.mean(axis=0)}")  # [valence, arousal]
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment