Last active
March 30, 2025 19:00
-
-
Save xaviliz/775d34afaa2a5746c71310d9556606a8 to your computer and use it in GitHub Desktop.
Some examples of how to infer audio with the transfer learning-based arousal-valence models trained with DEAM dataset in Essentia.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Example 1: infer arousal-valence with audio files in Essentia using the
# `deam-effnet-discogs-1.pb` model (DEAM dataset, Effnet-Discogs embeddings).
import json

from essentia import Pool
from essentia.standard import (
    MonoLoader,
    TensorflowPredict,
    TensorflowPredictEffnetDiscogs,
    TensorflowPredictMusiCNN,
    TensorflowPredictVGGish,
)

# Model files for inference of embeddings and arousal/valence.
av_model_path = "deam-effnet-discogs-1.pb"
embeddings_model_path = "effnet-discogs-1.pb"

# Load an audio file.
# The embeddings models work with input audio at a 16 kHz sample rate.
audio_path = "0rVJLlfkMJh6YOWAYmD0v3.mp3"
audio = MonoLoader(filename=audio_path, sampleRate=16000)()

# For configuring MusiCNN and VGGish embeddings, see:
# https://essentia.upf.edu/tutorial_tensorflow_auto-tagging_classification_embeddings.html

# Parameters for the Effnet-Discogs embeddings model. It requires a patch size
# and patch hop size different from MusiCNN.
# patch_size is the number of mel-spectrogram frames the model needs to
# return one embedding vector. The Effnet model needs 128 frames, and each
# frame is extracted with a 256-sample hop size. That is why the receptive
# field of Effnet is ~2 seconds (128 * 256 / 16000 ≈ 2 s), while MusiCNN
# uses 187 frames (~3 seconds).
patch_size = 128

# patch_hop_size is the number of feature frames between successive embeddings.
# It defines the time interval analysed by each embedding vector and can vary
# per model. Effnet hops 64 patches, close to 1 second (64 * 256 / 16000),
# whereas MusiCNN uses ~1.5 seconds (93 * 256 / 16000 ≈ 1.5).
patch_hop_size = patch_size // 2

# Although Effnet-Discogs was published recently in
# https://essentia.upf.edu/models/music-style-classification/discogs-effnet/
# the deam-effnet-discogs-1 model was trained with a previous version.
# NOTE: the embedding size and the layer names differ among versions.
input_layer = "melspectrogram"
output_layer = "onnx_tf_prefix_BatchNormalization_496/add_1"

# Essentia recently added a dedicated algorithm to predict with the Effnet
# model (TensorflowPredictEffnetDiscogs -
# https://github.com/MTG/essentia/commit/919819a4708aca8f7b7156126ecc5c2a095fc9de).
# Alternatively, this model can also be inferred with TensorflowPredictMusiCNN,
# as shown in a previous gist: https://gist.github.com/xaviliz/f5271c894725af9aaf57cf532f97ea13
embeddings_model = TensorflowPredictEffnetDiscogs(
    graphFilename=embeddings_model_path,
    input=input_layer,
    output=output_layer,
    patchSize=patch_size,
    patchHopSize=patch_hop_size,
)

# Compute embeddings.
embeddings = embeddings_model(audio)

# Now load the arousal-valence model and run inference with TensorflowPredict().
# First configure the input and output layers for this model. This information
# is available in the JSON file with the model metadata.
# Use a context manager so the metadata file is closed deterministically.
with open("deam-effnet-discogs-1.json", "r") as metadata_file:
    metadata = json.load(metadata_file)
input_layer = metadata["schema"]["inputs"][0]["name"]
output_layer = metadata["schema"]["outputs"][0]["name"]

# Instantiate the arousal-valence model
# (again, we can instantiate once and then compute on many different inputs).
av_model = TensorflowPredict(
    graphFilename=av_model_path,
    inputs=[input_layer],
    outputs=[output_layer],
)

# Run inference.
# TensorflowPredict works with an input Essentia Pool. Typically you won't use
# the TensorflowPredict algorithm directly with many of our models, as we
# provide wrappers (for example, TensorflowPredictMusiCNN). However this is not
# the case for the new arousal-valence models, so we need to manually prepare
# the input features in the (batch, 1, 1, embedding_size) layout the model expects.
feature = embeddings.reshape(-1, 1, 1, embeddings.shape[1])
pool = Pool()
pool.set(input_layer, feature)
predictions = av_model(pool)[output_layer].squeeze()

# Average the per-patch predictions to get one arousal-valence
# representation for the entire song.
print(f"prediction: {predictions.mean(axis=0)}")  # [valence, arousal]
# Example 2: infer arousal-valence with audio files in Essentia using the
# `deam-musicnn-msd-1.pb` model (DEAM dataset, MusiCNN-MSD embeddings).

# Model files for inference of embeddings and arousal/valence.
av_model_path = "deam-musicnn-msd-1.pb"
embeddings_model_path = "msd-musicnn-1.pb"

# Parameters for the MusiCNN-MSD embeddings model.
input_layer = "model/Placeholder"
output_layer = "model/dense/BiasAdd"

# Instantiate the embeddings model
# (we can instantiate once and then compute on many different inputs).
embeddings_model = TensorflowPredictMusiCNN(
    graphFilename=embeddings_model_path,
    input=input_layer,
    output=output_layer,
)

# Compute embeddings.
embeddings = embeddings_model(audio)

# Now load the arousal-valence model and run inference with TensorflowPredict().
# First configure the input and output layers for this model. This information
# is available in the JSON file with the model metadata.
# Use a context manager so the metadata file is closed deterministically.
with open("deam-musicnn-msd-1.json", "r") as metadata_file:
    metadata = json.load(metadata_file)
input_layer = metadata["schema"]["inputs"][0]["name"]
output_layer = metadata["schema"]["outputs"][0]["name"]

# Instantiate the arousal-valence model
# (again, we can instantiate once and then compute on many different inputs).
av_model = TensorflowPredict(
    graphFilename=av_model_path,
    inputs=[input_layer],
    outputs=[output_layer],
)

# Run inference.
# TensorflowPredict works with an input Essentia Pool. Typically you won't use
# the TensorflowPredict algorithm directly with many of our models, as we
# provide wrappers (for example, TensorflowPredictMusiCNN). However this is not
# the case for the new arousal-valence models, so we need to manually prepare
# the input features in the (batch, 1, 1, embedding_size) layout the model expects.
feature = embeddings.reshape(-1, 1, 1, embeddings.shape[1])
pool = Pool()
pool.set(input_layer, feature)
predictions = av_model(pool)[output_layer].squeeze()

# Average the per-patch predictions to get one arousal-valence
# representation for the entire song.
print(f"prediction: {predictions.mean(axis=0)}")  # [valence, arousal]
# Example 3: infer arousal-valence with audio files in Essentia using the
# `deam-vggish-audioset-1.pb` model (DEAM dataset, VGGish-AudioSet embeddings).
# NOTE: the original header said "Effnet-Discogs embeddings" here, which was a
# copy-paste mistake — this example uses VGGish embeddings.

# Model files for inference of embeddings and arousal/valence.
av_model_path = "deam-vggish-audioset-1.pb"
embeddings_model_path = "audioset-vggish-3.pb"

# For configuring MusiCNN and VGGish embeddings, see:
# https://essentia.upf.edu/tutorial_tensorflow_auto-tagging_classification_embeddings.html

# Parameters for the VGGish embeddings model. It works in the time domain,
# so it does not need patch_size and patch_hop_size — only the
# output layer name.
output_layer = "model/vggish/embeddings"

# Instantiate the embeddings model
# (we can instantiate once and then compute on many different inputs).
embeddings_model = TensorflowPredictVGGish(
    graphFilename=embeddings_model_path,
    output=output_layer,
)

# Compute embeddings.
embeddings = embeddings_model(audio)

# Now load the arousal-valence model and run inference with TensorflowPredict().
# First configure the input and output layers for this model. This information
# is available in the JSON file with the model metadata.
# Use a context manager so the metadata file is closed deterministically.
with open("deam-vggish-audioset-1.json", "r") as metadata_file:
    metadata = json.load(metadata_file)
input_layer = metadata["schema"]["inputs"][0]["name"]
output_layer = metadata["schema"]["outputs"][0]["name"]

# Instantiate the arousal-valence model
# (again, we can instantiate once and then compute on many different inputs).
av_model = TensorflowPredict(
    graphFilename=av_model_path,
    inputs=[input_layer],
    outputs=[output_layer],
)

# Run inference.
# TensorflowPredict works with an input Essentia Pool. Typically you won't use
# the TensorflowPredict algorithm directly with many of our models, as we
# provide wrappers (for example, TensorflowPredictMusiCNN). However this is not
# the case for the new arousal-valence models, so we need to manually prepare
# the input features in the (batch, 1, 1, embedding_size) layout the model expects.
feature = embeddings.reshape(-1, 1, 1, embeddings.shape[1])
# Fix: create a fresh Pool instead of reusing the one from the previous
# example — otherwise the stale MusiCNN feature key is still present in the
# pool, and this snippet breaks when run standalone.
pool = Pool()
pool.set(input_layer, feature)
predictions = av_model(pool)[output_layer].squeeze()

# Average the per-patch predictions to get one arousal-valence
# representation for the entire song.
print(f"prediction: {predictions.mean(axis=0)}")  # [valence, arousal]
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment