In [1]:
import os
import numpy as np
import librosa
import time
import tensorflow as tf
tf.config.set_visible_devices([], 'GPU')
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
2025-08-30 22:01:48.439702: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2025-08-30 22:01:48.489181: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2025-08-30 22:01:48.489909: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations. To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2025-08-30 22:01:49.523553: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT
In [2]:
AUDIO_EXTENSIONS = (".wav", ".flac", ".mp3")

def load_speaker_files(root_dir):
    """Walk a LibriSpeech-style tree (speaker/chapter/files) and map speaker ID to audio paths."""
    data_dict = {}
    for speaker_id in os.listdir(root_dir):
        spk_path = os.path.join(root_dir, speaker_id)
        if not os.path.isdir(spk_path):
            continue
        file_list = []
        for subfolder in os.listdir(spk_path):  # chapter directories
            sub_path = os.path.join(spk_path, subfolder)
            if not os.path.isdir(sub_path):
                continue
            for f in os.listdir(sub_path):
                file_path = os.path.join(sub_path, f)
                if os.path.isfile(file_path) and f.lower().endswith(AUDIO_EXTENSIONS):
                    file_list.append(file_path)
        if file_list:  # skip speakers that yielded no audio files
            data_dict[speaker_id] = file_list
    return data_dict
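A quick look at what the walker returns (a sketch; it assumes the dev-clean layout used in the cells below):

speaker_files = load_speaker_files("LibriSpeech/dev-clean")
print(len(speaker_files))              # number of speakers found
spk, files = next(iter(speaker_files.items()))
print(spk, len(files))                 # one speaker ID and its clip count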
In [3]:
n_mfcc = 40
max_len = 150

def preprocess_audio(file_path, n_mfcc=n_mfcc, max_len=max_len):
    """Resample to 16 kHz, extract MFCCs, normalize, and pad/truncate to max_len frames."""
    y, sr = librosa.load(file_path, sr=16000)
    mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=n_mfcc)
    # Per-utterance mean/variance normalization; the epsilon guards against silent clips
    mfcc = (mfcc - np.mean(mfcc)) / (np.std(mfcc) + 1e-6)
    mfcc = mfcc.T  # (frames, n_mfcc)
    if len(mfcc) < max_len:
        pad_width = max_len - len(mfcc)
        mfcc = np.pad(mfcc, ((0, pad_width), (0, 0)), mode='constant')
    else:
        mfcc = mfcc[:max_len, :]
    return mfcc
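Reusing the speaker_files mapping from the sketch above, a quick shape check confirms the fixed output size:

sample = preprocess_audio(speaker_files[next(iter(speaker_files))][0])
print(sample.shape)  # expected: (150, 40), i.e. (max_len, n_mfcc)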
In [4]:
root_dir = "LibriSpeech/dev-clean"
speaker_files = load_speaker_files(root_dir)

X = []
y = []
for spk, files in speaker_files.items():
    for f in files:
        try:
            mfcc = preprocess_audio(f)
            X.append(mfcc)
            y.append(spk)
        except Exception as e:
            print(f"Error processing {f}: {e}")

X = np.array(X)
y = np.array(y)
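Sanity-checking the assembled arrays; the speaker count here is consistent with the 40-way softmax in the model summary below:

print(X.shape)            # (num_clips, 150, 40), one fixed-size MFCC matrix per clip
print(len(np.unique(y)))  # 40 speakers in dev-clean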
In [5]:
le = LabelEncoder()
y_enc = le.fit_transform(y)
In [6]:
X = X[..., np.newaxis]  # add a channel axis for Conv2D: (N, 150, 40) -> (N, 150, 40, 1)
In [7]:
X_train, X_val, y_train, y_val = train_test_split(X, y_enc, test_size=0.2, random_state=42, stratify=y_enc)
In [8]:
num_speakers = len(le.classes_)
input_shape = (max_len, n_mfcc, 1)
In [14]:
model = tf.keras.Sequential([
    tf.keras.layers.Input(shape=input_shape),
    tf.keras.layers.Conv2D(32, (3, 3), activation='relu', padding='same'),
    tf.keras.layers.MaxPooling2D((2, 2)),
    tf.keras.layers.Conv2D(64, (3, 3), activation='relu', padding='same'),
    tf.keras.layers.MaxPooling2D((2, 2)),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(num_speakers, activation='softmax')
])
In [15]:
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model.summary()
Model: "sequential_1" _________________________________________________________________ Layer (type) Output Shape Param # ================================================================= conv2d_2 (Conv2D) (None, 150, 40, 32) 320 max_pooling2d_2 (MaxPoolin (None, 75, 20, 32) 0 g2D) conv2d_3 (Conv2D) (None, 75, 20, 64) 18496 max_pooling2d_3 (MaxPoolin (None, 37, 10, 64) 0 g2D) flatten_1 (Flatten) (None, 23680) 0 dense_2 (Dense) (None, 64) 1515584 dense_3 (Dense) (None, 40) 2600 ================================================================= Total params: 1537000 (5.86 MB) Trainable params: 1537000 (5.86 MB) Non-trainable params: 0 (0.00 Byte) _________________________________________________________________
In [17]:
history = model.fit(
    X_train, y_train,
    validation_data=(X_val, y_val),
    epochs=10,
    batch_size=8
)
Epoch 1/10
271/271 [==============================] - 13s 49ms/step - loss: 0.0530 - accuracy: 0.9852 - val_loss: 0.8138 - val_accuracy: 0.8096
Epoch 2/10
271/271 [==============================] - 13s 49ms/step - loss: 0.1008 - accuracy: 0.9722 - val_loss: 0.5943 - val_accuracy: 0.8466
Epoch 3/10
271/271 [==============================] - 13s 48ms/step - loss: 0.0281 - accuracy: 0.9921 - val_loss: 0.6148 - val_accuracy: 0.8614
Epoch 4/10
271/271 [==============================] - 13s 48ms/step - loss: 0.0073 - accuracy: 0.9995 - val_loss: 0.5088 - val_accuracy: 0.8872
Epoch 5/10
271/271 [==============================] - 13s 49ms/step - loss: 0.0013 - accuracy: 1.0000 - val_loss: 0.5275 - val_accuracy: 0.8891
Epoch 6/10
271/271 [==============================] - 13s 49ms/step - loss: 6.6202e-04 - accuracy: 1.0000 - val_loss: 0.5284 - val_accuracy: 0.8891
Epoch 7/10
271/271 [==============================] - 13s 49ms/step - loss: 4.9834e-04 - accuracy: 1.0000 - val_loss: 0.5452 - val_accuracy: 0.8909
Epoch 8/10
271/271 [==============================] - 13s 49ms/step - loss: 3.7242e-04 - accuracy: 1.0000 - val_loss: 0.5536 - val_accuracy: 0.8909
Epoch 9/10
271/271 [==============================] - 13s 50ms/step - loss: 2.9621e-04 - accuracy: 1.0000 - val_loss: 0.5568 - val_accuracy: 0.8854
Epoch 10/10
271/271 [==============================] - 13s 49ms/step - loss: 2.2531e-04 - accuracy: 1.0000 - val_loss: 0.5716 - val_accuracy: 0.8891
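Training accuracy is already 0.9852 at epoch 1, so this cell (In [17]) most likely re-ran fit on weights from an earlier run; either way, the gap between the perfect training accuracy and the roughly 0.89 validation accuracy points to overfitting. One untested variant (a sketch, not the model trained above) adds dropout before the dense head:

model = tf.keras.Sequential([
    tf.keras.layers.Input(shape=input_shape),
    tf.keras.layers.Conv2D(32, (3, 3), activation='relu', padding='same'),
    tf.keras.layers.MaxPooling2D((2, 2)),
    tf.keras.layers.Conv2D(64, (3, 3), activation='relu', padding='same'),
    tf.keras.layers.MaxPooling2D((2, 2)),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dropout(0.5),  # assumption: 0.5 is a common default, not tuned here
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(num_speakers, activation='softmax')
])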
In [18]:
model.save("speaker_model.h5")
converter = tf.lite.TFLiteConverter.from_keras_model(model)
converter.optimizations = [tf.lite.Optimize.DEFAULT]
tflite_model = converter.convert()
with open("speaker_id.tflite", "wb") as f:
f.write(tflite_model)
/home/kalekale/docs/projects/irl/speakerdet/.venv/lib/python3.9/site-packages/keras/src/engine/training.py:3000: UserWarning: You are saving your model as an HDF5 file via `model.save()`. This file format is considered legacy. We recommend using instead the native Keras format, e.g. `model.save('my_model.keras')`.
  saving_api.save_model(
INFO:tensorflow:Assets written to: /tmp/tmpchilkq2e/assets
2025-08-30 22:11:36.184941: W tensorflow/compiler/mlir/lite/python/tf_tfl_flatbuffer_helpers.cc:364] Ignored output_format.
2025-08-30 22:11:36.184968: W tensorflow/compiler/mlir/lite/python/tf_tfl_flatbuffer_helpers.cc:367] Ignored drop_control_dependency.
2025-08-30 22:11:36.185136: I tensorflow/cc/saved_model/reader.cc:45] Reading SavedModel from: /tmp/tmpchilkq2e
2025-08-30 22:11:36.186474: I tensorflow/cc/saved_model/reader.cc:91] Reading meta graph with tags { serve }
2025-08-30 22:11:36.186488: I tensorflow/cc/saved_model/reader.cc:132] Reading SavedModel debug info (if present) from: /tmp/tmpchilkq2e
2025-08-30 22:11:36.189900: I tensorflow/cc/saved_model/loader.cc:231] Restoring SavedModel bundle.
2025-08-30 22:11:36.240650: I tensorflow/cc/saved_model/loader.cc:215] Running initialization op on SavedModel bundle at path: /tmp/tmpchilkq2e
2025-08-30 22:11:36.254593: I tensorflow/cc/saved_model/loader.cc:314] SavedModel load for tags { serve }; Status: success: OK. Took 69456 microseconds.
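A minimal sketch for spot-checking the exported TFLite model with tf.lite.Interpreter; "some_clip.wav" is a placeholder path, and le and preprocess_audio come from the cells above:

interpreter = tf.lite.Interpreter(model_path="speaker_id.tflite")
interpreter.allocate_tensors()
inp = interpreter.get_input_details()[0]
out = interpreter.get_output_details()[0]

# Same preprocessing as training, plus batch and channel axes: (1, 150, 40, 1)
features = preprocess_audio("some_clip.wav")[np.newaxis, ..., np.newaxis].astype(np.float32)
interpreter.set_tensor(inp['index'], features)
interpreter.invoke()
probs = interpreter.get_tensor(out['index'])[0]
print(le.inverse_transform([int(np.argmax(probs))])[0])  # predicted speaker ID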
In [19]:
!tensorflowjs_converter --input_format=keras speaker_model.h5 tfjs_model/
2025-08-30 22:11:41.249041: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT
/home/kalekale/docs/projects/irl/speakerdet/.venv/lib/python3.9/site-packages/tensorflow_hub/__init__.py:61: UserWarning: pkg_resources is deprecated as an API. See https://setuptools.pypa.io/en/latest/pkg_resources.html. The pkg_resources package is slated for removal as early as 2025-11-30. Refrain from using this package or pin to Setuptools<81.
  from pkg_resources import parse_version