In [1]:
import os
import numpy as np
import librosa
import time
import tensorflow as tf
tf.config.set_visible_devices([], 'GPU')
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
2025-08-30 22:01:48.439702: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2025-08-30 22:01:48.489181: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2025-08-30 22:01:48.489909: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations. To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2025-08-30 22:01:49.523553: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT
In [2]:
AUDIO_EXTENSIONS = (".wav", ".flac", ".mp3")

def load_speaker_files(root_dir):
    """Walk a LibriSpeech-style tree (speaker/chapter/files) and map speaker ID to audio paths."""
    data_dict = {}
    for speaker_id in os.listdir(root_dir):
        spk_path = os.path.join(root_dir, speaker_id)
        if not os.path.isdir(spk_path):
            continue
        file_list = []
        for subfolder in os.listdir(spk_path):  # chapter directories
            sub_path = os.path.join(spk_path, subfolder)
            if not os.path.isdir(sub_path):
                continue
            for f in os.listdir(sub_path):
                file_path = os.path.join(sub_path, f)
                if os.path.isfile(file_path) and f.lower().endswith(AUDIO_EXTENSIONS):
                    file_list.append(file_path)
        if file_list:  # skip speakers that yielded no audio files
            data_dict[speaker_id] = file_list
    return data_dict
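A quick look at what the walker returns (a sketch; it assumes the dev-clean layout used in the cells below):

speaker_files = load_speaker_files("LibriSpeech/dev-clean")
print(len(speaker_files))              # number of speakers found
spk, files = next(iter(speaker_files.items()))
print(spk, len(files))                 # one speaker ID and its clip count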
In [3]:
n_mfcc = 40
max_len = 150

def preprocess_audio(file_path, n_mfcc=n_mfcc, max_len=max_len):
    """Resample to 16 kHz, extract MFCCs, normalize, and pad/truncate to max_len frames."""
    y, sr = librosa.load(file_path, sr=16000)
    mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=n_mfcc)
    # Per-utterance mean/variance normalization; the epsilon guards against silent clips
    mfcc = (mfcc - np.mean(mfcc)) / (np.std(mfcc) + 1e-6)
    mfcc = mfcc.T  # (frames, n_mfcc)
    if len(mfcc) < max_len:
        pad_width = max_len - len(mfcc)
        mfcc = np.pad(mfcc, ((0, pad_width), (0, 0)), mode='constant')
    else:
        mfcc = mfcc[:max_len, :]
    return mfcc
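Reusing the speaker_files mapping from the sketch above, a quick shape check confirms the fixed output size:

sample = preprocess_audio(speaker_files[next(iter(speaker_files))][0])
print(sample.shape)  # expected: (150, 40), i.e. (max_len, n_mfcc)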
In [4]:
root_dir = "LibriSpeech/dev-clean"
speaker_files = load_speaker_files(root_dir)

X = []
y = []
for spk, files in speaker_files.items():
    for f in files:
        try:
            mfcc = preprocess_audio(f)
            X.append(mfcc)
            y.append(spk)
        except Exception as e:
            print(f"Error processing {f}: {e}")

X = np.array(X)
y = np.array(y)
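Sanity-checking the assembled arrays; the speaker count here is consistent with the 40-way softmax in the model summary below:

print(X.shape)            # (num_clips, 150, 40), one fixed-size MFCC matrix per clip
print(len(np.unique(y)))  # 40 speakers in dev-clean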
In [5]:
le = LabelEncoder()
y_enc = le.fit_transform(y)
In [6]:
X = X[..., np.newaxis]  # add a channel axis for Conv2D: (N, 150, 40) -> (N, 150, 40, 1)
In [7]:
X_train, X_val, y_train, y_val = train_test_split(X, y_enc, test_size=0.2, random_state=42, stratify=y_enc)
In [8]:
num_speakers = len(le.classes_)
input_shape = (max_len, n_mfcc, 1)
In [14]:
model = tf.keras.Sequential([
    tf.keras.layers.Input(shape=input_shape),
    tf.keras.layers.Conv2D(32, (3, 3), activation='relu', padding='same'),
    tf.keras.layers.MaxPooling2D((2, 2)),
    tf.keras.layers.Conv2D(64, (3, 3), activation='relu', padding='same'),
    tf.keras.layers.MaxPooling2D((2, 2)),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(num_speakers, activation='softmax')
])
In [15]:
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model.summary()
Model: "sequential_1" _________________________________________________________________ Layer (type) Output Shape Param # ================================================================= conv2d_2 (Conv2D) (None, 150, 40, 32) 320 max_pooling2d_2 (MaxPoolin (None, 75, 20, 32) 0 g2D) conv2d_3 (Conv2D) (None, 75, 20, 64) 18496 max_pooling2d_3 (MaxPoolin (None, 37, 10, 64) 0 g2D) flatten_1 (Flatten) (None, 23680) 0 dense_2 (Dense) (None, 64) 1515584 dense_3 (Dense) (None, 40) 2600 ================================================================= Total params: 1537000 (5.86 MB) Trainable params: 1537000 (5.86 MB) Non-trainable params: 0 (0.00 Byte) _________________________________________________________________
In [17]:
history = model.fit(
    X_train, y_train,
    validation_data=(X_val, y_val),
    epochs=10,
    batch_size=8
)
Epoch 1/10
271/271 [==============================] - 13s 49ms/step - loss: 0.0530 - accuracy: 0.9852 - val_loss: 0.8138 - val_accuracy: 0.8096
Epoch 2/10
271/271 [==============================] - 13s 49ms/step - loss: 0.1008 - accuracy: 0.9722 - val_loss: 0.5943 - val_accuracy: 0.8466
Epoch 3/10
271/271 [==============================] - 13s 48ms/step - loss: 0.0281 - accuracy: 0.9921 - val_loss: 0.6148 - val_accuracy: 0.8614
Epoch 4/10
271/271 [==============================] - 13s 48ms/step - loss: 0.0073 - accuracy: 0.9995 - val_loss: 0.5088 - val_accuracy: 0.8872
Epoch 5/10
271/271 [==============================] - 13s 49ms/step - loss: 0.0013 - accuracy: 1.0000 - val_loss: 0.5275 - val_accuracy: 0.8891
Epoch 6/10
271/271 [==============================] - 13s 49ms/step - loss: 6.6202e-04 - accuracy: 1.0000 - val_loss: 0.5284 - val_accuracy: 0.8891
Epoch 7/10
271/271 [==============================] - 13s 49ms/step - loss: 4.9834e-04 - accuracy: 1.0000 - val_loss: 0.5452 - val_accuracy: 0.8909
Epoch 8/10
271/271 [==============================] - 13s 49ms/step - loss: 3.7242e-04 - accuracy: 1.0000 - val_loss: 0.5536 - val_accuracy: 0.8909
Epoch 9/10
271/271 [==============================] - 13s 50ms/step - loss: 2.9621e-04 - accuracy: 1.0000 - val_loss: 0.5568 - val_accuracy: 0.8854
Epoch 10/10
271/271 [==============================] - 13s 49ms/step - loss: 2.2531e-04 - accuracy: 1.0000 - val_loss: 0.5716 - val_accuracy: 0.8891
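Training accuracy is already 0.9852 at epoch 1, so this cell (In [17]) most likely re-ran fit on weights from an earlier run; either way, the gap between the perfect training accuracy and the roughly 0.89 validation accuracy points to overfitting. One untested variant (a sketch, not the model trained above) adds dropout before the dense head:

model = tf.keras.Sequential([
    tf.keras.layers.Input(shape=input_shape),
    tf.keras.layers.Conv2D(32, (3, 3), activation='relu', padding='same'),
    tf.keras.layers.MaxPooling2D((2, 2)),
    tf.keras.layers.Conv2D(64, (3, 3), activation='relu', padding='same'),
    tf.keras.layers.MaxPooling2D((2, 2)),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dropout(0.5),  # assumption: 0.5 is a common default, not tuned here
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(num_speakers, activation='softmax')
])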
In [18]:
model.save("speaker_model.h5")
converter = tf.lite.TFLiteConverter.from_keras_model(model)
converter.optimizations = [tf.lite.Optimize.DEFAULT]
tflite_model = converter.convert()
with open("speaker_id.tflite", "wb") as f:
f.write(tflite_model)
/home/kalekale/docs/projects/irl/speakerdet/.venv/lib/python3.9/site-packages/keras/src/engine/training.py:3000: UserWarning: You are saving your model as an HDF5 file via `model.save()`. This file format is considered legacy. We recommend using instead the native Keras format, e.g. `model.save('my_model.keras')`.
  saving_api.save_model(
INFO:tensorflow:Assets written to: /tmp/tmpchilkq2e/assets
2025-08-30 22:11:36.184941: W tensorflow/compiler/mlir/lite/python/tf_tfl_flatbuffer_helpers.cc:364] Ignored output_format.
2025-08-30 22:11:36.184968: W tensorflow/compiler/mlir/lite/python/tf_tfl_flatbuffer_helpers.cc:367] Ignored drop_control_dependency.
2025-08-30 22:11:36.185136: I tensorflow/cc/saved_model/reader.cc:45] Reading SavedModel from: /tmp/tmpchilkq2e
2025-08-30 22:11:36.186474: I tensorflow/cc/saved_model/reader.cc:91] Reading meta graph with tags { serve }
2025-08-30 22:11:36.186488: I tensorflow/cc/saved_model/reader.cc:132] Reading SavedModel debug info (if present) from: /tmp/tmpchilkq2e
2025-08-30 22:11:36.189900: I tensorflow/cc/saved_model/loader.cc:231] Restoring SavedModel bundle.
2025-08-30 22:11:36.240650: I tensorflow/cc/saved_model/loader.cc:215] Running initialization op on SavedModel bundle at path: /tmp/tmpchilkq2e
2025-08-30 22:11:36.254593: I tensorflow/cc/saved_model/loader.cc:314] SavedModel load for tags { serve }; Status: success: OK. Took 69456 microseconds.
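A minimal sketch for spot-checking the exported TFLite model with tf.lite.Interpreter; "some_clip.wav" is a placeholder path, and le and preprocess_audio come from the cells above:

interpreter = tf.lite.Interpreter(model_path="speaker_id.tflite")
interpreter.allocate_tensors()
inp = interpreter.get_input_details()[0]
out = interpreter.get_output_details()[0]

# Same preprocessing as training, plus batch and channel axes: (1, 150, 40, 1)
features = preprocess_audio("some_clip.wav")[np.newaxis, ..., np.newaxis].astype(np.float32)
interpreter.set_tensor(inp['index'], features)
interpreter.invoke()
probs = interpreter.get_tensor(out['index'])[0]
print(le.inverse_transform([int(np.argmax(probs))])[0])  # predicted speaker ID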
In [19]:
!tensorflowjs_converter --input_format=keras speaker_model.h5 tfjs_model/
2025-08-30 22:11:41.249041: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT
/home/kalekale/docs/projects/irl/speakerdet/.venv/lib/python3.9/site-packages/tensorflow_hub/__init__.py:61: UserWarning: pkg_resources is deprecated as an API. See https://setuptools.pypa.io/en/latest/pkg_resources.html. The pkg_resources package is slated for removal as early as 2025-11-30. Refrain from using this package or pin to Setuptools<81.
  from pkg_resources import parse_version