Neural Audio Fingerprinting
This is a basic implementation of neural audio fingerprinting using PyTorch and torchaudio. It is meant as a starting point for tasks such as music identification, audio search, and audio retrieval; the CNN architecture, sample rate, and input duration can all be adjusted for experimentation or better performance. Fingerprints can be compared with cosine similarity or Euclidean distance, depending on the application (a small comparison sketch follows the script below). The pipeline covers:
- Audio loading and pre-processing (resampling, fixed-length cropping or padding)
- Mel spectrogram extraction as input features
- A simple convolutional neural network (CNN) that produces compact embeddings
- L2 normalization for a stable, consistent fingerprint representation
If you’d like to support my audio content, you can find me here:
https://buymeacoffee.com/hakanyurdakul
import torch
import torch.nn as nn
import torchaudio
import torchaudio.transforms as T
import matplotlib.pyplot as plt
SAMPLE_RATE = 16000  # target sample rate in Hz
DURATION = 5  # fixed clip length in seconds
NUM_SAMPLES = SAMPLE_RATE * DURATION
class AudioFingerprintNet(nn.Module):
    def __init__(self, embedding_dim=128):
        super().__init__()
        # Three conv blocks; global average pooling collapses the remaining
        # time/frequency grid to a single 64-dimensional vector.
        self.conv = nn.Sequential(
            nn.Conv2d(1, 16, 3, padding=1), nn.ReLU(), nn.MaxPool2d(2),
            nn.Conv2d(16, 32, 3, padding=1), nn.ReLU(), nn.MaxPool2d(2),
            nn.Conv2d(32, 64, 3, padding=1), nn.ReLU(),
            nn.AdaptiveAvgPool2d((1, 1)),
        )
        self.fc = nn.Linear(64, embedding_dim)

    def forward(self, x):
        x = self.conv(x).view(x.size(0), -1)  # flatten to (batch, 64)
        x = self.fc(x)
        return x / x.norm(p=2, dim=1, keepdim=True)  # L2 normalize
def load_audio(input_path):
    waveform, sr = torchaudio.load(input_path, format="wav")
    if waveform.size(0) > 1:  # downmix stereo to mono so Conv2d sees one channel
        waveform = waveform.mean(dim=0, keepdim=True)
    if sr != SAMPLE_RATE:
        waveform = T.Resample(orig_freq=sr, new_freq=SAMPLE_RATE)(waveform)
    # Crop or zero-pad to exactly NUM_SAMPLES for a fixed-size input.
    if waveform.size(1) > NUM_SAMPLES:
        waveform = waveform[:, :NUM_SAMPLES]
    else:
        waveform = nn.functional.pad(waveform, (0, NUM_SAMPLES - waveform.size(1)))
    return waveform
def get_mel(waveform):
    mel = T.MelSpectrogram(sample_rate=SAMPLE_RATE, n_mels=64)(waveform)
    mel_db = T.AmplitudeToDB()(mel)  # convert power to a log (dB) scale
    return mel_db.unsqueeze(0)  # add batch dim -> (1, 1, n_mels, time)
def generate_fingerprint(input_path, model=None):
    waveform = load_audio(input_path)
    mel = get_mel(waveform)
    if model is None:
        # Fresh random weights on every call; pass in a shared (ideally trained)
        # model so that different files are embedded consistently.
        model = AudioFingerprintNet()
    model.eval()
    with torch.no_grad():
        embedding = model(mel)
    return embedding.squeeze().numpy()
if __name__ == "__main__":
    fingerprint = generate_fingerprint("yourfile.wav")
    print(fingerprint.shape)  # (128,)
    plt.plot(fingerprint)
    plt.title("Audio Fingerprint Vector")
    plt.xlabel("Dimension Index")
    plt.ylabel("Value")
    plt.show()
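As noted in the introduction, fingerprints can be compared with cosine similarity or Euclidean distance. Below is a minimal sketch, assuming the script above is in scope; the file names are placeholders. Because the embeddings are L2-normalized, cosine similarity reduces to a plain dot product.

import numpy as np

def cosine_similarity(fp_a, fp_b):
    # Embeddings are already L2-normalized, so the dot product
    # equals the cosine similarity.
    return float(np.dot(fp_a, fp_b))

def euclidean_distance(fp_a, fp_b):
    return float(np.linalg.norm(fp_a - fp_b))

model = AudioFingerprintNet()  # share one model so both files are embedded consistently
fp1 = generate_fingerprint("query.wav", model)      # placeholder file name
fp2 = generate_fingerprint("candidate.wav", model)  # placeholder file name
print(cosine_similarity(fp1, fp2))   # closer to 1.0 means more similar
print(euclidean_distance(fp1, fp2))  # closer to 0.0 means more similar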
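For the audio-search use case mentioned above, the simplest option is brute-force nearest-neighbor lookup over stored fingerprints. This is only a sketch, not part of the original script; the database dict (track name -> fingerprint array) is hypothetical.

import numpy as np

def best_match(query_fp, database):
    # database: dict mapping track name -> fingerprint (np.ndarray).
    # With L2-normalized fingerprints, the largest dot product wins.
    return max(database, key=lambda name: float(np.dot(query_fp, database[name])))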