Cosine Similarity
Recently, I experimented with a quick way to measure how similar two audio tracks are — not by comparing raw waveforms, but by looking at their spectral patterns.
Using PyTorch and Torchaudio, I load two tracks (mono, 44.1 kHz, fixed 15 seconds), convert them to Mel-spectrograms on a decibel scale, normalize them, and then compute cosine similarity frame-by-frame. This gives me a single average similarity score as well as the option to see how the similarity changes over time.
Cosine similarity measures the cosine of the angle between two vectors — in this case, the spectral features of each frame. If the angle is small (close to 0°), the vectors point in nearly the same direction, meaning the spectral patterns are alike. A score close to 1.0 means very similar, 0 means unrelated, and negative values mean they differ in an opposing way.
This approach is useful for checking how faithful a processed track is to the original, comparing alternate takes, or quickly spotting differences without having to listen end-to-end.
import torchaudio
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt
# PARAMETERS
sr = 44_100  # target sample rate in Hz; every track is loaded at this rate
duration_sec = 15  # length in seconds that every track is trimmed or padded to
# LOAD TRACKS - Converts to mono if needed
def load_audio(path, sr=44100, duration_sec=15):
    """Load an audio file as a mono waveform of fixed length.

    The file is read with ``torchaudio.load``, mixed down to one channel by
    averaging, resampled to ``sr`` when its native rate differs, and finally
    trimmed or right-padded with zeros to ``duration_sec`` seconds.

    Args:
        path: Location of the audio file, passed straight to
            ``torchaudio.load``.
        sr: Target sample rate in Hz.
        duration_sec: Target duration in seconds. Fractional values (for
            example ``2.5`` or ``15.0``) are accepted; the resulting sample
            count is rounded to the nearest integer.

    Returns:
        The waveform tensor with a single channel and
        ``round(sr * duration_sec)`` samples along its second dimension.
    """
    waveform, sample_rate = torchaudio.load(path)

    # Mix down to mono by averaging channels; keepdim keeps the channel axis
    # so the result still has two dimensions.
    if waveform.shape[0] > 1:
        waveform = waveform.mean(dim=0, keepdim=True)

    # Resample only when the file's native rate differs from the target.
    if sample_rate != sr:
        resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=sr)
        waveform = resampler(waveform)

    # Padding and slicing both require an integer sample count. Without the
    # conversion, a float duration would make sr * duration_sec a float and
    # the calls below would raise TypeError.
    samples_needed = int(round(sr * duration_sec))
    num_samples = waveform.shape[1]
    if num_samples < samples_needed:
        # Too short: append zeros (silence) at the end.
        waveform = torch.nn.functional.pad(waveform, (0, samples_needed - num_samples))
    else:
        # Long enough: keep only the leading samples.
        waveform = waveform[:, :samples_needed]
    return waveform
# Load both tracks with identical settings (track A first, then track B).
trackA, trackB = (
    load_audio(track_path, sr, duration_sec)
    for track_path in ("monoTrackA.wav", "monoTrackB.wav")
)

# COMPUTE MEL-SPECTROGRAMS
# Each waveform goes through the mel transform and is then passed to
# AmplitudeToDB (constructed with its default settings).
mel_transform = torchaudio.transforms.MelSpectrogram(
    sample_rate=sr,
    n_fft=2048,
    hop_length=512,
    n_mels=128,
)
db_transform = torchaudio.transforms.AmplitudeToDB()
mel_trackA, mel_trackB = (
    db_transform(mel_transform(track)) for track in (trackA, trackB)
)

# SHAPE ALIGNMENT
# Crop both spectrograms along their last axis to the shorter of the two so
# they can be compared element-wise.
min_len = min(spec.shape[-1] for spec in (mel_trackA, mel_trackB))
mel_trackA, mel_trackB = mel_trackA[..., :min_len], mel_trackB[..., :min_len]
# NORMALIZATION (zero mean, unit variance)
# Each spectrogram is standardized with its own global mean and std. The std
# is clamped away from zero so that a constant spectrogram (for example a
# silent track) becomes all zeros instead of NaN from a 0/0 division, which
# would otherwise turn the final mean score into NaN.
mel_trackA = (mel_trackA - mel_trackA.mean()) / mel_trackA.std().clamp(min=1e-8)
mel_trackB = (mel_trackB - mel_trackB.mean()) / mel_trackB.std().clamp(min=1e-8)

# COSINE SIMILARITY PER FRAME
# Move the mel axis last so every time frame is one feature vector, then
# compare the two tracks frame by frame along that axis.
cos_vals = F.cosine_similarity(
    mel_trackA.transpose(1, 2),  # (1, time, n_mels)
    mel_trackB.transpose(1, 2),  # (1, time, n_mels)
    dim=2,
)

# Collapse the per-frame scores into a single summary number.
mean_cos_sim = cos_vals.mean().item()
print(f"Mean cosine similarity over time frames: {mean_cos_sim:.4f}")
# VISUALIZATION
def plot_mel(mel_tensor: torch.Tensor, title: str) -> None:
    """Draw a spectrogram tensor as a heat map using pyplot.

    Args:
        mel_tensor: Spectrogram to display. Its leading dimension is squeezed
            away, so it is expected to look like (1, n_mels, time_frames).
        title: Text shown above the image.
    """
    # Drop the leading axis and move to a detached CPU NumPy array for imshow.
    image = mel_tensor.squeeze(0)
    image = image.detach().cpu().numpy()  # shape: (n_mels, time_frames)

    # origin='lower' puts the first mel bin at the bottom of the image.
    plt.imshow(image, aspect='auto', origin='lower', cmap='magma')
    plt.xlabel("Time frames")
    plt.ylabel("Mel bins")
    plt.title(title)
    plt.colorbar(format="%+2.0f dB")
# Show both spectrograms side by side for a quick visual comparison.
# NOTE(review): the tensors plotted here were standardized earlier in the
# script, so the colorbar values are z-scores even though plot_mel labels
# them "dB".
plt.figure(figsize=(14, 5))
plt.subplot(1, 2, 1)
plot_mel(mel_trackA, "Track A - Mel Spectrogram")
plt.subplot(1, 2, 2)
plot_mel(mel_trackB, "Track B - Mel Spectrogram")
plt.tight_layout()
plt.show()
# [1] Cosine similarity