Cosine Similarity
Recently, I experimented with a quick way to measure how similar two audio tracks are — not by comparing raw waveforms, but by looking at their spectral patterns.
Using PyTorch and Torchaudio, I load two tracks (mono, 44.1 kHz, fixed 15 seconds), convert them to Mel-spectrograms, normalize them, and then compute cosine similarity frame-by-frame. This gives me a single average similarity score as well as the option to see how the similarity changes over time.
Cosine similarity measures the angle between two vectors — in this case, the spectral features of each frame. If the angle is small (close to 0°), the vectors point in nearly the same direction, meaning the spectral patterns are alike. A score close to 1.0 means very similar, 0 means unrelated, and negative values mean they differ in an opposing way. Negative scores are possible here because each spectrogram is mean-centered before the comparison, so frame vectors can point in opposite directions.
This approach is useful for checking how faithful a processed track is to the original, comparing alternate takes, or quickly spotting differences without having to listen end-to-end.
import matplotlib.pyplot as plt
import torch
import torch.nn.functional as F
import torchaudio

# PARAMETERS
sr = 44100  # target sample rate in Hz
duration_sec = 15  # every track is trimmed or zero-padded to this many seconds


# LOAD TRACKS - Converts to mono if needed
def load_audio(path, sr=44100, duration_sec=15):
    """Load an audio file as a mono waveform of fixed length.

    Args:
        path: Audio file to read with ``torchaudio.load``.
        sr: Target sample rate in Hz; the audio is resampled when the
            file's own rate differs.
        duration_sec: Target duration in seconds (may be fractional).

    Returns:
        A tensor of shape ``(1, round(sr * duration_sec))``: shorter
        inputs are zero-padded at the end, longer ones are truncated.
    """
    waveform, sample_rate = torchaudio.load(path)

    # Convert to mono if stereo/multi-channel
    if waveform.shape[0] > 1:
        waveform = waveform.mean(dim=0, keepdim=True)

    # Resample if sample rate is different
    if sample_rate != sr:
        resampler = torchaudio.transforms.Resample(
            orig_freq=sample_rate, new_freq=sr
        )
        waveform = resampler(waveform)

    # Trim or pad to fixed duration. Rounding to int keeps slicing and
    # padding valid when duration_sec is fractional (e.g. 7.5).
    samples_needed = int(round(sr * duration_sec))
    missing = samples_needed - waveform.shape[1]
    if missing > 0:
        waveform = F.pad(waveform, (0, missing))
    else:
        waveform = waveform[:, :samples_needed]

    return waveform


def _standardize(spec, eps=1e-8):
    """Return *spec* shifted to zero mean and scaled to unit variance.

    The statistics are taken over all elements of the tensor.
    """
    # A constant spectrogram (for instance from a fully silent track) has a
    # standard deviation of zero; clamping the divisor yields zeros instead
    # of NaN, so the similarity below stays well defined.
    return (spec - spec.mean()) / spec.std().clamp(min=eps)


trackA = load_audio("monoTrackA.wav", sr, duration_sec)
trackB = load_audio("monoTrackB.wav", sr, duration_sec)

# COMPUTE MEL-SPECTROGRAMS
mel_transform = torchaudio.transforms.MelSpectrogram(
    sample_rate=sr,
    n_fft=2048,
    hop_length=512,
    n_mels=128,
)
db_transform = torchaudio.transforms.AmplitudeToDB()

mel_trackA = db_transform(mel_transform(trackA))
mel_trackB = db_transform(mel_transform(trackB))

# SHAPE ALIGNMENT
min_len = min(mel_trackA.shape[-1], mel_trackB.shape[-1])
mel_trackA = mel_trackA[..., :min_len]
mel_trackB = mel_trackB[..., :min_len]

# Keep the dB-scaled spectrograms for plotting: the colorbar is labelled in
# dB, which would be wrong for the standardized tensors computed next.
mel_db_trackA = mel_trackA
mel_db_trackB = mel_trackB

# NORMALIZATION (zero mean, unit variance)
mel_trackA = _standardize(mel_trackA)
mel_trackB = _standardize(mel_trackB)

# COSINE SIMILARITY PER FRAME
cos_vals = F.cosine_similarity(
    mel_trackA.transpose(1, 2),  # (1, time, n_mels)
    mel_trackB.transpose(1, 2),  # (1, time, n_mels)
    dim=2,
)  # one similarity value per time frame

mean_cos_sim = cos_vals.mean().item()
print(f"Mean cosine similarity over time frames: {mean_cos_sim:.4f}")


# VISUALIZATION
def plot_mel(mel_tensor: torch.Tensor, title: str):
    """Draw a dB-scaled Mel spectrogram on the current Matplotlib axes.

    Args:
        mel_tensor: Spectrogram with a leading singleton dimension that is
            squeezed away before plotting.
        title: Title shown above the image.
    """
    mel = mel_tensor.squeeze(0).detach().cpu()  # shape: (n_mels, time_frames)
    plt.imshow(mel.numpy(), aspect='auto', origin='lower', cmap='magma')
    plt.title(title)
    plt.ylabel("Mel bins")
    plt.xlabel("Time frames")
    plt.colorbar(format="%+2.0f dB")


plt.figure(figsize=(14, 5))

plt.subplot(1, 2, 1)
plot_mel(mel_db_trackA, "Track A - Mel Spectrogram")

plt.subplot(1, 2, 2)
plot_mel(mel_db_trackB, "Track B - Mel Spectrogram")

plt.tight_layout()
plt.show()

# [1] Cosine similarity