Cosine Similarity

Recently, I experimented with a quick way to measure how similar two audio tracks are — not by comparing raw waveforms, but by looking at their spectral patterns.

Using PyTorch and Torchaudio, I load two tracks (downmixed to mono, resampled to 44.1 kHz, and trimmed or zero-padded to a fixed 15 seconds), convert them to decibel-scaled Mel-spectrograms, normalize each one to zero mean and unit variance, and then compute cosine similarity frame-by-frame. This gives me a single average similarity score as well as the option to see how the similarity changes over time.

Cosine similarity measures the angle between two vectors — in this case, the spectral features of each frame. If the angle is small (close to 0°), the vectors point in nearly the same direction, meaning the spectral patterns are alike. A score close to 1.0 means very similar, 0 means unrelated, and negative values mean they differ in an opposing way.

This approach is useful for checking how faithful a processed track is to the original, comparing alternate takes, or quickly spotting differences without having to listen end-to-end.

import torchaudio
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt

# PARAMETERS
# Target sample rate in Hz and clip length in seconds; both are handed to
# load_audio, and the sample rate also configures the Mel-spectrogram.
sr, duration_sec = 44_100, 15

# LOAD TRACKS - Converts to mono if needed
def load_audio(path, sr=44100, duration_sec=15) -> torch.Tensor:
    """Load an audio file as a mono waveform of fixed length.

    The file is read with ``torchaudio.load``, averaged across channels when
    it has more than one, resampled to ``sr`` when its native rate differs,
    and finally zero-padded at the end or truncated so that it holds exactly
    ``round(sr * duration_sec)`` samples.

    Args:
        path: Audio source, forwarded unchanged to ``torchaudio.load``.
        sr: Target sample rate in Hz.
        duration_sec: Target length in seconds. May be fractional; the
            resulting sample count is rounded to the nearest integer.

    Returns:
        The waveform tensor with its second dimension (samples) equal to
        ``round(sr * duration_sec)``. Multi-channel input is reduced to a
        single channel in the first dimension.

    Raises:
        ValueError: If ``sr * duration_sec`` is negative.
    """
    # Pad lengths and slice bounds must be ints, so a fractional duration
    # (e.g. 7.5 s) has to be converted to a whole sample count up front.
    samples_needed = int(round(sr * duration_sec))
    if samples_needed < 0:
        # A negative bound would silently trim samples off the *end* of the
        # clip instead of failing, so reject it before touching the file.
        raise ValueError(
            f"sr * duration_sec must be non-negative, got {sr} * {duration_sec}"
        )

    waveform, sample_rate = torchaudio.load(path)

    # Convert to mono if stereo/multi-channel; keepdim preserves the
    # two-dimensional layout so the indexing below still applies.
    if waveform.shape[0] > 1:
        waveform = waveform.mean(dim=0, keepdim=True)

    # Resample if sample rate is different
    if sample_rate != sr:
        resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=sr)
        waveform = resampler(waveform)

    # Trim or pad to fixed duration
    missing = samples_needed - waveform.shape[1]
    if missing > 0:
        # Pad only on the right (end of the clip) with zeros.
        return F.pad(waveform, (0, missing))
    return waveform[:, :samples_needed]

# Both files go through the same loader with the same rate and duration.
trackA, trackB = (
    load_audio(wav_path, sr, duration_sec)
    for wav_path in ("monoTrackA.wav", "monoTrackB.wav")
)

# COMPUTE MEL-SPECTROGRAMS
# Two-stage pipeline: Mel-spectrogram, then amplitude-to-dB conversion.
mel_transform = torchaudio.transforms.MelSpectrogram(
    sample_rate=sr,
    n_fft=2048,
    hop_length=512,
    n_mels=128,
)
db_transform = torchaudio.transforms.AmplitudeToDB()

# Apply both stages to each waveform, track A first.
mel_trackA, mel_trackB = (
    db_transform(mel_transform(track)) for track in (trackA, trackB)
)

# SHAPE ALIGNMENT
# Truncate both spectrograms along the last axis to the shorter length so
# they can be compared position by position.
min_len = min(mel_trackA.size(-1), mel_trackB.size(-1))
mel_trackA, mel_trackB = (
    spec[..., :min_len] for spec in (mel_trackA, mel_trackB)
)

# NORMALIZATION (zero mean, unit variance)
# Each spectrogram is standardized with its own global mean and standard
# deviation. A constant spectrogram (e.g. from a silent track) has a
# standard deviation of zero, which would make the division 0/0 and fill the
# tensor with NaN; clamping the divisor to a tiny positive floor yields
# all-zeros instead and leaves every non-degenerate input unchanged.
mel_trackA = (mel_trackA - mel_trackA.mean()) / mel_trackA.std().clamp(min=1e-8)
mel_trackB = (mel_trackB - mel_trackB.mean()) / mel_trackB.std().clamp(min=1e-8)

# COSINE SIMILARITY PER FRAME
# Swap the last two axes so the final axis holds the Mel bins, then take the
# cosine similarity along that axis: one value per time frame.
cos_vals = F.cosine_similarity(
    mel_trackA.permute(0, 2, 1),  # (1, time, n_mels)
    mel_trackB.permute(0, 2, 1),  # (1, time, n_mels)
    dim=-1,
)

# Collapse the per-frame values into a single Python float for reporting.
mean_cos_sim = cos_vals.mean().item()
print(f"Mean cosine similarity over time frames: {mean_cos_sim:.4f}")

# VISUALIZATION
def plot_mel(
    mel_tensor: torch.Tensor,
    title: str,
    *,
    colorbar_format: str = "%+2.0f dB",
) -> None:
    """Draw one spectrogram as a heat map on the current matplotlib axes.

    Args:
        mel_tensor: Spectrogram tensor; a leading dimension of size 1 is
            squeezed away before plotting, leaving (n_mels, time_frames).
        title: Title shown above the axes.
        colorbar_format: printf-style format for the colorbar tick labels.
            The default appends "dB"; pass a unitless format when the
            values are not decibels (for example after standardization).
    """
    mel = mel_tensor.squeeze(0).detach().cpu()  # shape: (n_mels, time_frames)
    # origin='lower' puts the first Mel bin at the bottom of the image.
    plt.imshow(mel.numpy(), aspect='auto', origin='lower', cmap='magma')
    plt.title(title)
    plt.ylabel("Mel bins")
    plt.xlabel("Time frames")
    plt.colorbar(format=colorbar_format)


# By this point mel_trackA / mel_trackB have been standardized to zero mean
# and unit variance, so the plotted values are unitless z-scores rather than
# decibels. Label the figure accordingly instead of using the "dB" default.
plt.figure(figsize=(14, 5))
plt.subplot(1, 2, 1)
plot_mel(
    mel_trackA,
    "Track A - Mel Spectrogram (normalized)",
    colorbar_format="%+.1f",
)
plt.subplot(1, 2, 2)
plot_mel(
    mel_trackB,
    "Track B - Mel Spectrogram (normalized)",
    colorbar_format="%+.1f",
)
plt.tight_layout()
plt.show()

[1] Cosine similarity