Analysing Attack Times

Python. I used the spectrogram to detect attack times in a single-line melody by shaping the signal's power graph. It works best with slow passages in which the notes are clearly separable from each other. I wrote the algorithm using the signal's total power as the reference. Basically, it reshapes the total power with a smoothing algorithm and finds the attack time of each note based on a threshold value. One of the limitations is that the reshaping algorithm might fail for fast passages, and/or choosing the threshold value could be unclear. I will improve the reshaping strategy later on.

import os
import numpy as np
import scipy.io.wavfile as wavfile
import matplotlib.pyplot as plt
from scipy.signal import spectrogram

def analyseAttackTimes(fileName):
    """Detect note attack times in a mono, 44.1 kHz WAV file.

    The total spectral power of every spectrogram frame is normalized by its
    maximum, smoothed with ``reshapeSignal`` and thresholded with
    ``findAttackTimeIndexes``. The raw and the smoothed power curves are
    plotted in a two-row figure, which is shown before the function returns.

    Args:
        fileName: Path of the WAV file to analyse.

    Returns:
        A list of attack times in seconds, one per detected onset.

    Raises:
        ValueError: If the file does not exist, is not mono, or is not
            sampled at 44100 Hz.
    """
    if not os.path.isfile(fileName):
        raise ValueError("Input file is not valid")

    # fs: sampling rate of the wav file, x: array of samples
    fs, x = wavfile.read(fileName)

    # We only accept mono and 44.1khz audio files for this implementation
    if x.ndim != 1:
        raise ValueError("Audio file is not mono")
    if fs != 44100:
        raise ValueError("Sampling rate of input sound is not 44100")

    # Get the total power, normalize it in the range [0, 1], and reshape
    _, T, P = spectrogram(x, fs, nperseg=512)
    totalPower = np.sum(P, axis=0)
    normTotalPower = totalPower / np.max(totalPower)
    reshaped = reshapeSignal(normTotalPower.copy())

    # spectrogram already returns the time of every frame in T, so look the
    # attack frames up there instead of rescaling the frame index by the
    # signal duration, which only approximates the frame position.
    attackTimes = [float(T[m]) for m in findAttackTimeIndexes(reshaped)]

    # Plot both curves against the same frame times used for the result, so
    # the reported attack times line up with the figure.
    _, (powerAxis, reshapedAxis) = plt.subplots(
        2, 1, figsize=(10, 8), dpi=300, sharex=True)

    powerAxis.plot(T, normTotalPower, color='blue')
    powerAxis.text(0.95, 0.95, 'Total Power', ha='right', va='top',
                   fontsize=15, transform=powerAxis.transAxes)

    reshapedAxis.plot(T, reshaped, color='red')
    reshapedAxis.set_xlabel('Time [sec]')
    reshapedAxis.text(0.95, 0.95, 'Reshaped Total Power', ha='right', va='top',
                      fontsize=15, transform=reshapedAxis.transAxes)

    plt.tight_layout()
    plt.show()

    return attackTimes

def reshapeSignal(signal, cycle = 2):
    """Smooth ``signal`` in place with repeated neighbour averaging.

    On every pass, each sample from index 2 up to the second-to-last one is
    overwritten with the mean of its two neighbours. The update is done in
    place, so the left neighbour used for a sample is the value that was
    already smoothed earlier in the same pass. Indexes 0 and 1 and the last
    sample are never modified.

    Args:
        signal: Mutable, indexable sequence of numbers; it is modified.
        cycle: Number of smoothing passes to run.

    Returns:
        The same ``signal`` object that was passed in.
    """
    for _ in range(cycle):
        stop = len(signal) - 1
        for pos in range(2, stop):
            before, after = signal[pos - 1], signal[pos + 1]
            signal[pos] = (before + after) / 2

    return signal

def findAttackTimeIndexes(signal, thresholdValue = 0.3):
    """Return the indexes at which ``signal`` rises to the threshold.

    The threshold is ``thresholdValue`` times the maximum of ``signal``. An
    index is reported when a sample is at or above the threshold while the
    scan is not already inside an attack; the scan leaves the attack state
    again at the first sample that falls below the threshold.

    Args:
        signal: Sequence of numbers (list or NumPy array).
        thresholdValue: Fraction of the signal maximum used as the threshold.

    Returns:
        A list of integer indexes, one per detected attack. An empty
        ``signal`` yields an empty list.
    """
    # np.max raises ValueError on an empty sequence; nothing to detect there.
    if len(signal) == 0:
        return []

    threshold = thresholdValue * np.max(signal)
    timeIndexes = []
    isAttackTime = False

    for i, value in enumerate(signal):
        if value >= threshold and not isAttackTime:
            timeIndexes.append(i)
            isAttackTime = True
        elif value < threshold and isAttackTime:
            isAttackTime = False

    return timeIndexes

if __name__ == "__main__":
    import sys

    # Analyse the WAV file given as the first command-line argument; when no
    # argument is supplied, fall back to the original sample path.
    wavPath = sys.argv[1] if len(sys.argv) > 1 else 'C:/Users/Hakan/Desktop/Sounds/piano.wav'
    result = analyseAttackTimes(wavPath)
    print(result)

import os

import numpy as np

import scipy.io.wavfile as wavfile

import matplotlib.pyplot as plt

from scipy.signal import spectrogram

def analyseAttackTimes(fileName):
    """Detect note attack times in a mono, 44.1 kHz WAV file.

    The total spectral power of every spectrogram frame is normalized by its
    maximum, smoothed with ``reshapeSignal`` and thresholded with
    ``findAttackTimeIndexes``. The raw and the smoothed power curves are
    plotted in a two-row figure, which is shown before the function returns.

    Args:
        fileName: Path of the WAV file to analyse.

    Returns:
        A list of attack times in seconds, one per detected onset.

    Raises:
        ValueError: If the file does not exist, is not mono, or is not
            sampled at 44100 Hz.
    """
    if not os.path.isfile(fileName):
        raise ValueError("Input file is not valid")

    # fs: sampling rate of the wav file, x: array of samples
    fs, x = wavfile.read(fileName)

    # We only accept mono and 44.1khz audio files for this implementation
    if x.ndim != 1:
        raise ValueError("Audio file is not mono")
    if fs != 44100:
        raise ValueError("Sampling rate of input sound is not 44100")

    # Get the total power, normalize it in the range [0, 1], and reshape
    _, T, P = spectrogram(x, fs, nperseg=512)
    totalPower = np.sum(P, axis=0)
    normTotalPower = totalPower / np.max(totalPower)
    reshaped = reshapeSignal(normTotalPower.copy())

    # spectrogram already returns the time of every frame in T, so look the
    # attack frames up there instead of rescaling the frame index by the
    # signal duration, which only approximates the frame position.
    attackTimes = [float(T[m]) for m in findAttackTimeIndexes(reshaped)]

    # Plot both curves against the same frame times used for the result, so
    # the reported attack times line up with the figure.
    _, (powerAxis, reshapedAxis) = plt.subplots(
        2, 1, figsize=(10, 8), dpi=300, sharex=True)

    powerAxis.plot(T, normTotalPower, color='blue')
    powerAxis.text(0.95, 0.95, 'Total Power', ha='right', va='top',
                   fontsize=15, transform=powerAxis.transAxes)

    reshapedAxis.plot(T, reshaped, color='red')
    reshapedAxis.set_xlabel('Time [sec]')
    reshapedAxis.text(0.95, 0.95, 'Reshaped Total Power', ha='right', va='top',
                      fontsize=15, transform=reshapedAxis.transAxes)

    plt.tight_layout()
    plt.show()

    return attackTimes

def reshapeSignal(signal, cycle = 2):
    """Smooth ``signal`` in place with repeated neighbour averaging.

    On every pass, each sample from index 2 up to the second-to-last one is
    overwritten with the mean of its two neighbours. The update is done in
    place, so the left neighbour used for a sample is the value that was
    already smoothed earlier in the same pass. Indexes 0 and 1 and the last
    sample are never modified.

    Args:
        signal: Mutable, indexable sequence of numbers; it is modified.
        cycle: Number of smoothing passes to run.

    Returns:
        The same ``signal`` object that was passed in.
    """
    for _ in range(cycle):
        for j in range(2, len(signal) - 1):
            signal[j] = (signal[j - 1] + signal[j + 1]) / 2

    return signal

def findAttackTimeIndexes(signal, thresholdValue = 0.3):
    """Return the indexes at which ``signal`` rises to the threshold.

    The threshold is ``thresholdValue`` times the maximum of ``signal``. An
    index is reported when a sample is at or above the threshold while the
    scan is not already inside an attack; the scan leaves the attack state
    again at the first sample that falls below the threshold.

    Args:
        signal: Sequence of numbers (list or NumPy array).
        thresholdValue: Fraction of the signal maximum used as the threshold.

    Returns:
        A list of integer indexes, one per detected attack. An empty
        ``signal`` yields an empty list.
    """
    # np.max raises ValueError on an empty sequence; nothing to detect there.
    if len(signal) == 0:
        return []

    threshold = thresholdValue * np.max(signal)
    timeIndexes = []
    isAttackTime = False

    for i, value in enumerate(signal):
        if value >= threshold and not isAttackTime:
            timeIndexes.append(i)
            isAttackTime = True
        elif value < threshold and isAttackTime:
            isAttackTime = False

    return timeIndexes

if __name__ == "__main__":
    import sys

    # Analyse the WAV file given as the first command-line argument; when no
    # argument is supplied, fall back to the original sample path.
    wavPath = sys.argv[1] if len(sys.argv) > 1 else 'C:/Users/Hakan/Desktop/Sounds/piano.wav'
    result = analyseAttackTimes(wavPath)
    print(result)

For demonstration, I used a simple piano segment: a single melody line of 5 notes. The STFT parameters and the threshold can be changed. The output is the list of detected attack times. For this simple melody line:

0.04 sec
0.85 sec
1.04 sec
1.55 sec
2.08 sec

References:
[1] Mathematics of the Discrete Fourier Transform, Julius O. Smith III

[2] Scipy Signal Spectrogram Documentation