Любая помощь будет высоко ценится. Я пытаюсь извлечь BPM из любого файла .wav, который загружается в сценарий python, с помощью библиотеки pyAudioAnalysis. По какой-то причине это не выводит правильный BPM? Я пытался изменить размер окна в функции beat_extraction (), но он разрешает только числа менее 1 секунды, и когда я изменяю размер окна, кажется, что BPM меняется. Но если держать его в окне длительностью 1 секунда, он каждый раз выдает 30 символов.
Ниже приведен мой код:
from pyAudioAnalysis import audioBasicIO
from pyAudioAnalysis import ShortTermFeatures
from pyAudioAnalysis import MidTermFeatures
import matplotlib.pyplot as plt
import numpy as np
import warnings
file_name = "unity_alan.wav"
# Extract the Sampling Rate (Fs) and the Raw Signal Data (signal)
[Fs, signal] = audioBasicIO.read_audio_file(file_name)
# Uncomment if signal has two channels
# signal = signal[:,0]
# Function to Normalize the Signal
def normalize_signal(signal):
signal = np.double(signal)
return (signal - signal.mean()) / ((np.abs(signal)).max() + 0.0000000001)
# Short Term Features
# For each short-term window a set of features is extracted. This would result in a sequence of feature
# vectors stored in a np matrix (in this case Features_midTerm)
signal = normalize_signal(signal)
# Fs - frequency sampling Rate
print("The Sampling Freq: ",Fs)
# Total Signal Len
signal_len = len(signal)
print("Total Signal Length: ",signal_len)
# Total Time Len of Song in Seconds
len_of_song = signal_len / Fs
print("Total Song Time (s): ",len_of_song)
# Window Size in mseconds
windowSize_in_ms = 50
windowSize_in_s = windowSize_in_ms/1000
# Window Size in Samples
windowSize_in_samples = Fs * (windowSize_in_ms / 1000) #divide by 1000 to turn to seconds
print("Window Size (samples): ",windowSize_in_samples)
# Window Step in mseconds
wStep_in_ms = 25
# Window Step in Samples
wStep_in_samples = Fs * (wStep_in_ms / 1000) #divide by 1000 to turn to seconds
print("Window Step in Samples: ", wStep_in_samples)
# Oversampling Percentage
oversampling_Percentage = (wStep_in_ms / windowSize_in_ms) * 100
print("Oversampling Percentage (overlap of windows): ",oversampling_Percentage)
# Total Number of Windows in Signal
total_number_windows = signal_len / windowSize_in_samples
print("Total Number of Windows in Signal: ",total_number_windows)
# Total number of feature samples produced (windows/size * total windows)
feature_samples_points_total_cal = int(total_number_windows * (windowSize_in_ms/wStep_in_ms))
print("Calculated Total of Points Produced per Short Term Feature: ",feature_samples_points_total_cal)
# Extract features and their names. Each index has its own vector of features. The total number should be the same
# as the calculated total of points produced per feature.
Features_shortTerm, feature_names = ShortTermFeatures.feature_extraction(signal, Fs, windowSize_in_samples, wStep_in_samples)
# Exact Number of points in the Features
feature_samples_points_total_exact = len(Features_shortTerm[0])
print("Exact Total of Points Produced per Short Term Feature: ",feature_samples_points_total_exact)
# Mid-window (in seconds)
mid_window_seconds = int(1 * Fs)
# Mid-step (in seconds)
mid_step_seconds = int(1 * Fs)
# MID FEATURE Extraction
Features_midTerm, short_Features_ignore, mid_feature_names = MidTermFeatures.mid_feature_extraction(signal,Fs,mid_window_seconds,mid_step_seconds,windowSize_in_samples,wStep_in_samples)
# Exact Mid-Term Feature Total Number of Points
midTerm_features_total_points = len(Features_midTerm)
print("Exact Mid-Term Total Number of Feature Points: ",midTerm_features_total_points)
# Beats per min
# The Tempo of music determins the speed at which it is played (measured in BPM)
bpm,confidence_ratio = MidTermFeatures.beat_extraction(Features_shortTerm,1)
print("Beats per Minute (bpm): ",bpm)
print("Confidence ratio for BPM: ", confidence_ratio)
# Figure out why the BPM does not match the actual reading
# of 115 BPM. It is showing 30 BPM which is for sure wrong.
Вывод моего сценария выглядит следующим образом:
The Sampling Freq: 48000
Total Signal Length: 10992884
Total Song Time (s): 229.01841666666667
Window Size (samples): 2400.0
Window Step in Samples: 1200.0
Oversampling Percentage (overlap of windows): 50.0
Total Number of Windows in Signal: 4580.368333333333
Calculated Total of Points Produced per Short Term Feature: 9160
Exact Total of Points Produced per Short Term Feature: 9159
Exact Mid-Term Total Number of Feature Points: 136
Beats per Minute (bpm): 30.0
Confidence ratio for BPM: 1.0
Библиотека по определению функции выглядит следующим образом:
def beat_extraction(short_features, window_size, plot=False):
"""
This function extracts an estimate of the beat rate for a musical signal.
ARGUMENTS:
- short_features: a np array (n_feats x numOfShortTermWindows)
- window_size: window size in seconds
RETURNS:
- bpm: estimates of beats per minute
- ratio: a confidence measure
"""
# Features that are related to the beat tracking task:
selected_features = [0, 1, 3, 4, 5, 6, 7, 8, 9, 10,
11, 12, 13, 14, 15, 16, 17, 18]
max_beat_time = int(round(2.0 / window_size))
hist_all = np.zeros((max_beat_time,))
# for each feature
for ii, i in enumerate(selected_features):
# dif threshold (3 x Mean of Difs)
dif_threshold = 2.0 * (np.abs(short_features[i, 0:-1] -
short_features[i, 1::])).mean()
if dif_threshold <= 0:
dif_threshold = 0.0000000000000001
# detect local maxima
[pos1, _] = utilities.peakdet(short_features[i, :], dif_threshold)
position_diffs = []
# compute histograms of local maxima changes
for j in range(len(pos1)-1):
position_diffs.append(pos1[j+1]-pos1[j])
histogram_times, histogram_edges = \
np.histogram(position_diffs, np.arange(0.5, max_beat_time + 1.5))
hist_centers = (histogram_edges[0:-1] + histogram_edges[1::]) / 2.0
histogram_times = \
histogram_times.astype(float) / short_features.shape[1]
hist_all += histogram_times
if plot:
plt.subplot(9, 2, ii + 1)
plt.plot(short_features[i, :], 'k')
for k in pos1:
plt.plot(k, short_features[i, k], 'k*')
f1 = plt.gca()
f1.axes.get_xaxis().set_ticks([])
f1.axes.get_yaxis().set_ticks([])
if plot:
plt.show(block=False)
plt.figure()
# Get beat as the argmax of the agregated histogram:
max_indices = np.argmax(hist_all)
bpms = 60 / (hist_centers * window_size)
bpm = bpms[max_indices]
# ... and the beat ratio:
ratio = hist_all[max_indices] / hist_all.sum()
if plot:
# filter out >500 beats from plotting:
hist_all = hist_all[bpms < 500]
bpms = bpms[bpms < 500]
plt.plot(bpms, hist_all, 'k')
plt.xlabel('Beats per minute')
plt.ylabel('Freq Count')
plt.show(block=True)
return bpm, ratio