Обнаружение BPM от pyAudioAnalysis производит неправильное количество ударов для любого сигнала - PullRequest
2 голосов
/ 04 апреля 2020

Любая помощь будет высоко ценится. Я пытаюсь извлечь BPM из любого файла .wav, который загружается в сценарий python, с помощью библиотеки pyAudioAnalysis. По какой-то причине это не выводит правильный BPM? Я пытался изменить размер окна в функции beat_extraction (), но он разрешает только числа менее 1 секунды, и когда я изменяю размер окна, кажется, что BPM меняется. Но если держать его в окне длительностью 1 секунда, он каждый раз выдает 30 символов.

Ниже приведен мой код:

from pyAudioAnalysis import audioBasicIO
from pyAudioAnalysis import ShortTermFeatures
from pyAudioAnalysis import MidTermFeatures
import matplotlib.pyplot as plt
import numpy as np
import warnings

file_name = "unity_alan.wav"

# Extract the Sampling Rate (Fs) and the Raw Signal Data (signal)
[Fs, signal] = audioBasicIO.read_audio_file(file_name)

# Uncomment if signal has two channels
# signal = signal[:,0]

# Function to Normalize the Signal
def normalize_signal(signal):
  signal = np.double(signal)
  return (signal - signal.mean()) / ((np.abs(signal)).max() + 0.0000000001)

# Short Term Features
# For each short-term window a set of features is extracted. This would result in a sequence of feature
# vectors stored in a np matrix (in this case Features_midTerm)

signal = normalize_signal(signal)

# Fs - frequency sampling Rate
print("The Sampling Freq: ",Fs)

# Total Signal Len
signal_len = len(signal)
print("Total Signal Length: ",signal_len)

# Total Time Len of Song in Seconds
len_of_song = signal_len / Fs
print("Total Song Time (s): ",len_of_song)

# Window Size in mseconds
windowSize_in_ms = 50
windowSize_in_s = windowSize_in_ms/1000
# Window Size in Samples
windowSize_in_samples = Fs * (windowSize_in_ms / 1000) #divide by 1000 to turn to seconds
print("Window Size (samples): ",windowSize_in_samples)

# Window Step in mseconds
wStep_in_ms = 25
# Window Step in Samples
wStep_in_samples = Fs * (wStep_in_ms / 1000) #divide by 1000 to turn to seconds
print("Window Step in Samples: ", wStep_in_samples)

# Oversampling Percentage
oversampling_Percentage = (wStep_in_ms / windowSize_in_ms) * 100
print("Oversampling Percentage (overlap of windows): ",oversampling_Percentage)

# Total Number of Windows in Signal
total_number_windows = signal_len / windowSize_in_samples
print("Total Number of Windows in Signal: ",total_number_windows)

# Total number of feature samples produced (windows/size * total windows)
feature_samples_points_total_cal = int(total_number_windows * (windowSize_in_ms/wStep_in_ms))
print("Calculated Total of Points Produced per Short Term Feature: ",feature_samples_points_total_cal)

# Extract features and their names. Each index has its own vector of features. The total number should be the same
# as the calculated total of points produced per feature.
Features_shortTerm, feature_names = ShortTermFeatures.feature_extraction(signal, Fs, windowSize_in_samples, wStep_in_samples)

# Exact Number of points in the Features
feature_samples_points_total_exact = len(Features_shortTerm[0])
print("Exact Total of Points Produced per Short Term Feature: ",feature_samples_points_total_exact)

# Mid-window (in seconds)
mid_window_seconds = int(1 * Fs)

# Mid-step (in seconds)
mid_step_seconds = int(1 * Fs)

# MID FEATURE Extraction
Features_midTerm, short_Features_ignore, mid_feature_names = MidTermFeatures.mid_feature_extraction(signal,Fs,mid_window_seconds,mid_step_seconds,windowSize_in_samples,wStep_in_samples)

# Exact Mid-Term Feature Total Number of Points
midTerm_features_total_points = len(Features_midTerm)
print("Exact Mid-Term Total Number of Feature Points: ",midTerm_features_total_points)

# Beats per min
# The Tempo of music determins the speed at which it is played (measured in BPM)
bpm,confidence_ratio = MidTermFeatures.beat_extraction(Features_shortTerm,1)
print("Beats per Minute (bpm): ",bpm)
print("Confidence ratio for BPM: ", confidence_ratio)

# Figure out why the BPM does not match the actual reading
# of 115 BPM. It is showing 30 BPM which is for sure wrong.

Вывод моего сценария выглядит следующим образом:

The Sampling Freq:  48000
Total Signal Length:  10992884
Total Song Time (s):  229.01841666666667
Window Size (samples):  2400.0
Window Step in Samples:  1200.0
Oversampling Percentage (overlap of windows):  50.0
Total Number of Windows in Signal:  4580.368333333333
Calculated Total of Points Produced per Short Term Feature:  9160
Exact Total of Points Produced per Short Term Feature:  9159
Exact Mid-Term Total Number of Feature Points:  136
Beats per Minute (bpm):  30.0
Confidence ratio for BPM:  1.0

Библиотека по определению функции выглядит следующим образом:

def beat_extraction(short_features, window_size, plot=False):
    """
    This function extracts an estimate of the beat rate for a musical signal.
    ARGUMENTS:
     - short_features:     a np array (n_feats x numOfShortTermWindows)
     - window_size:        window size in seconds
    RETURNS:
     - bpm:            estimates of beats per minute
     - ratio:          a confidence measure
    """

    # Features that are related to the beat tracking task:
    selected_features = [0, 1, 3, 4, 5, 6, 7, 8, 9, 10,
                         11, 12, 13, 14, 15, 16, 17, 18]

    max_beat_time = int(round(2.0 / window_size))
    hist_all = np.zeros((max_beat_time,))
    # for each feature
    for ii, i in enumerate(selected_features):
        # dif threshold (3 x Mean of Difs)
        dif_threshold = 2.0 * (np.abs(short_features[i, 0:-1] -
                                      short_features[i, 1::])).mean()
        if dif_threshold <= 0:
            dif_threshold = 0.0000000000000001
        # detect local maxima
        [pos1, _] = utilities.peakdet(short_features[i, :], dif_threshold)
        position_diffs = []
        # compute histograms of local maxima changes
        for j in range(len(pos1)-1):
            position_diffs.append(pos1[j+1]-pos1[j])
        histogram_times, histogram_edges = \
            np.histogram(position_diffs, np.arange(0.5, max_beat_time + 1.5))
        hist_centers = (histogram_edges[0:-1] + histogram_edges[1::]) / 2.0
        histogram_times = \
            histogram_times.astype(float) / short_features.shape[1]
        hist_all += histogram_times
        if plot:
            plt.subplot(9, 2, ii + 1)
            plt.plot(short_features[i, :], 'k')
            for k in pos1:
                plt.plot(k, short_features[i, k], 'k*')
            f1 = plt.gca()
            f1.axes.get_xaxis().set_ticks([])
            f1.axes.get_yaxis().set_ticks([])

    if plot:
        plt.show(block=False)
        plt.figure()

    # Get beat as the argmax of the agregated histogram:
    max_indices = np.argmax(hist_all)
    bpms = 60 / (hist_centers * window_size)
    bpm = bpms[max_indices]
    # ... and the beat ratio:
    ratio = hist_all[max_indices] / hist_all.sum()

    if plot:
        # filter out >500 beats from plotting:
        hist_all = hist_all[bpms < 500]
        bpms = bpms[bpms < 500]

        plt.plot(bpms, hist_all, 'k')
        plt.xlabel('Beats per minute')
        plt.ylabel('Freq Count')
        plt.show(block=True)

    return bpm, ratio
Добро пожаловать на сайт PullRequest, где вы можете задавать вопросы и получать ответы от других членов сообщества.
...