Вернуть образец распределения массива numpy на основе класса - PullRequest
1 голос
/ 09 марта 2019

Как примечание: я женат на numpy для этой задачи.

Я пытаюсь написать одну функцию, которая выполняет следующие задачи:

  1. Загрузка набора данных вnumpy array
  2. Разбейте набор данных на 5 "равных" (или как можно более одинаковых) сгибов
  3. Для каждого сгиба убедитесь, что данные для обучения и тестирования разделены на 80/20,соответственно
  4. Вот подвох.Исходный входной набор данных «помечен», последний столбец содержит классификацию.Сгибы должны поддерживать то же распределение размера класса, что и у входного набора.

Например, если у меня есть input=100 samples(rows), и есть два класса (обозначаются значением в последнем столбце), A и B, с разбивкой на 33% и 67%, я должен затем создать 5 сгибов, содержащих по 20 образцов каждый, где 6 или 7 образцов - это A, а 13 или 14 образцов - это B.

Я изо всех сил пытаюсь достичь.Я не знаю, как правильно убедиться, что сам FOLD содержит правильное распределение выборки классов.

У меня есть следующие коды для отображения моих попыток до сих пор.До сих пор я написал две функции, которые могут сообщить мне, каково мое распределение для моего входного класса, и способны создать 5 сгибов.Однако мне нужно найти способ объединить их и создать 5 сгибов, которые поддерживают соответствующие распределения.

import numpy

def csv_to_array(file):
    # Open the file, and load it in delimiting on the ',' for a comma separated value file
    data = open(file, 'r')
    data = numpy.loadtxt(data, delimiter=',')

    # Loop through the data in the array
    for index in range(len(data)):
        # Utilize a try catch to try and convert to float, if it can't convert to float, converts to 0
        try:
            data[index] = [float(x) for x in data[index]]
        except Exception:
            data[index] = 0
        except ValueError:
            data[index] = 0

    # Return the now type-formatted data
    return data


def class_distribution(dataset):
    dataset = numpy.asarray(dataset)
    num_total_rows = dataset.shape[0]
    num_columns = dataset.shape[1]
    classes = dataset[:,num_columns-1]
    classes = numpy.unique(classes)

    for aclass in classes:
        total = 0
        for row in dataset:
            if numpy.array_equal(aclass, row[-1]):
                total = total + 1
            else:
                continue
        print(aclass, " Has: ", ((total/num_total_rows) * 100))
        print(aclass, " : ", total)


def create_folds(dataset):
    # print("DATASET", dataset)
    numpy.random.shuffle(dataset)
    num_rows = dataset.shape[0]
    split_mark = int(num_rows / 5)
    folds = []
    fold_sets = []
    temp1 = dataset[:split_mark]
    # print("TEMP1", temp1)
    temp2 = dataset[split_mark:split_mark*2]
    # print("TEMP2", temp2)
    temp3 = dataset[split_mark*2:split_mark*3]
    # print("TEMP3", temp3)
    temp4 = dataset[split_mark*3:split_mark*4]
    # print("TEMP4", temp4)
    temp5 = dataset[split_mark*4:]
    # print("TEMP5", temp5)
    folds.append(temp1)
    folds.append(temp2)
    folds.append(temp3)
    folds.append(temp4)
    folds.append(temp5)
    folds = numpy.asarray(folds)

    # print(folds)

    return folds


def main():
    print("BEGINNING CFV")
    ecoli = csv_to_array('Classification/ecoli.csv')
    # print(len(ecoli))
    class_distribution(ecoli)
    create_folds(ecoli)

main()

Вот пример csv, с которым я работаю, с последним столбцом, обозначающим класс.Это модификация набора данных ecoli из хранилища машинного обучения UCI:

0.61,0.45,0.48,0.5,0.48,0.35,0.41,0
0.17,0.38,0.48,0.5,0.45,0.42,0.5,0
0.44,0.35,0.48,0.5,0.55,0.55,0.61,0
0.43,0.4,0.48,0.5,0.39,0.28,0.39,0
0.42,0.35,0.48,0.5,0.58,0.15,0.27,0
0.23,0.33,0.48,0.5,0.43,0.33,0.43,0
0.37,0.52,0.48,0.5,0.42,0.42,0.36,0
0.29,0.3,0.48,0.5,0.45,0.03,0.17,0
0.22,0.36,0.48,0.5,0.35,0.39,0.47,0
0.23,0.58,0.48,0.5,0.37,0.53,0.59,0
0.47,0.47,0.48,0.5,0.22,0.16,0.26,0
0.54,0.47,0.48,0.5,0.28,0.33,0.42,0
0.51,0.37,0.48,0.5,0.35,0.36,0.45,0
0.4,0.35,0.48,0.5,0.45,0.33,0.42,0
0.44,0.34,0.48,0.5,0.3,0.33,0.43,0
0.44,0.49,0.48,0.5,0.39,0.38,0.4,0
0.43,0.32,0.48,0.5,0.33,0.45,0.52,0
0.49,0.43,0.48,0.5,0.49,0.3,0.4,0
0.47,0.28,0.48,0.5,0.56,0.2,0.25,0
0.32,0.33,0.48,0.5,0.6,0.06,0.2,0
0.34,0.35,0.48,0.5,0.51,0.49,0.56,0
0.35,0.34,0.48,0.5,0.46,0.3,0.27,0
0.38,0.3,0.48,0.5,0.43,0.29,0.39,0
0.38,0.44,0.48,0.5,0.43,0.2,0.31,0
0.41,0.51,0.48,0.5,0.58,0.2,0.31,0
0.34,0.42,0.48,0.5,0.41,0.34,0.43,0
0.51,0.49,0.48,0.5,0.53,0.14,0.26,0
0.25,0.51,0.48,0.5,0.37,0.42,0.5,0
0.29,0.28,0.48,0.5,0.5,0.42,0.5,0
0.25,0.26,0.48,0.5,0.39,0.32,0.42,0
0.24,0.41,0.48,0.5,0.49,0.23,0.34,0
0.17,0.39,0.48,0.5,0.53,0.3,0.39,0
0.04,0.31,0.48,0.5,0.41,0.29,0.39,0
0.61,0.36,0.48,0.5,0.49,0.35,0.44,0
0.34,0.51,0.48,0.5,0.44,0.37,0.46,0
0.28,0.33,0.48,0.5,0.45,0.22,0.33,0
0.4,0.46,0.48,0.5,0.42,0.35,0.44,0
0.23,0.34,0.48,0.5,0.43,0.26,0.37,0
0.37,0.44,0.48,0.5,0.42,0.39,0.47,0
0,0.38,0.48,0.5,0.42,0.48,0.55,0
0.39,0.31,0.48,0.5,0.38,0.34,0.43,0
0.3,0.44,0.48,0.5,0.49,0.22,0.33,0
0.27,0.3,0.48,0.5,0.71,0.28,0.39,0
0.17,0.52,0.48,0.5,0.49,0.37,0.46,0
0.36,0.42,0.48,0.5,0.53,0.32,0.41,0
0.3,0.37,0.48,0.5,0.43,0.18,0.3,0
0.26,0.4,0.48,0.5,0.36,0.26,0.37,0
0.4,0.41,0.48,0.5,0.55,0.22,0.33,0
0.22,0.34,0.48,0.5,0.42,0.29,0.39,0
0.44,0.35,0.48,0.5,0.44,0.52,0.59,0
0.27,0.42,0.48,0.5,0.37,0.38,0.43,0
0.16,0.43,0.48,0.5,0.54,0.27,0.37,0
0.06,0.61,0.48,0.5,0.49,0.92,0.37,1
0.44,0.52,0.48,0.5,0.43,0.47,0.54,1
0.63,0.47,0.48,0.5,0.51,0.82,0.84,1
0.23,0.48,0.48,0.5,0.59,0.88,0.89,1
0.34,0.49,0.48,0.5,0.58,0.85,0.8,1
0.43,0.4,0.48,0.5,0.58,0.75,0.78,1
0.46,0.61,0.48,0.5,0.48,0.86,0.87,1
0.27,0.35,0.48,0.5,0.51,0.77,0.79,1
0.52,0.39,0.48,0.5,0.65,0.71,0.73,1
0.29,0.47,0.48,0.5,0.71,0.65,0.69,1
0.55,0.47,0.48,0.5,0.57,0.78,0.8,1
0.12,0.67,0.48,0.5,0.74,0.58,0.63,1
0.4,0.5,0.48,0.5,0.65,0.82,0.84,1
0.73,0.36,0.48,0.5,0.53,0.91,0.92,1
0.84,0.44,0.48,0.5,0.48,0.71,0.74,1
0.48,0.45,0.48,0.5,0.6,0.78,0.8,1
0.54,0.49,0.48,0.5,0.4,0.87,0.88,1
0.48,0.41,0.48,0.5,0.51,0.9,0.88,1
0.5,0.66,0.48,0.5,0.31,0.92,0.92,1
0.72,0.46,0.48,0.5,0.51,0.66,0.7,1
0.47,0.55,0.48,0.5,0.58,0.71,0.75,1
0.33,0.56,0.48,0.5,0.33,0.78,0.8,1
0.64,0.58,0.48,0.5,0.48,0.78,0.73,1
0.11,0.5,0.48,0.5,0.58,0.72,0.68,1
0.31,0.36,0.48,0.5,0.58,0.94,0.94,1
0.68,0.51,0.48,0.5,0.71,0.75,0.78,1
0.69,0.39,0.48,0.5,0.57,0.76,0.79,1
0.52,0.54,0.48,0.5,0.62,0.76,0.79,1
0.46,0.59,0.48,0.5,0.36,0.76,0.23,1
0.36,0.45,0.48,0.5,0.38,0.79,0.17,1
0,0.51,0.48,0.5,0.35,0.67,0.44,1
0.1,0.49,0.48,0.5,0.41,0.67,0.21,1
0.3,0.51,0.48,0.5,0.42,0.61,0.34,1
0.61,0.47,0.48,0.5,0,0.8,0.32,1
0.63,0.75,0.48,0.5,0.64,0.73,0.66,1
0.71,0.52,0.48,0.5,0.64,1,0.99,1
0.72,0.42,0.48,0.5,0.65,0.77,0.79,2
0.79,0.41,0.48,0.5,0.66,0.81,0.83,2
0.83,0.48,0.48,0.5,0.65,0.76,0.79,2
0.69,0.43,0.48,0.5,0.59,0.74,0.77,2
0.79,0.36,0.48,0.5,0.46,0.82,0.7,2
0.78,0.33,0.48,0.5,0.57,0.77,0.79,2
0.75,0.37,0.48,0.5,0.64,0.7,0.74,2
0.59,0.29,0.48,0.5,0.64,0.75,0.77,2
0.67,0.37,0.48,0.5,0.54,0.64,0.68,2
0.66,0.48,0.48,0.5,0.54,0.7,0.74,2
0.64,0.46,0.48,0.5,0.48,0.73,0.76,2
0.76,0.71,0.48,0.5,0.5,0.71,0.75,2
0.84,0.49,0.48,0.5,0.55,0.78,0.74,2
0.77,0.55,0.48,0.5,0.51,0.78,0.74,2
0.81,0.44,0.48,0.5,0.42,0.67,0.68,2
0.58,0.6,0.48,0.5,0.59,0.73,0.76,2
0.63,0.42,0.48,0.5,0.48,0.77,0.8,2
0.62,0.42,0.48,0.5,0.58,0.79,0.81,2
0.86,0.39,0.48,0.5,0.59,0.89,0.9,2
0.81,0.53,0.48,0.5,0.57,0.87,0.88,2
0.87,0.49,0.48,0.5,0.61,0.76,0.79,2
0.47,0.46,0.48,0.5,0.62,0.74,0.77,2
0.76,0.41,0.48,0.5,0.5,0.59,0.62,2
0.7,0.53,0.48,0.5,0.7,0.86,0.87,2
0.64,0.45,0.48,0.5,0.67,0.61,0.66,2
0.81,0.52,0.48,0.5,0.57,0.78,0.8,2
0.73,0.26,0.48,0.5,0.57,0.75,0.78,2
0.49,0.61,1,0.5,0.56,0.71,0.74,2
0.88,0.42,0.48,0.5,0.52,0.73,0.75,2
0.84,0.54,0.48,0.5,0.75,0.92,0.7,2
0.63,0.51,0.48,0.5,0.64,0.72,0.76,2
0.86,0.55,0.48,0.5,0.63,0.81,0.83,2
0.79,0.54,0.48,0.5,0.5,0.66,0.68,2
0.57,0.38,0.48,0.5,0.06,0.49,0.33,2
0.78,0.44,0.48,0.5,0.45,0.73,0.68,2
0.78,0.68,0.48,0.5,0.83,0.4,0.29,3
0.63,0.69,0.48,0.5,0.65,0.41,0.28,3
0.67,0.88,0.48,0.5,0.73,0.5,0.25,3
0.61,0.75,0.48,0.5,0.51,0.33,0.33,3
0.67,0.84,0.48,0.5,0.74,0.54,0.37,3
0.74,0.9,0.48,0.5,0.57,0.53,0.29,3
0.73,0.84,0.48,0.5,0.86,0.58,0.29,3
0.75,0.76,0.48,0.5,0.83,0.57,0.3,3
0.77,0.57,0.48,0.5,0.88,0.53,0.2,3
0.74,0.78,0.48,0.5,0.75,0.54,0.15,3
0.68,0.76,0.48,0.5,0.84,0.45,0.27,3
0.56,0.68,0.48,0.5,0.77,0.36,0.45,3
0.65,0.51,0.48,0.5,0.66,0.54,0.33,3
0.52,0.81,0.48,0.5,0.72,0.38,0.38,3
0.64,0.57,0.48,0.5,0.7,0.33,0.26,3
0.6,0.76,1,0.5,0.77,0.59,0.52,3
0.69,0.59,0.48,0.5,0.77,0.39,0.21,3
0.63,0.49,0.48,0.5,0.79,0.45,0.28,3
0.71,0.71,0.48,0.5,0.68,0.43,0.36,3
0.68,0.63,0.48,0.5,0.73,0.4,0.3,3
0.74,0.49,0.48,0.5,0.42,0.54,0.36,4
0.7,0.61,0.48,0.5,0.56,0.52,0.43,4
0.66,0.86,0.48,0.5,0.34,0.41,0.36,4
0.73,0.78,0.48,0.5,0.58,0.51,0.31,4
0.65,0.57,0.48,0.5,0.47,0.47,0.51,4
0.72,0.86,0.48,0.5,0.17,0.55,0.21,4
0.67,0.7,0.48,0.5,0.46,0.45,0.33,4
0.67,0.81,0.48,0.5,0.54,0.49,0.23,4
0.67,0.61,0.48,0.5,0.51,0.37,0.38,4
0.63,1,0.48,0.5,0.35,0.51,0.49,4
0.57,0.59,0.48,0.5,0.39,0.47,0.33,4
0.71,0.71,0.48,0.5,0.4,0.54,0.39,4
0.66,0.74,0.48,0.5,0.31,0.38,0.43,4
0.67,0.81,0.48,0.5,0.25,0.42,0.25,4
0.64,0.72,0.48,0.5,0.49,0.42,0.19,4
0.68,0.82,0.48,0.5,0.38,0.65,0.56,4
0.32,0.39,0.48,0.5,0.53,0.28,0.38,4
0.7,0.64,0.48,0.5,0.47,0.51,0.47,4
0.63,0.57,0.48,0.5,0.49,0.7,0.2,4
0.69,0.65,0.48,0.5,0.63,0.48,0.41,4
0.43,0.59,0.48,0.5,0.52,0.49,0.56,4
0.74,0.56,0.48,0.5,0.47,0.68,0.3,4
0.71,0.57,0.48,0.5,0.48,0.35,0.32,4
0.61,0.6,0.48,0.5,0.44,0.39,0.38,4
0.59,0.61,0.48,0.5,0.42,0.42,0.37,4
0.74,0.74,0.48,0.5,0.31,0.53,0.52,4

1 Ответ

0 голосов
/ 29 апреля 2019

Получив совет @AlexL, я посмотрел код StraifiedKFold и разработал модифицированную версию со следующими двумя функциями:

# This function returns the list of classes, and their associated weights (i.e. distributions)
def class_distribution(dataset):
    dataset = numpy.asarray(dataset)
    num_total_rows = dataset.shape[0]
    num_columns = dataset.shape[1]
    classes = dataset[:, num_columns - 1]
    classes = numpy.unique(classes)
    class_weights = []

    # Loop through the classes one by one
    for aclass in classes:
        total = 0
        weight = 0
        for row in dataset:
            if numpy.array_equal(aclass, row[-1]):
                total = total + 1
            else:
                continue
        weight = float((total / num_total_rows))
        class_weights.append(weight)

    class_weights = numpy.asarray(class_weights)
    return classes, class_weights

# This functions performs k cross fold validation for classification
def cross_fold_validation_classification(dataset, k):
    temp_dataset = numpy.asarray(dataset)
    classes, class_weights = class_distribution(temp_dataset)
    total_num_rows = temp_dataset.shape[0]
    data = numpy.copy(temp_dataset)
    total_fold_array = []

    for _ in range(k):
        curr_fold_array = []

        # Loop through each class and its associated weight
        for a_class, a_class_weight in zip(classes, class_weights):
            numpy.random.shuffle(data)
            num_added = 0
            num_to_add = float((((a_class_weight * total_num_rows)) / k))
            tot = 0
            for row in data:
                curr = row[-1]
                if num_added >= num_to_add:
                    break
                else:
                    if (a_class == curr):
                        curr_fold_array.append(row)
                        num_added = num_added + 1
                        numpy.delete(data, tot)
                tot = tot + 1
        total_fold_array.append(curr_fold_array)

return total_fold_array
...