Примечание: для этой задачи я ограничен использованием numpy.
Я пытаюсь написать одну функцию, которая выполняет следующие задачи:
- Загрузка набора данных в массив numpy
- Разбиение набора данных на 5 «равных» (или как можно более близких по размеру) сгибов (фолдов, англ. folds)
- Для каждого сгиба убедиться, что данные для обучения и тестирования разделены в соотношении 80/20 соответственно
- Вот в чём подвох. Исходный входной набор данных «помечен»: последний столбец содержит метку класса. Сгибы должны сохранять то же распределение классов, что и во входном наборе.
Например, если на входе 100 образцов (строк) и есть два класса (обозначаются значением в последнем столбце), A и B, в соотношении 33% и 67%, то я должен создать 5 сгибов по 20 образцов каждый, где 6 или 7 образцов относятся к классу A, а 13 или 14 образцов — к классу B.
Именно этого мне никак не удаётся добиться. Я не знаю, как гарантировать, что каждый отдельный сгиб содержит правильное распределение классов.
Ниже приведён код, показывающий мои попытки на данный момент. Пока что я написал две функции: одна сообщает, каково распределение классов во входных данных, другая создаёт 5 сгибов. Однако мне нужно найти способ объединить их и создать 5 сгибов, которые сохраняют соответствующие распределения.
import numpy
def csv_to_array(file):
    """Load a comma-separated file of numbers into a 2-D float array.

    Args:
        file: Path of the CSV file to read.

    Returns:
        A ``numpy.ndarray`` of floats with one row per line of the file.
        A file holding a single line still yields a 2-D array of shape
        ``(1, n_columns)``.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If a field cannot be parsed as a float (raised by
            ``numpy.loadtxt``).
    """
    # The context manager closes the handle even when parsing fails.
    with open(file, 'r') as handle:
        # loadtxt already converts every field to float (or raises), so no
        # per-row conversion pass is needed; ndmin=2 keeps one-line files 2-D.
        return numpy.loadtxt(handle, delimiter=',', ndmin=2)
def class_distribution(dataset):
    """Print the percentage and the row count of every class in *dataset*.

    The class of a row is the value stored in its last column. For each
    distinct class, in the sorted order produced by ``numpy.unique``, two
    lines are printed: the percentage of rows in that class, then the
    number of rows in that class.

    Args:
        dataset: 2-D array-like whose last column holds the class label.

    Returns:
        None. The report is written to standard output.
    """
    dataset = numpy.asarray(dataset)
    num_total_rows = dataset.shape[0]
    labels = dataset[:, -1]
    for aclass in numpy.unique(labels):
        # One vectorised comparison per class replaces the nested Python
        # loop over every row; int() keeps the printed count a plain int.
        total = int(numpy.count_nonzero(labels == aclass))
        print(aclass, " Has: ", ((total / num_total_rows) * 100))
        print(aclass, " : ", total)
def create_folds(dataset, num_folds=5):
    """Split *dataset* into randomly drawn, class-stratified folds.

    The class of a row is the value in its last column. Rows are shuffled,
    grouped by class and then dealt out to the folds in turn, so that for
    every class the per-fold row counts differ by at most one, and the
    total fold sizes differ by at most one as well. The caller's array is
    left untouched.

    Args:
        dataset: 2-D ``numpy`` array (or array-like) whose last column is
            the class label.
        num_folds: Number of folds to build. Defaults to 5.

    Returns:
        When every fold has the same number of rows, a 3-D array of shape
        ``(num_folds, rows_per_fold, n_columns)``. Otherwise a 1-D object
        array of length ``num_folds`` whose items are the 2-D fold arrays.
        In both cases ``folds[i]`` is the i-th fold.

    Raises:
        ValueError: If *dataset* is not 2-D or *num_folds* is less than 1.
    """
    data = numpy.asarray(dataset)
    if data.ndim != 2:
        raise ValueError("dataset must be a 2-D array")
    if num_folds < 1:
        raise ValueError("num_folds must be at least 1")

    # Work on shuffled row indices so the input array is never reordered.
    shuffled = numpy.random.permutation(data.shape[0])
    # A stable sort groups the indices by class while keeping the random
    # order inside each class.
    by_class = shuffled[numpy.argsort(data[shuffled, -1], kind="stable")]

    folds = []
    for offset in range(num_folds):
        # Taking every num_folds-th index deals each class evenly.
        members = by_class[offset::num_folds]
        # Re-shuffle so rows inside a fold are not ordered by class.
        folds.append(data[numpy.random.permutation(members)])

    if len({len(fold) for fold in folds}) == 1:
        return numpy.asarray(folds)

    # Folds of unequal length cannot be stacked into a regular array;
    # store them one by one in an object array instead.
    ragged = numpy.empty(num_folds, dtype=object)
    for position, fold in enumerate(folds):
        ragged[position] = fold
    return ragged
def main():
    """Load the ecoli CSV, print its class distribution and create folds.

    The folds returned by ``create_folds`` are not used further here.
    """
    print("BEGINNING CFV")
    ecoli = csv_to_array('Classification/ecoli.csv')
    class_distribution(ecoli)
    create_folds(ecoli)


# Run only when executed as a script, so importing this module has no
# side effects.
if __name__ == "__main__":
    main()
Вот пример csv, с которым я работаю; последний столбец обозначает класс. Это модификация набора данных ecoli из репозитория машинного обучения UCI:
0.61,0.45,0.48,0.5,0.48,0.35,0.41,0
0.17,0.38,0.48,0.5,0.45,0.42,0.5,0
0.44,0.35,0.48,0.5,0.55,0.55,0.61,0
0.43,0.4,0.48,0.5,0.39,0.28,0.39,0
0.42,0.35,0.48,0.5,0.58,0.15,0.27,0
0.23,0.33,0.48,0.5,0.43,0.33,0.43,0
0.37,0.52,0.48,0.5,0.42,0.42,0.36,0
0.29,0.3,0.48,0.5,0.45,0.03,0.17,0
0.22,0.36,0.48,0.5,0.35,0.39,0.47,0
0.23,0.58,0.48,0.5,0.37,0.53,0.59,0
0.47,0.47,0.48,0.5,0.22,0.16,0.26,0
0.54,0.47,0.48,0.5,0.28,0.33,0.42,0
0.51,0.37,0.48,0.5,0.35,0.36,0.45,0
0.4,0.35,0.48,0.5,0.45,0.33,0.42,0
0.44,0.34,0.48,0.5,0.3,0.33,0.43,0
0.44,0.49,0.48,0.5,0.39,0.38,0.4,0
0.43,0.32,0.48,0.5,0.33,0.45,0.52,0
0.49,0.43,0.48,0.5,0.49,0.3,0.4,0
0.47,0.28,0.48,0.5,0.56,0.2,0.25,0
0.32,0.33,0.48,0.5,0.6,0.06,0.2,0
0.34,0.35,0.48,0.5,0.51,0.49,0.56,0
0.35,0.34,0.48,0.5,0.46,0.3,0.27,0
0.38,0.3,0.48,0.5,0.43,0.29,0.39,0
0.38,0.44,0.48,0.5,0.43,0.2,0.31,0
0.41,0.51,0.48,0.5,0.58,0.2,0.31,0
0.34,0.42,0.48,0.5,0.41,0.34,0.43,0
0.51,0.49,0.48,0.5,0.53,0.14,0.26,0
0.25,0.51,0.48,0.5,0.37,0.42,0.5,0
0.29,0.28,0.48,0.5,0.5,0.42,0.5,0
0.25,0.26,0.48,0.5,0.39,0.32,0.42,0
0.24,0.41,0.48,0.5,0.49,0.23,0.34,0
0.17,0.39,0.48,0.5,0.53,0.3,0.39,0
0.04,0.31,0.48,0.5,0.41,0.29,0.39,0
0.61,0.36,0.48,0.5,0.49,0.35,0.44,0
0.34,0.51,0.48,0.5,0.44,0.37,0.46,0
0.28,0.33,0.48,0.5,0.45,0.22,0.33,0
0.4,0.46,0.48,0.5,0.42,0.35,0.44,0
0.23,0.34,0.48,0.5,0.43,0.26,0.37,0
0.37,0.44,0.48,0.5,0.42,0.39,0.47,0
0,0.38,0.48,0.5,0.42,0.48,0.55,0
0.39,0.31,0.48,0.5,0.38,0.34,0.43,0
0.3,0.44,0.48,0.5,0.49,0.22,0.33,0
0.27,0.3,0.48,0.5,0.71,0.28,0.39,0
0.17,0.52,0.48,0.5,0.49,0.37,0.46,0
0.36,0.42,0.48,0.5,0.53,0.32,0.41,0
0.3,0.37,0.48,0.5,0.43,0.18,0.3,0
0.26,0.4,0.48,0.5,0.36,0.26,0.37,0
0.4,0.41,0.48,0.5,0.55,0.22,0.33,0
0.22,0.34,0.48,0.5,0.42,0.29,0.39,0
0.44,0.35,0.48,0.5,0.44,0.52,0.59,0
0.27,0.42,0.48,0.5,0.37,0.38,0.43,0
0.16,0.43,0.48,0.5,0.54,0.27,0.37,0
0.06,0.61,0.48,0.5,0.49,0.92,0.37,1
0.44,0.52,0.48,0.5,0.43,0.47,0.54,1
0.63,0.47,0.48,0.5,0.51,0.82,0.84,1
0.23,0.48,0.48,0.5,0.59,0.88,0.89,1
0.34,0.49,0.48,0.5,0.58,0.85,0.8,1
0.43,0.4,0.48,0.5,0.58,0.75,0.78,1
0.46,0.61,0.48,0.5,0.48,0.86,0.87,1
0.27,0.35,0.48,0.5,0.51,0.77,0.79,1
0.52,0.39,0.48,0.5,0.65,0.71,0.73,1
0.29,0.47,0.48,0.5,0.71,0.65,0.69,1
0.55,0.47,0.48,0.5,0.57,0.78,0.8,1
0.12,0.67,0.48,0.5,0.74,0.58,0.63,1
0.4,0.5,0.48,0.5,0.65,0.82,0.84,1
0.73,0.36,0.48,0.5,0.53,0.91,0.92,1
0.84,0.44,0.48,0.5,0.48,0.71,0.74,1
0.48,0.45,0.48,0.5,0.6,0.78,0.8,1
0.54,0.49,0.48,0.5,0.4,0.87,0.88,1
0.48,0.41,0.48,0.5,0.51,0.9,0.88,1
0.5,0.66,0.48,0.5,0.31,0.92,0.92,1
0.72,0.46,0.48,0.5,0.51,0.66,0.7,1
0.47,0.55,0.48,0.5,0.58,0.71,0.75,1
0.33,0.56,0.48,0.5,0.33,0.78,0.8,1
0.64,0.58,0.48,0.5,0.48,0.78,0.73,1
0.11,0.5,0.48,0.5,0.58,0.72,0.68,1
0.31,0.36,0.48,0.5,0.58,0.94,0.94,1
0.68,0.51,0.48,0.5,0.71,0.75,0.78,1
0.69,0.39,0.48,0.5,0.57,0.76,0.79,1
0.52,0.54,0.48,0.5,0.62,0.76,0.79,1
0.46,0.59,0.48,0.5,0.36,0.76,0.23,1
0.36,0.45,0.48,0.5,0.38,0.79,0.17,1
0,0.51,0.48,0.5,0.35,0.67,0.44,1
0.1,0.49,0.48,0.5,0.41,0.67,0.21,1
0.3,0.51,0.48,0.5,0.42,0.61,0.34,1
0.61,0.47,0.48,0.5,0,0.8,0.32,1
0.63,0.75,0.48,0.5,0.64,0.73,0.66,1
0.71,0.52,0.48,0.5,0.64,1,0.99,1
0.72,0.42,0.48,0.5,0.65,0.77,0.79,2
0.79,0.41,0.48,0.5,0.66,0.81,0.83,2
0.83,0.48,0.48,0.5,0.65,0.76,0.79,2
0.69,0.43,0.48,0.5,0.59,0.74,0.77,2
0.79,0.36,0.48,0.5,0.46,0.82,0.7,2
0.78,0.33,0.48,0.5,0.57,0.77,0.79,2
0.75,0.37,0.48,0.5,0.64,0.7,0.74,2
0.59,0.29,0.48,0.5,0.64,0.75,0.77,2
0.67,0.37,0.48,0.5,0.54,0.64,0.68,2
0.66,0.48,0.48,0.5,0.54,0.7,0.74,2
0.64,0.46,0.48,0.5,0.48,0.73,0.76,2
0.76,0.71,0.48,0.5,0.5,0.71,0.75,2
0.84,0.49,0.48,0.5,0.55,0.78,0.74,2
0.77,0.55,0.48,0.5,0.51,0.78,0.74,2
0.81,0.44,0.48,0.5,0.42,0.67,0.68,2
0.58,0.6,0.48,0.5,0.59,0.73,0.76,2
0.63,0.42,0.48,0.5,0.48,0.77,0.8,2
0.62,0.42,0.48,0.5,0.58,0.79,0.81,2
0.86,0.39,0.48,0.5,0.59,0.89,0.9,2
0.81,0.53,0.48,0.5,0.57,0.87,0.88,2
0.87,0.49,0.48,0.5,0.61,0.76,0.79,2
0.47,0.46,0.48,0.5,0.62,0.74,0.77,2
0.76,0.41,0.48,0.5,0.5,0.59,0.62,2
0.7,0.53,0.48,0.5,0.7,0.86,0.87,2
0.64,0.45,0.48,0.5,0.67,0.61,0.66,2
0.81,0.52,0.48,0.5,0.57,0.78,0.8,2
0.73,0.26,0.48,0.5,0.57,0.75,0.78,2
0.49,0.61,1,0.5,0.56,0.71,0.74,2
0.88,0.42,0.48,0.5,0.52,0.73,0.75,2
0.84,0.54,0.48,0.5,0.75,0.92,0.7,2
0.63,0.51,0.48,0.5,0.64,0.72,0.76,2
0.86,0.55,0.48,0.5,0.63,0.81,0.83,2
0.79,0.54,0.48,0.5,0.5,0.66,0.68,2
0.57,0.38,0.48,0.5,0.06,0.49,0.33,2
0.78,0.44,0.48,0.5,0.45,0.73,0.68,2
0.78,0.68,0.48,0.5,0.83,0.4,0.29,3
0.63,0.69,0.48,0.5,0.65,0.41,0.28,3
0.67,0.88,0.48,0.5,0.73,0.5,0.25,3
0.61,0.75,0.48,0.5,0.51,0.33,0.33,3
0.67,0.84,0.48,0.5,0.74,0.54,0.37,3
0.74,0.9,0.48,0.5,0.57,0.53,0.29,3
0.73,0.84,0.48,0.5,0.86,0.58,0.29,3
0.75,0.76,0.48,0.5,0.83,0.57,0.3,3
0.77,0.57,0.48,0.5,0.88,0.53,0.2,3
0.74,0.78,0.48,0.5,0.75,0.54,0.15,3
0.68,0.76,0.48,0.5,0.84,0.45,0.27,3
0.56,0.68,0.48,0.5,0.77,0.36,0.45,3
0.65,0.51,0.48,0.5,0.66,0.54,0.33,3
0.52,0.81,0.48,0.5,0.72,0.38,0.38,3
0.64,0.57,0.48,0.5,0.7,0.33,0.26,3
0.6,0.76,1,0.5,0.77,0.59,0.52,3
0.69,0.59,0.48,0.5,0.77,0.39,0.21,3
0.63,0.49,0.48,0.5,0.79,0.45,0.28,3
0.71,0.71,0.48,0.5,0.68,0.43,0.36,3
0.68,0.63,0.48,0.5,0.73,0.4,0.3,3
0.74,0.49,0.48,0.5,0.42,0.54,0.36,4
0.7,0.61,0.48,0.5,0.56,0.52,0.43,4
0.66,0.86,0.48,0.5,0.34,0.41,0.36,4
0.73,0.78,0.48,0.5,0.58,0.51,0.31,4
0.65,0.57,0.48,0.5,0.47,0.47,0.51,4
0.72,0.86,0.48,0.5,0.17,0.55,0.21,4
0.67,0.7,0.48,0.5,0.46,0.45,0.33,4
0.67,0.81,0.48,0.5,0.54,0.49,0.23,4
0.67,0.61,0.48,0.5,0.51,0.37,0.38,4
0.63,1,0.48,0.5,0.35,0.51,0.49,4
0.57,0.59,0.48,0.5,0.39,0.47,0.33,4
0.71,0.71,0.48,0.5,0.4,0.54,0.39,4
0.66,0.74,0.48,0.5,0.31,0.38,0.43,4
0.67,0.81,0.48,0.5,0.25,0.42,0.25,4
0.64,0.72,0.48,0.5,0.49,0.42,0.19,4
0.68,0.82,0.48,0.5,0.38,0.65,0.56,4
0.32,0.39,0.48,0.5,0.53,0.28,0.38,4
0.7,0.64,0.48,0.5,0.47,0.51,0.47,4
0.63,0.57,0.48,0.5,0.49,0.7,0.2,4
0.69,0.65,0.48,0.5,0.63,0.48,0.41,4
0.43,0.59,0.48,0.5,0.52,0.49,0.56,4
0.74,0.56,0.48,0.5,0.47,0.68,0.3,4
0.71,0.57,0.48,0.5,0.48,0.35,0.32,4
0.61,0.6,0.48,0.5,0.44,0.39,0.38,4
0.59,0.61,0.48,0.5,0.42,0.42,0.37,4
0.74,0.74,0.48,0.5,0.31,0.53,0.52,4