I am trying to accomplish the following task:
For a given column of data (stored as a numpy array), "bin" the data greedily, where I test the current value and the next one in order to compute the entropy of that candidate bin.
The pseudocode would look like this:
split_data(feature):
    BestValues = 0
    BestGain = 0
    For Each Value in Feature:
        Calculate CurrentGain As InformationGain(Entropy(Feature) - Entropy(Value + Next Value))
        If CurrentGain > BestGain:
            Set BestValues = Value, Next Value
            Set BestGain = CurrentGain
    return BestValues
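In Python, I imagine that pseudocode translating into something like the sketch below. It reuses the entropy() function from my code further down; pair_entropy is a hypothetical helper for the "Entropy(Value + Next Value)" step (my current guess at it is sketched at the end of this post):

def split_data(dataset, feature):
    # Untested sketch of the pseudocode above: greedily scan adjacent
    # pairs of values in the chosen feature column.
    column = dataset[:, feature]
    best_values = None
    best_gain = 0.0
    for i in range(len(column) - 1):
        # pair_entropy is a hypothetical helper, sketched at the end of the post
        current_gain = entropy(dataset) - pair_entropy(dataset, feature, column[i], column[i + 1])
        if current_gain > best_gain:
            best_values = (column[i], column[i + 1])
            best_gain = current_gain
    return best_values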
Currently I have Python code that looks like this:
import math
import numpy

# This function finds the total entropy for a given dataset
def entropy(dataset):
    # Declare variables
    total_entropy = 0
    # Determine the classes and the number of items in each class
    classes = numpy.unique(dataset[:, -1])
    # Loop through each "class", or label
    for aclass in classes:
        # Create temp variable
        currFreq = 0
        # Loop through each row in the dataset
        for row in dataset:
            # If that row has the same label as the current class, increment the frequency
            if aclass == row[-1]:
                currFreq = currFreq + 1
        # The current probability is the # of occurrences / total occurrences
        currProb = currFreq / len(dataset)
        # A zero-frequency class contributes nothing; otherwise use the entropy formula
        if currFreq > 0:
            total_entropy = total_entropy + (-currProb * math.log(currProb, 2))
    # Return the total entropy
    return total_entropy
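As a sanity check, this behaves as expected on a toy array where the labels live in the last column:

# Two classes, 50/50 split, so the entropy should be exactly 1.0
toy = numpy.array([[1, 0],
                   [2, 0],
                   [3, 1],
                   [4, 1]])
print(entropy(toy))  # 1.0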
# This function gets the weighted entropy for a single attribute
def entropy_by_attribute(dataset, feature):
    # The attribute is the specific feature (column) of the dataset
    attribute = dataset[:, feature]
    # The target_variables are the unique class labels
    target_variables = numpy.unique(dataset[:, -1])
    # The unique values in the column we are evaluating
    variables = numpy.unique(attribute)
    # The entropy for the attribute in question
    entropy_attribute = 0
    # Loop through each of the possible values
    for variable in variables:
        denominator = 0
        entropy_each_feature = 0
        # For every entry in the column, count how often this value occurs
        for value in attribute:
            if value == variable:
                denominator = denominator + 1
        # Now loop through each class
        for target_variable in target_variables:
            numerator = 0
            # Count the rows where the feature equals the value we are
            # evaluating and the label equals the class we are evaluating
            for row in dataset:
                if row[feature] == variable and row[-1] == target_variable:
                    numerator = numerator + 1
            # Use eps to protect against divide-by-zero and log(0)
            fraction = numerator / (denominator + numpy.finfo(float).eps)
            entropy_each_feature = entropy_each_feature + (-fraction * math.log(fraction + numpy.finfo(float).eps, 2))
        # Weight the entropy of this value by how often the value occurs
        big_fraction = denominator / len(dataset)
        entropy_attribute = entropy_attribute + (big_fraction * entropy_each_feature)
    # Return that entropy
    return entropy_attribute
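Continuing the sanity checks: a feature that perfectly separates the classes should leave (almost) no entropy behind:

# Feature 0 perfectly predicts the label, so the weighted entropy is ~0
toy = numpy.array([[1, 0],
                   [1, 0],
                   [2, 1],
                   [2, 1]])
print(entropy_by_attribute(toy, 0))  # ~0.0 (tiny residue from the eps guard)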
# This function calculates the information gain
def infogain(dataset, feature):
    # Grab the entropy of the total dataset
    total_entropy = entropy(dataset)
    # Grab the weighted entropy for the feature being evaluated
    feature_entropy = entropy_by_attribute(dataset, feature)
    # The gain is the reduction in entropy from splitting on the feature
    gain = total_entropy - feature_entropy
    # Return the infogain
    return gain
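Putting the pieces together on a small dataset (two features, label in the last column):

toy = numpy.array([[1, 5, 0],
                   [1, 6, 0],
                   [2, 5, 1],
                   [2, 6, 1]])
print(infogain(toy, 0))  # ~1.0: feature 0 perfectly splits the labels
print(infogain(toy, 1))  # ~0.0: feature 1 carries no information about the label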
However, I am not sure how to do the following:
- For a feature, grab its total entropy
- For an individual feature, determine the entropy using a binning method where I test two values at a time (as in the sketch after this list)
I cannot logically picture how to design the code for 1 and 2, and I am struggling. I will keep updating with the progress I make.
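For item 2, my rough (untested) idea so far is to treat the two values being tested as one bin, select the matching rows, and reuse entropy() on that subset, though I am not sure this is the right approach:

def pair_entropy(dataset, feature, value_a, value_b):
    # Hypothetical helper: treat the rows whose feature value is value_a
    # or value_b as a single bin and compute the label entropy inside it.
    mask = (dataset[:, feature] == value_a) | (dataset[:, feature] == value_b)
    bin_rows = dataset[mask]
    if len(bin_rows) == 0:
        return 0.0
    return entropy(bin_rows)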