Я хочу адаптировать алгоритм дерева решений, который работает для двух тегов: True / False, чтобы тот мог выбрать несколько тегов.
data = [({'Age': 1, 'Quartier': 'A', 'Income': 10}, 'PartyB'),
({'Age': 2, 'Quartier': 'B', 'Income': 20}, "PartyA"),
({'Age': 3, 'Quartier': 'C', 'Income': 30}, 'ABS'),
({'Age': 4, 'Quartier': 'D', 'Income': 40}, 'ABS')]
Я хочу быть в состоянии предсказать этикет следующего: {'Age': 1, 'Quartier': 'A', 'Income': 15}
Тем не менее, я
Моя попытка
def classify(tree, input):
"""classify the input using the given decision tree"""
if tree in [True,False]: # here I need to change to adapt it to multiple categories.
return tree
# otherwise this tree consists of an attribute to split on
# and a dictionnary whose keys are values of that attribute
# and whose values of are subtrees to consider next
print("tree: ", tree)
print("df.TL_Segment.unique(): ",df.TL_Segment.unique())
attribute, subtree_dict = tree
subtree_key = input.get(attribute) # None if input is missing in attribute
if subtree_key not in subtree_dict: # if no subtree for key
subtree_key = None
subtree = subtree_dict[subtree_key]
return classify(subtree, input)
from functools import partial
from statistics import mode
def build_tree_id3(inputs, split_candidates = None):
# if this is our first pass
# all keys of the first input are split candidates
if split_candidates is None:
split_candidates = inputs[0][0].keys()
# count different classes
num_inputs = len(inputs)
num_different_classes = len(set([label for item, label in inputs if label]))
if num_different_classes == 1: return False # only one class? Return this one
if num_different_classes == num_inputs: return False# all classes are different?
if not split_candidates:
return max(set(inputs), key = inputs.count)
# otherwise, split on the best attribute
best_attribute = min(split_candidates, key = partial(partition_entropy_by,inputs))
partitions = partition_by(inputs, best_attribute)
new_candidates = [a for a in split_candidates
if a!= best_attribute]
#recursively build the subtrees
subtrees = {attribute_value: build_tree_id3(subset, new_candidates)
for attribute_value,
subset in partitions.items()}
# subtrees[None] = max(inputs, key=collections.Counter(inputs).get) # if tree is empty we give the most frequent one
subtrees[None] = None
return(best_attribute, subtrees)
Но когда я пытаюсь, я получить «False», а не категории:
tree = build_tree_id3(data)
возвращает:
best_attribute: Age
subtrees: {1: False, 2: False, 3: False, 4: False, None: None}
pero classify(tree, {'Age': 1, 'Quartier': 'A', 'Income': 10})
devuelve:
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-462-86da50b80bb6> in <module>
----> 1 classify(tree, {'Age': 1, 'Quartier': 'A', 'Income': 10})
<ipython-input-453-9dce8ec72645> in classify(tree, input)
15
16 subtree = subtree_dict[subtree_key]
---> 17 return classify(subtree, input)
18
<ipython-input-453-9dce8ec72645> in classify(tree, input)
7 # and a dictionnary whose keys are values of that attribute
8 # and whose values of are subtrees to consider next
----> 9 attribute, subtree_dict = tree
10
11 subtree_key = input.get(attribute) # None if input is missing in attribute
TypeError: cannot unpack non-iterable bool object
Действительно, tree
- это False
здесь.
Приложение: здесь используется другая функция
import math
import collections
def entropy(class_probabilities):
"""given a list of class probabilities, compute the entropy"""
return sum(-p * math.log(p,2)
for p in class_probabilities
if p)
def class_probabilities(labels):
total_count = len(labels)
return [count / total_count
for count in collections.Counter(labels).values()]
def data_entropy(labeled_data):
labels = [label for _,label in labeled_data]
probabilities = class_probabilities(labels)
return entropy(probabilities)
def partition_entropy(subsets):
"""find entropy from this partition of data into subsets
subsets is a list of lists of labeled data"""
total_count =sum(len(subset) for subset in subsets)
return sum(data_entropy(subset)* len(subset)/total_count
for subset in subsets)
def partition_by(inputs, attribute):
"""each input is a pair (attriute_dict,label).
returns a dict : attribute_value -> inputs"""
groups = collections.defaultdict(list)
for input in inputs:
key = input[0][attribute] # get the value of the specified attribute
groups[key].append(input) # then add this input to the correct list
return groups
def partition_entropy_by(inputs, attribute):
"""computes the entropy corresponding to the given partition"""
partitions = partition_by(inputs, attribute)
return partition_entropy(partitions.values())