用Python计算信息增益

时间:2020-10-14 14:50:25

标签: python tree entropy

所以我得到了一个当前正在尝试拆分的数据集,然后计算其熵,然后得到信息增益。我认为我已经正确实现了它,但是运行它时却没有得到正确的输出,因为这表明我的索引编制不正确?有谁知道这是怎么回事,或者有其他方法可以对此进行编码?

get_ipython().run_line_magic('matplotlib', 'inline')
import numpy as np

from sklearn import datasets as ds
from sklearn.decomposition import PCA
from sklearn import preprocessing

import matplotlib.pyplot as plt

data_all = ds.load_breast_cancer()

x = data_all.data
y = data_all.target

y_names = data_all.target_names 

feature_names = data_all.feature_names

split = int(x.shape[0] * 0.6)

x_train = x[:split,:]
y_train = y[:split]

x_test = x[split:,:]
y_test = y[split:]

print('Training set size:', x_train.shape[0])
print('Test set size:', x_test.shape[0])

def calculate_entropy(y):
    
    entropy = 0.0
    n = len(y)
    
    counts = np.bincount(y)
    counts = counts.astype(float)
    
    div = counts / n
    div = div.astype(float)
    
    for element in div:
        if element == 0.0:
            entropy = 0.0
        else:
            entropy -= element * np.log2(element)
    
    return entropy

print("The entropy of 'y' is: {:.4f}".format(calculate_entropy(y)))
    
def find_split(x, y):
    """Given a dataset and its target values, this finds the optimal combination
    of feature and split point that gives the maximum information gain."""
    
    # Need the starting entropy so we can measure improvement...
    start_entropy = calculate_entropy(y)
    
    # Best thus far, initialised to a dud that will be replaced immediately...
    best = {'infogain' : -np.inf}
    # Loop every possible split of every dimension...
    for i in range(x.shape[1]):
        for split in np.unique(x[:,i]):
            
            left_indices = x[i] <= split
            right_indices = x[i] > split
            
            total_left = len(x[i][x[i] <= split])
            total_right = len(x[i][x[i] > split])
            total_of_everything = total_left + total_right
            
            fraction_of_right = total_right/total_of_everything
            fraction_of_left = total_left/total_of_everything
            
            entropy_of_left_split = calculate_entropy(left_indices)
            entropy_of_right_split = calculate_entropy(right_indices)
            
            infogain = start_entropy - (entropy_of_left_split*fraction_of_left) - (entropy_of_right_split*fraction_of_right)
            
            
            if infogain > best['infogain']:
                best = {'feature' : i,
                        'split' : split,
                        'infogain' : infogain, 
                        'left_indices' : left_indices,
                        'right_indices' : right_indices}
    return best

0 个答案:

没有答案