import numpy as np
from collections import Counter
###################################
def entropy(D):
    """Return the Shannon entropy H(D) in bits of the label sequence D.

    Parameters
    ----------
    D : sequence of hashable labels

    Returns
    -------
    float
        H(D) = -sum_k p_k * log2(p_k); 0.0 for an empty sequence.
    """
    # list(...) is required on Python 3: np.array(dict_values) would build
    # a useless 0-d object array and .sum() would fail.
    counts = np.array(list(Counter(D).values()))
    if counts.size == 0:  # empty input: define H([]) = 0
        return 0.0
    P = counts / float(counts.sum())
    return float(np.dot(-P, np.log2(P)))


def condition_entropy(D, A):
    """Return the conditional entropy H(D|A) in bits.

    H(D|A) = sum_i p(A = a_i) * H(D | A = a_i)

    Parameters
    ----------
    D : sequence of labels
    A : sequence of feature values, same length as D

    Returns
    -------
    float
    """
    A = np.asarray(A)
    D = np.asarray(D)
    H_da = 0.0
    for a in np.unique(A):
        Di = D[A == a]  # subset of D where the feature takes value a
        pi = Di.size / float(D.size)
        H_da += pi * entropy(Di)
    return H_da
# Toy training set: four discrete features (x1..x4) and a binary label y
# (the classic loan-application example used to illustrate ID3 feature
# selection by information gain).
x1 = [0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2]
x2 = [0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0]
x3 = [0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0]
x4 = [0, 1, 1, 0, 0, 0, 1, 1, 2, 2, 2, 1, 1, 2, 0]
y  = [0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0]
X = np.c_[x1, x2, x3, x4]  # features stacked as columns of one matrix
y = np.array(y)

Hy = entropy(y)                    # entropy H(y)
Hyx1 = condition_entropy(y, x1)    # conditional entropy H(y|x1)
Hyx2 = condition_entropy(y, x2)
Hyx3 = condition_entropy(y, x3)
Hyx4 = condition_entropy(y, x4)

g_yx1 = Hy - Hyx1                  # information gain g(y, x1)
g_yx2 = Hy - Hyx2
g_yx3 = Hy - Hyx3
g_yx4 = Hy - Hyx4

print(Hy)      # entropy H(y)
print(Hyx1)    # conditional entropy H(y|x1)
print(Hyx2)
print(Hyx3)
print(Hyx4)
print(g_yx1)   # information gain g(y, x1)
print(g_yx2)
print(g_yx3)
print(g_yx4)
# Example run: $ python mydecisiontree.py  (expected output listed below)
# Expected output of the run above:
#   0.970950594455
#   0.887943094599
#   0.647300396303
#   0.550977500433
#   0.607961031918
#   0.0830074998558
#   0.323650198152
#   0.419973094022
#   0.362989562537
# The same computation can also be done with np.apply_along_axis, as follows:
# Vectorized version: H(y|x_j) for every feature column of X at once.
# NOTE: np.apply_along_axis passes the 1-D column as the FIRST argument to
# the callback, while condition_entropy's signature is (D, A) — passing it
# directly would compute H(x|y) instead of H(y|x).  Wrap it in a lambda to
# keep the argument order straight.
H_X = np.apply_along_axis(lambda col: condition_entropy(y, col), 0, X)
print(H_X)
g_Xy = Hy - H_X              # information gain of every feature
print(g_Xy)
print(g_Xy.argmax())         # index of the feature with the largest gain
# Expected output:
#   [ 0.88794309  0.6473004   0.5509775   0.60796103]
#   [ 0.0830075   0.3236502   0.41997309  0.36298956]
#   2
# Caveat: np.apply_along_axis hands the feature column to the callback as its
# first argument, so to use condition_entropy(D, A) directly its parameters
# would have to be swapped to condition_entropy(A, D) (or the call wrapped).
# Information gain ratio: a feature's information gain divided by that
# feature's own entropy.
# Entropy H(x_j) of each individual feature, i.e. of every column of X.
HX = np.array([entropy(column) for column in X.T])
# Information gain ratio (gain divided by the feature's entropy):
#   gr_Xy = g_Xy / HX