import numpy as np

class tree:
    def __init__(self, varNo, value, operator):
        self.rootNode = treeNode(0, value, varNo=varNo, operator=operator)
        self.nodes = []
        self.nodes.append(self.rootNode)
        self.leafNodes = []
        self.leafNodes.append(0)

    def addNode(self, ChildOf, branch, value, operator='<', varNo=0):
        node = treeNode(len(self.nodes), value, ChildOf=ChildOf, operator=operator, varNo=varNo)
        self.leafNodes.append(node.number)
        self.nodes.append(node)
        parent = self.nodes[ChildOf]
        if branch:
            parent.leftTrue = node
        else:
            parent.rightFalse = node
        if parent.leftTrue is not None and parent.rightFalse is not None:
            # the parent now has both children, so it is no longer a leaf
            toDelete = self.leafNodes.index(parent.number)
            del self.leafNodes[toDelete]
        return node.number

    def trace(self, x):
        traceRoute = self.rootNode.trace(x)[0]
        return traceRoute

    def eval(self, x):
        traceRoute = self.trace(x)
        y = np.zeros(len(traceRoute))
        for i in range(len(y)):
            # the last node on each trace is the leaf that stores the value
            y[i] = self.nodes[traceRoute[i][-1]]()
        return y

    def weightedPathLength(self, X):
        traceRoute = self.trace(X)
        total = 0
        for i in range(len(traceRoute)):
            total = total + len(traceRoute[i]) - 1
        return total

    def numberOfLeafs(self):
        return len(self.leafNodes)

    def print(self, maxlevels=-1):
        # returns a TikZ description of the tree
        ongoingstring = "\\node {" + self.rootNode.conditionString() + " }\n"
        if self.rootNode.leftTrue is not None:
            ongoingstring = self.rootNode.leftTrue.addMyString(ongoingstring, maxlevels, ' ')
        if self.rootNode.rightFalse is not None:
            ongoingstring = self.rootNode.rightFalse.addMyString(ongoingstring, maxlevels, ' ')
        ongoingstring = ongoingstring + " ;"
        return ongoingstring

class treeNode:
    def __init__(self, number, value, ChildOf=None, operator='<', varNo=0):
        self.number = number
        self.childOf = ChildOf
        self.leftTrue = None
        self.rightFalse = None
        self.value = value
        self.varNo = varNo
        self.operator = operator

    def __call__(self):
        return self.value

    def leafNode(self):
        if self.leftTrue is not None and self.rightFalse is not None:
            return False
        else:
            return True

    def evalCondition(self, x):
        if self.operator == '=':
            cond = x[:, self.varNo] == self.value
        elif self.operator == '<':
            cond = x[:, self.varNo] < self.value
        else:  # case >
            cond = x[:, self.varNo] > self.value
        return cond

    def trace(self, x, index=None, traceRoute=None):
        if index is None:
            index = np.arange(len(x))
        if traceRoute is None:
            traceRoute = [[] for _ in range(len(x))]
        for k in index:
            traceRoute[k].append(self.number)
        if self.leafNode():
            return (traceRoute, index)
        cond = self.evalCondition(x[index])
        trueIndex = index[cond]
        falseIndex = index[~cond]
        if self.leftTrue is not None and trueIndex.size != 0:
            traceRoute = self.leftTrue.trace(x, trueIndex, traceRoute)[0]
        if self.rightFalse is not None and falseIndex.size != 0:
            traceRoute = self.rightFalse.trace(x, falseIndex, traceRoute)[0]
        return (traceRoute, index)

    def conditionString(self):
        if not self.leafNode():
            mystring = "$\\tiny %d \\mathrel{||} x[%d] %s %.2f$" % (self.number, self.varNo, self.operator, self.value)
        else:
            mystring = "$\\tiny %d \\mathrel{||} %.2f$" % (self.number, self.value)
        return mystring

    def addMyString(self, ongoingstring, levelsleft=-1, indent=''):
        if levelsleft == 0:
            return ongoingstring
        if not self.leafNode():
            ongoingstring = ongoingstring + indent + "child { node {" + self.conditionString() + " }\n"
        else:
            ongoingstring = ongoingstring + indent + "child { node[fill=gray!30] {" + self.conditionString() + " }\n"
        if self.leftTrue is not None:
            ongoingstring = self.leftTrue.addMyString(ongoingstring, levelsleft - 1, indent + ' ')
        if self.rightFalse is not None:
            ongoingstring = self.rightFalse.addMyString(ongoingstring, levelsleft - 1, indent + ' ')
        ongoingstring = ongoingstring + indent + "}\n"
        return ongoingstring

if __name__ == '__main__':
    np.random.seed(3)
    bicycleTree = tree(0, 1, '=')
    No = bicycleTree.addNode(0, False, 1, varNo=1, operator='=')
    bicycleTree.addNode(No, False, 0)
    bicycleTree.addNode(No, True, 1)
    No = bicycleTree.addNode(0, True, 1, varNo=2, operator='=')
    bicycleTree.addNode(No, True, 0)
    No = bicycleTree.addNode(No, False, 1, varNo=3, operator='=')
    bicycleTree.addNode(No, True, 0)
    bicycleTree.addNode(No, False, 1)

    import time
    x = np.array([True, False, False, False]).reshape(1, 4)
    y = bicycleTree.eval(x)
    traceRoute = bicycleTree.trace(x)
    print(traceRoute)
    print(y)
    x = np.random.randint(2, size=(1000000, 4))
    t1 = time.perf_counter()  # time.clock() was removed in Python 3.8
    y = bicycleTree.eval(x)
    t2 = time.perf_counter()
    print(t2 - t1)
    traceRoute = bicycleTree.trace(x)
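
To try the class in isolation, a minimal sketch is enough; it relies only on the methods defined above, and the module name binaryTree matches the imports used in the later listings:

import numpy as np
from binaryTree import tree

t = tree(0, 0.5, '<')          # root: split on x[0] < 0.5
t.addNode(0, True, 1.0)        # leaf reached when the condition holds
t.addNode(0, False, 0.0)       # leaf reached when it does not
print(t.eval(np.array([[0.2], [0.9]])))  # -> [1. 0.]
print(t.print())               # TikZ node/child description of the tree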
:::::::::::::::::::::::::::::::::::::::::::::
import numpy as np

def weightedSelfInformation(x):
    # contribution p*log2(p); defined as 0 for p <= 0
    y = 0 if x <= 0 else x*np.log2(x)
    return y

def CalConditionalEntropy(y, D, Feature):
    # conditional entropy of the label y given the binary feature D[:,Feature]
    sizeDataBase = D.shape[0]
    D = D.astype(bool)
    TrueFeatureDatabase = np.sum(D[:, Feature])
    FalseFeatureDatabase = sizeDataBase - TrueFeatureDatabase
    PFeatureTrue = TrueFeatureDatabase/sizeDataBase
    PFeatureFalse = FalseFeatureDatabase/sizeDataBase
    Htrue = 0
    if PFeatureTrue > 0:
        P_AB_True = TrueFeatureDatabase - np.sum(np.logical_and(D[:, Feature], y))
        P_AB_False = TrueFeatureDatabase - P_AB_True
        P_AB_True = P_AB_True/TrueFeatureDatabase
        P_AB_False = P_AB_False/TrueFeatureDatabase
        Htrue = PFeatureTrue * (weightedSelfInformation(P_AB_False) + weightedSelfInformation(P_AB_True))
    Hfalse = 0
    if PFeatureFalse > 0:
        P_AB_True = FalseFeatureDatabase - np.sum(np.logical_and(~D[:, Feature], y))
        P_AB_False = FalseFeatureDatabase - P_AB_True
        P_AB_True = P_AB_True/FalseFeatureDatabase
        P_AB_False = P_AB_False/FalseFeatureDatabase
        Hfalse = PFeatureFalse * (weightedSelfInformation(P_AB_False) + weightedSelfInformation(P_AB_True))
    H = -Htrue - Hfalse
    return H

dataSet = np.array([[1, 0, 0, 0, 1], [0, 0, 0, 0, 0],
                    [0, 0, 1, 0, 1], [0, 0, 0, 0, 0],
                    [1, 0, 1, 0, 1], [0, 0, 1, 1, 1],
                    [1, 0, 0, 1, 0], [1, 1, 0, 0, 0],
                    [1, 1, 1, 0, 0]])
x = dataSet[:, 0:4]
y = dataSet[:, 4]
for i in range(4):
    H = CalConditionalEntropy(y, x, i)
    print(H)
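
What the loop prints is the conditional entropy of the label y given each binary feature A. Written out, CalConditionalEntropy computes

H(y \mid A) = -\sum_{a \in \{0,1\}} P(A{=}a) \sum_{c \in \{0,1\}} P(y{=}c \mid A{=}a) \log_2 P(y{=}c \mid A{=}a)

where the inner terms p log2(p), with the convention 0 log2(0) = 0, are supplied by weightedSelfInformation. The feature with the smallest H(y | A) leaves the least uncertainty about the label and is therefore the best candidate for a split.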
:::::::::::::::::::::::::::::::::::::::::::::::::
import numpy as np
from binaryTree import tree

class bDecisionTree:
    def _calGiniImpurity(self, y):
        unique, counts = np.unique(y, return_counts=True)
        N = counts/len(y)
        G = 1 - np.sum(N**2)
        return G

    def _bestSplit(self, X, y, feature):
        G = 1
        bestSplit = np.inf
        XSort = np.unique(X[:,feature].round(self.xDecimals)) #*\label{code:CARTunique}
        XDiff = (XSort[1:len(XSort)] + XSort[0:len(XSort)-1])/2 #*\label{code:CARTMittelwert}
        for i in range(XDiff.shape[0]):
            index = np.less(X[:,feature], XDiff[i])
            G1 = self._calGiniImpurity(y[index])
            G2 = self._calGiniImpurity(y[~index])
            GSplit = len(y[index])/len(y)*G1 + len(y[~index])/len(y)*G2 #*\label{code:CARTGewichtung}
            if G > GSplit:
                G = GSplit
                bestSplit = XDiff[i]
        return (bestSplit, G)

    def _chooseFeature(self, X, y):
        G = np.zeros(X.shape[1])
        bestSplit = np.zeros(X.shape[1])
        for i in range(X.shape[1]):
            (bestSplit[i], G[i]) = self._bestSplit(X, y, i)
        smallest = np.argmin(G) #*\label{code:CARTargmin}
        return (G[smallest], bestSplit[smallest], smallest)

    def _ComputeValue(self, y):
        # majority class of y
        unique, counts = np.unique(y, return_counts=True)
        i = np.argmax(counts)
        return unique[i]

    def __init__(self, threshold=0.1, xDecimals=8, minLeafNodeSize=3):
        self.bTree = None
        self.threshold = threshold
        self.xDecimals = xDecimals
        self.minLeafNodeSize = minLeafNodeSize

    def _GenTree(self, X, y, parentNode, branch):
        commonValue = self._ComputeValue(y)
        initG = self._calGiniImpurity(y)
        if initG < self.threshold or X.shape[0] <= self.minLeafNodeSize: #*\label{code:CART-B1Start}
            self.bTree.addNode(parentNode, branch, commonValue)
            return #*\label{code:CART-B1End}
        (G, bestSplit, chooseA) = self._chooseFeature(X, y)
        if G > 0.98*initG: #*\label{code:CART-B2Start}
            self.bTree.addNode(parentNode, branch, commonValue)
            return #*\label{code:CART-B2End}
        if parentNode is None:
            self.bTree = tree(chooseA, bestSplit, '<')
            myNo = 0
        else:
            myNo = self.bTree.addNode(parentNode, branch, bestSplit, operator='<', varNo=chooseA)
        index = np.less(X[:,chooseA], bestSplit) #*\label{code:CART-AufteilenStart}
        XTrue = X[index,:]
        yTrue = y[index]
        XFalse = X[~index,:]
        yFalse = y[~index] #*\label{code:CART-AufteilenEnd}
        if XTrue.shape[0] > self.minLeafNodeSize: #*\label{code:CART-B3True}
            self._GenTree(XTrue, yTrue, myNo, True)
        else:
            commonValue = self._ComputeValue(yTrue)
            self.bTree.addNode(myNo, True, commonValue)
        if XFalse.shape[0] > self.minLeafNodeSize: #*\label{code:CART-B3False}
            self._GenTree(XFalse, yFalse, myNo, False)
        else:
            commonValue = self._ComputeValue(yFalse)
            self.bTree.addNode(myNo, False, commonValue)
        return

    def fit(self, X, y):
        self._GenTree(X, y, None, None)

    def predict(self, X):
        return self.bTree.eval(X)

    def decision_path(self, X):
        return self.bTree.trace(X)

    def weightedPathLength(self, X):
        return self.bTree.weightedPathLength(X)

    def numberOfLeafs(self):
        return self.bTree.numberOfLeafs()

if __name__ == '__main__':
    fFloat = open("iris.csv", "r")
    dataset = np.loadtxt(fFloat, delimiter=",")
    fFloat.close()
    np.random.seed(42)
    MainSet = np.arange(0, dataset.shape[0])
    Trainingsset = np.random.choice(dataset.shape[0], 120, replace=False)
    Testset = np.delete(MainSet, Trainingsset)
    XTrain = dataset[Trainingsset,:]
    yTrain = dataset[Trainingsset,4]
    XTest = dataset[Testset,:]
    yTest = dataset[Testset,4]
    myTree = bDecisionTree()
    myTree.fit(XTrain, yTrain)
    yPredict = myTree.predict(XTest)
    print(yPredict - yTest)
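
The two helper methods implement the standard CART criterion. In formulas, _calGiniImpurity and the weighting in _bestSplit (label code:CARTGewichtung) compute

G(S) = 1 - \sum_{k} p_k^2, \qquad G_{\mathrm{split}} = \frac{|S_1|}{|S|}\,G(S_1) + \frac{|S_2|}{|S|}\,G(S_2)

where p_k is the relative frequency of class k in the node's sample set S, and S_1, S_2 are the two halves produced by a candidate split; _chooseFeature then picks the feature whose best split yields the smallest weighted impurity.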
:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
import numpy as np
from CARTRegressionTree import bRegressionTree
np.random.seed(42)
x = 10*np.random.rand(1000,2)
y = np.zeros(1000)
index = np.flatnonzero(x[:,0]<2)
y[index] = 1
index = np.flatnonzero(np.logical_and(x[:,0] >= 2,x[:,1]<5))
y[index] = 1
MainSet = np.arange(0,1000)
Trainingsset = np.random.choice(1000, 800, replace=False)
Testset = np.delete(MainSet,Trainingsset)
XTrain = x[Trainingsset,:]
yTrain = y[Trainingsset]
XTest = x[Testset,:]
yTest = y[Testset]
smallTree = bRegressionTree()
smallTree.fit(XTrain,yTrain)
noise = 0.1*np.random.rand(1000) - 0.05
y = y + noise
yTrain = y[Trainingsset]
yTest = y[Testset]
complexTree = bRegressionTree()
complexTree.fit(XTrain,yTrain)
yPredict = complexTree.predict(XTest)
error = np.abs(yPredict - yTest)
print(error.mean())
yPredict = smallTree.predict(XTest)
error = np.abs(yPredict - yTest)
print(error.mean())
ValSet = np.random.choice(800, 200, replace=False)
xVal = XTrain[ValSet]
yVal = yTrain[ValSet]
Trainingsset = np.delete(Trainingsset,ValSet)
XTrain = x[Trainingsset,:]
yTrain = y[Trainingsset]
preTree = bRegressionTree(threshold = 2.5*10**-1)
preTree.fit(XTrain,yTrain)
yPredict = preTree.predict(xVal)
error = np.abs(yPredict - yVal)
print(error.mean())
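
A natural extension of this validation split is a small sweep over the pre-pruning threshold instead of trying a single value. A minimal sketch, assuming bRegressionTree behaves as in the listing below and reusing XTrain, yTrain, xVal, yVal from above (the candidate grid is an arbitrary choice for illustration):

# Pick the threshold with the lowest validation error.
bestThreshold, bestError = None, np.inf
for threshold in [0.05, 0.1, 0.25, 0.5, 1.0]:
    candidate = bRegressionTree(threshold=threshold)
    candidate.fit(XTrain, yTrain)
    error = np.abs(candidate.predict(xVal) - yVal).mean()
    if error < bestError:
        bestThreshold, bestError = threshold, error
print(bestThreshold, bestError)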
::::::::::::::::::::::::::::::::::::::::::::::::::
import numpy as np
from binaryTree import tree

class bRegressionTree:
    def _calLRSS(self, y):
        # residual sum of squares around the mean
        yMean = np.sum(y)/len(y)
        L2 = np.sum((y - yMean)**2)
        return L2

    def _bestSplit(self, X, y, feature):
        RSS = np.inf #*\label{code:RSSinit}
        bestSplit = np.inf
        XSort = np.unique(X[:,feature].round(self.xDecimals))
        XDiff = (XSort[1:len(XSort)] + XSort[0:len(XSort)-1])/2
        for i in range(XDiff.shape[0]):
            index = np.less(X[:,feature], XDiff[i])
            if not (np.all(index) or np.all(~index)):
                RSS_1 = self._calLRSS(y[index])
                RSS_2 = self._calLRSS(y[~index])
                RSSSplit = RSS_1 + RSS_2 #*\label{code:CARTJustAdd}
                if RSS > RSSSplit:
                    RSS = RSSSplit
                    bestSplit = XDiff[i]
        return (bestSplit, RSS)

    def _ComputeValue(self, y):
        return np.sum(y)/len(y)

    def _chooseFeature(self, X, y):
        G = np.zeros(X.shape[1])
        bestSplit = np.zeros(X.shape[1])
        for i in range(X.shape[1]):
            (bestSplit[i], G[i]) = self._bestSplit(X, y, i)
        smallest = np.argmin(G)
        return (G[smallest], bestSplit[smallest], smallest)

    def __init__(self, threshold=10**-8, xDecimals=8, minLeafNodeSize=3):
        self.bTree = None
        self.threshold = threshold
        self.xDecimals = xDecimals
        self.minLeafNodeSize = minLeafNodeSize

    def _GenTree(self, X, y, parentNode, branch):
        commonValue = self._ComputeValue(y)
        initG = self._calLRSS(y)
        if initG < self.threshold or X.shape[0] <= self.minLeafNodeSize:
            self.bTree.addNode(parentNode, branch, commonValue)
            return
        (G, bestSplit, chooseA) = self._chooseFeature(X, y)
        if G > initG:
            self.bTree.addNode(parentNode, branch, commonValue)
            return
        if parentNode is None:
            self.bTree = tree(chooseA, bestSplit, '<')
            myNo = 0
        else:
            myNo = self.bTree.addNode(parentNode, branch, bestSplit, operator='<', varNo=chooseA)
        index = np.less(X[:,chooseA], bestSplit)
        XTrue = X[index,:]
        yTrue = y[index]
        XFalse = X[~index,:]
        yFalse = y[~index]
        if XTrue.shape[0] > self.minLeafNodeSize:
            self._GenTree(XTrue, yTrue, myNo, True)
        else:
            commonValue = self._ComputeValue(yTrue)
            self.bTree.addNode(myNo, True, commonValue)
        if XFalse.shape[0] > self.minLeafNodeSize:
            self._GenTree(XFalse, yFalse, myNo, False)
        else:
            commonValue = self._ComputeValue(yFalse)
            self.bTree.addNode(myNo, False, commonValue)
        return

    def fit(self, X, y):
        self._GenTree(X, y, None, None)

    def predict(self, X):
        return self.bTree.eval(X)

    def decision_path(self, X):
        return self.bTree.trace(X)

    def weightedPathLength(self, X):
        return self.bTree.weightedPathLength(X)

    def numberOfLeafs(self):
        return self.bTree.numberOfLeafs()

if __name__ == '__main__':
    np.random.seed(42)
    numberOfSamples = 10000
    X = np.random.rand(numberOfSamples, 2)
    Y = (np.sin(2*np.pi*X[:,0]) + np.cos(np.pi*X[:,1])) * np.exp(1 - X[:,0]**2 - X[:,1]**2)
    MainSet = np.arange(0, X.shape[0])
    Trainingsset = np.random.choice(X.shape[0], int(0.8*X.shape[0]), replace=False)
    Testset = np.delete(MainSet, Trainingsset)
    regressionError = np.zeros(5)
    for i in range(5):
        errorRate = 0.05*i #*\label{code:CARTBeispiel3}
        errorFactor = 1 + 2*(np.random.rand(Trainingsset.shape[0]) - 0.5)*errorRate #*\label{code:CARTBeispiel1}
        XTrain = X[Trainingsset,:]
        yTrain = Y[Trainingsset] * errorFactor #*\label{code:CARTBeispiel2}
        XTest = X[Testset,:]
        yTest = Y[Testset]
        myTree = bRegressionTree(xDecimals=3)
        myTree.fit(XTrain, yTrain)
        yPredict = myTree.predict(XTest)
        yDiff = np.abs(yPredict - yTest)
        regressionError[i] = np.mean(yDiff)
    import matplotlib.pyplot as plt
    fig1 = plt.figure(1)
    ax = fig1.add_subplot(1,1,1)
    x = np.arange(0, 0.25, 0.05)
    ax.plot(x, regressionError, 'o-', c='k')
    ax.set_xlabel('% Noise')
    ax.set_ylabel('Mean Absolute Error')
    from mpl_toolkits.mplot3d import Axes3D
    fig2 = plt.figure(2)
    ax = fig2.add_subplot(1,1,1, projection='3d')
    ax.scatter(XTest[:,0], XTest[:,1], yPredict, alpha=0.6, c=yPredict, cmap='gray')
    ax.set_xlabel('x[0]')
    ax.set_ylabel('x[1]')
    ax.set_zlabel('yPredict')
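
For regression, _bestSplit replaces the Gini impurity by the residual sum of squares of the two children, which are simply added (label code:CARTJustAdd) rather than weighted:

\mathrm{RSS}(S) = \sum_{i \in S} \bigl(y_i - \bar{y}_S\bigr)^2, \qquad
s^\ast = \arg\min_{s} \Bigl[\mathrm{RSS}\bigl(\{i : x_{i,j} < s\}\bigr) + \mathrm{RSS}\bigl(\{i : x_{i,j} \ge s\}\bigr)\Bigr]

Each leaf then predicts the mean \bar{y}_S of its sample set, as implemented in _ComputeValue.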
:::::::::::::::::::::::::::::::::::::::::::::
import numpy as np
from binaryTree import tree

class bRegressionTree:
    def _calLRSS(self, y):
        # residual sum of squares around the mean
        yMean = np.sum(y)/len(y)
        L2 = np.sum((y - yMean)**2)
        return L2

    def _bestSplit(self, X, y, feature):
        RSS = np.inf
        bestSplit = np.inf
        XSort = np.unique(X[:,feature].round(self.xDecimals))
        XDiff = (XSort[1:len(XSort)] + XSort[0:len(XSort)-1])/2
        for i in range(XDiff.shape[0]):
            index = np.less(X[:,feature], XDiff[i])
            if not (np.all(index) or np.all(~index)):
                RSS_1 = self._calLRSS(y[index])
                RSS_2 = self._calLRSS(y[~index])
                RSSSplit = RSS_1 + RSS_2
                if RSS > RSSSplit:
                    RSS = RSSSplit
                    bestSplit = XDiff[i]
        return (bestSplit, RSS)

    def _ComputeValue(self, y):
        return np.sum(y)/len(y)

    def _chooseFeature(self, X, y):
        G = np.inf*np.ones(X.shape[1]) #*\label{code:RF:0}
        bestSplit = np.zeros(X.shape[1])
        if self.n == 0: #*\label{code:RF:2}
            feature = np.arange(X.shape[1])
        elif self.n == -1:
            feature = np.random.choice(X.shape[1], int(np.sqrt(X.shape[1])), replace=False)
        else:
            feature = np.random.choice(X.shape[1], self.n, replace=False)
        for i in feature: #*\label{code:RF:3}
            (bestSplit[i], G[i]) = self._bestSplit(X, y, i)
        smallest = np.argmin(G) #*\label{code:RF:4}
        return (G[smallest], bestSplit[smallest], smallest)

    def __init__(self, n=0, threshold=10**-8, xDecimals=8, minLeafNodeSize=3):
        self.n = n  # features per split: 0 = all, -1 = sqrt of the feature count, k = exactly k
        self.bTree = None
        self.threshold = threshold
        self.xDecimals = xDecimals
        self.minLeafNodeSize = minLeafNodeSize

    def _GenTree(self, X, y, parentNode, branch):
        commonValue = self._ComputeValue(y)
        initG = self._calLRSS(y)
        if initG < self.threshold or X.shape[0] <= self.minLeafNodeSize:
            self.bTree.addNode(parentNode, branch, commonValue)
            return
        (G, bestSplit, chooseA) = self._chooseFeature(X, y)
        if G > initG:
            self.bTree.addNode(parentNode, branch, commonValue)
            return
        if parentNode is None:
            self.bTree = tree(chooseA, bestSplit, '<')
            myNo = 0
        else:
            myNo = self.bTree.addNode(parentNode, branch, bestSplit, operator='<', varNo=chooseA)
        index = np.less(X[:,chooseA], bestSplit)
        XTrue = X[index,:]
        yTrue = y[index]
        XFalse = X[~index,:]
        yFalse = y[~index]
        if XTrue.shape[0] > self.minLeafNodeSize:
            self._GenTree(XTrue, yTrue, myNo, True)
        else:
            commonValue = self._ComputeValue(yTrue)
            self.bTree.addNode(myNo, True, commonValue)
        if XFalse.shape[0] > self.minLeafNodeSize:
            self._GenTree(XFalse, yFalse, myNo, False)
        else:
            commonValue = self._ComputeValue(yFalse)
            self.bTree.addNode(myNo, False, commonValue)
        return

    def fit(self, X, y):
        self._GenTree(X, y, None, None)

    def predict(self, X):
        return self.bTree.eval(X)

    def decision_path(self, X):
        return self.bTree.trace(X)

    def weightedPathLength(self, X):
        return self.bTree.weightedPathLength(X)

    def numberOfLeafs(self):
        return self.bTree.numberOfLeafs()

if __name__ == '__main__':
    np.random.seed(42)
    numberOfSamples = 10000
    X = np.random.rand(numberOfSamples, 2)
    Y = (np.sin(2*np.pi*X[:,0]) + np.cos(np.pi*X[:,1])) * np.exp(1 - X[:,0]**2 - X[:,1]**2)
    MainSet = np.arange(0, X.shape[0])
    Trainingsset = np.random.choice(X.shape[0], int(0.8*X.shape[0]), replace=False)
    Testset = np.delete(MainSet, Trainingsset)
    regressionError = np.zeros(5)
    for i in range(5):
        errorRate = 0.05*i
        errorFactor = 1 + 2*(np.random.rand(Trainingsset.shape[0]) - 0.5)*errorRate
        XTrain = X[Trainingsset,:]
        yTrain = Y[Trainingsset] * errorFactor
        XTest = X[Testset,:]
        yTest = Y[Testset]
        myTree = bRegressionTree(xDecimals=3)
        myTree.fit(XTrain, yTrain)
        yPredict = myTree.predict(XTest)
        yDiff = np.abs(yPredict - yTest)
        regressionError[i] = np.mean(yDiff)
    import matplotlib.pyplot as plt
    fig1 = plt.figure(1)
    ax = fig1.add_subplot(1,1,1)
    x = np.arange(0, 0.25, 0.05)
    ax.plot(x, regressionError, 'o-')
    ax.set_xlabel('% Noise')
    ax.set_ylabel('Mean Absolute Error')
    from mpl_toolkits.mplot3d import Axes3D
    from matplotlib import cm
    fig2 = plt.figure(2)
    ax = fig2.add_subplot(1,1,1, projection='3d')
    ax.scatter(XTest[:,0], XTest[:,1], yPredict, alpha=0.6, c=yPredict, cmap=cm.jet)
    ax.set_xlabel('x[0]')
    ax.set_ylabel('x[1]')
    ax.set_zlabel('yPredict')
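
The only functional difference to the previous listing is the random feature subsampling in _chooseFeature, controlled by the constructor parameter n. A minimal sketch of the three modes, reusing the XTrain/XTest arrays from the demo above:

# n=0: consider all features at every split (plain CART);
# n=-1: draw sqrt(#features) candidates per split; n=k: draw exactly k.
for mode in (0, -1, 1):
    t = bRegressionTree(n=mode, xDecimals=3)
    t.fit(XTrain, yTrain)
    print(mode, np.mean(np.abs(t.predict(XTest) - yTest)))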
::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
import numpy as np
from CARTRegressionTree import bRegressionTree
f = open("hourCleanUp.csv")
header = f.readline().rstrip('\n') # skip the header
featureNames = header.split(',')
dataset = np.loadtxt(f, delimiter=",")
f.close()
X = dataset[:,0:13]
Y = dataset[:,15]
#X = np.delete(X,6, axis=1)
index = np.flatnonzero(X[:,8]==4)
X = np.delete(X,index, axis=0)
Y = np.delete(Y,index, axis=0)
np.random.seed(42)
MainSet = np.arange(0,X.shape[0])
Trainingsset = np.random.choice(X.shape[0], int(0.8*X.shape[0]), replace=False)
Testset = np.delete(MainSet,Trainingsset)
XTrain = X[Trainingsset,:]
yTrain = Y[Trainingsset]
XTest = X[Testset,:]
yTest = Y[Testset]
myTree = bRegressionTree(minLeafNodeSize=15,threshold=2)
myTree.fit(XTrain,yTrain)
yPredict = np.round(myTree.predict(XTest))
import matplotlib.pyplot as plt
plt.figure(1)
yDiff = yPredict - yTest
plt.hist(yDiff,22,color='gray')
plt.xlim(-200,200)
plt.title('Error on test data')
plt.figure(2)
plt.hist(yTest,22,color='gray')
plt.title('Test data')
print('Mean absolute deviation: %e ' % (np.mean(np.abs(yDiff))))
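
To put that number into perspective, a trivial baseline that always predicts the training mean can be added in two lines (an illustration, not part of the original script):

# Naive baseline: predict the mean of the training targets everywhere.
baseline = np.abs(np.mean(yTrain) - yTest)
print('Baseline deviation: %e ' % baseline.mean())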
:::::::::::::::::::::::::::::::::::::::::::::::::::
import numpy as np

def weightedSelfInformation(x):
    # contribution p*log2(p); defined as 0 for p <= 0
    y = 0 if x <= 0 else x*np.log2(x)
    return y

def CalConditionalEntropy(y, D, Feature):
    # conditional entropy of the label y given the binary feature D[:,Feature]
    sizeDataBase = D.shape[0]
    D = D.astype(bool)
    TrueFeatureDatabase = np.sum(D[:, Feature])
    FalseFeatureDatabase = sizeDataBase - TrueFeatureDatabase
    PFeatureTrue = TrueFeatureDatabase/sizeDataBase
    PFeatureFalse = FalseFeatureDatabase/sizeDataBase
    Htrue = 0
    if PFeatureTrue > 0:
        P_AB_True = TrueFeatureDatabase - np.sum(np.logical_and(D[:, Feature], y))
        P_AB_False = TrueFeatureDatabase - P_AB_True
        P_AB_True = P_AB_True/TrueFeatureDatabase
        P_AB_False = P_AB_False/TrueFeatureDatabase
        Htrue = PFeatureTrue * (weightedSelfInformation(P_AB_False) + weightedSelfInformation(P_AB_True))
    Hfalse = 0
    if PFeatureFalse > 0:
        P_AB_True = FalseFeatureDatabase - np.sum(np.logical_and(~D[:, Feature], y))
        P_AB_False = FalseFeatureDatabase - P_AB_True
        P_AB_True = P_AB_True/FalseFeatureDatabase
        P_AB_False = P_AB_False/FalseFeatureDatabase
        Hfalse = PFeatureFalse * (weightedSelfInformation(P_AB_False) + weightedSelfInformation(P_AB_True))
    H = -Htrue - Hfalse
    return H

dataSet = np.array([[1, 0, 0, 0, 1], [0, 0, 0, 0, 0],
                    [0, 0, 1, 0, 1], [0, 0, 0, 0, 0],
                    [1, 0, 1, 0, 1], [0, 0, 1, 1, 1],
                    [1, 0, 0, 1, 0], [1, 1, 0, 0, 0],
                    [1, 1, 1, 0, 0]])
x = dataSet[:, 0:4]
y = dataSet[:, 4]
for i in range(4):
    H = CalConditionalEntropy(y, x, i)
    print(H)

from binaryTree import tree

class ID3BinaryTree:
    def __init__(self):
        self.bTree = None

    def _chooseFeature(self, X, y):
        # compute the conditional entropy for every remaining feature
        H = np.zeros(X.shape[1])
        for i in range(len(H)):
            H[i] = CalConditionalEntropy(y, X, i)
        chooseA = np.argmin(H)  # pick the smallest conditional entropy
        return chooseA

    def _GenTree(self, X, y, parentNode, branch, A):
        if parentNode is None:  # the root node still has to be created
            A = np.arange(X.shape[1])
        else:
            if len(y) == np.sum(y):  # only positive cases left?
                self.bTree.addNode(parentNode, branch, True)
                return
            elif 0 == np.sum(y):
                self.bTree.addNode(parentNode, branch, False)
                return
        commonValue = True if np.sum(y) > len(y)/2 else False
        if X.shape[0] == 0:  # no data left to split?
            self.bTree.addNode(parentNode, branch, commonValue)
            return
        chooseA = self._chooseFeature(X, y)
        if parentNode is None:  # the root node still has to be created
            self.bTree = tree(chooseA, True, '=')
            myNo = 0
        else:  # create a new inner node in the tree
            myNo = self.bTree.addNode(parentNode, branch, True, operator='=', varNo=A[chooseA])
        # remove the chosen feature from X and A
        index = np.flatnonzero(X[:, chooseA])
        X = np.delete(X, chooseA, axis=1)
        A = np.delete(A, chooseA, axis=0)
        # split X
        XTrue = X[index, :]
        yTrue = y[index]
        XFalse = np.delete(X, index, axis=0)
        yFalse = np.delete(y, index, axis=0)
        if XTrue.shape[0] > 0:
            self._GenTree(XTrue, yTrue, myNo, True, A)
        else:
            self.bTree.addNode(myNo, True, commonValue)
        if XFalse.shape[0] > 0:
            self._GenTree(XFalse, yFalse, myNo, False, A)
        else:
            self.bTree.addNode(myNo, False, commonValue)
        return

    def fit(self, X, y):
        self._GenTree(X, y, None, None, None)

    def predict(self, X):
        return self.bTree.eval(X)

    def decisionPath(self, X):
        return self.bTree.trace(X)

    def weightedPathLength(self, X):
        return self.bTree.weightedPathLength(X)

    def numberOfLeafs(self):
        return self.bTree.numberOfLeafs()

myTree = ID3BinaryTree()
myTree.fit(x, y)
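
The script ends with fit; to inspect what the tree has learned one can, for example, compare its predictions against the labels and export the TikZ drawing, a short sketch using only the methods defined above:

yPredict = myTree.predict(x)
print(yPredict - y)          # all zeros if the training set is reproduced exactly
print(myTree.bTree.print())  # TikZ description of the learned tree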
::::::::::::::::::::::::::::::::::::::::::::::::::
import numpy as np
from CARTRegressionTreeRF import bRegressionTree

class randomForestRegression:
    def __init__(self, noOfTrees=10, threshold=10**-8, xDecimals=8, minLeafNodeSize=3, perc=1):
        self.perc = perc
        self.threshold = threshold
        self.xDecimals = xDecimals
        self.minLeafNodeSize = minLeafNodeSize
        self.bTree = []
        self.noOfTrees = noOfTrees
        for i in range(noOfTrees):
            tempTree = bRegressionTree(threshold=self.threshold, xDecimals=self.xDecimals, minLeafNodeSize=self.minLeafNodeSize)
            self.bTree.append(tempTree)

    def fit(self, X, y):
        self.samples = []
        for i in range(self.noOfTrees):
            # draw a bootstrap sample (with replacement) for every tree
            bootstrapSample = np.random.randint(X.shape[0], size=int(self.perc*X.shape[0]))
            self.samples.append(bootstrapSample) #*\label{code:realRF:0}
            bootstrapX = X[bootstrapSample,:]
            bootstrapY = y[bootstrapSample]
            self.bTree[i].fit(bootstrapX, bootstrapY)

    def predict(self, X):
        # average the predictions of all trees
        ypredict = np.zeros(X.shape[0])
        for i in range(self.noOfTrees):
            ypredict += self.bTree[i].predict(X)
        ypredict = ypredict/self.noOfTrees
        return ypredict

if __name__ == '__main__':
    f = open("hourCleanUp.csv") #*\label{code:realRF:1}
    header = f.readline().rstrip('\n')  # skip the header
    featureNames = header.split(',')
    dataset = np.loadtxt(f, delimiter=",")
    f.close()
    X = dataset[:,0:13]
    Y = dataset[:,15]
    index = np.flatnonzero(X[:,8]==4)
    X = np.delete(X, index, axis=0)
    Y = np.delete(Y, index, axis=0)
    np.random.seed(42)
    MainSet = np.arange(0, X.shape[0])
    Trainingsset = np.random.choice(X.shape[0], int(0.8*X.shape[0]), replace=False)
    Testset = np.delete(MainSet, Trainingsset)
    XTrain = X[Trainingsset,:]
    yTrain = Y[Trainingsset]
    XTest = X[Testset,:]
    yTest = Y[Testset] #*\label{code:realRF:2}
    myForest = randomForestRegression(noOfTrees=24, minLeafNodeSize=5, threshold=2)
    myForest.fit(XTrain, yTrain)
    yPredict = np.round(myForest.predict(XTest))
    yDiff = yPredict - yTest
    print('Mean absolute deviation: %e ' % (np.mean(np.abs(yDiff))))
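
The forest size is the main knob of this ensemble. A short sketch, not part of the original script, of how the test deviation could be tracked as trees are added, reusing XTrain, yTrain, XTest, yTest from above:

# Refit forests of increasing size and watch the deviation change.
for trees in (1, 5, 10, 24):
    forest = randomForestRegression(noOfTrees=trees, minLeafNodeSize=5, threshold=2)
    forest.fit(XTrain, yTrain)
    dev = np.mean(np.abs(np.round(forest.predict(XTest)) - yTest))
    print('%2d trees: %e' % (trees, dev))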
:::::::::::::::::::::::::::::::::::::::::::::::::