Wednesday, 29 May 2019

Artificial Intelligence Source Code Python, Part 6

import numpy as np
class tree:
    def __init__(self, varNo, value, operator):
        self.rootNode = treeNode(0, value, varNo=varNo, operator=operator)
        self.nodes = []
        self.nodes.append(self.rootNode)
        self.leafNodes = []
        self.leafNodes.append(0)
    def addNode(self, ChildOf, branch, value, operator='<', varNo=0):
        node = treeNode(len(self.nodes),value,ChildOf=ChildOf,operator=operator,varNo=varNo)
        self.leafNodes.append(node.number)
        self.nodes.append(node)
        parent = self.nodes[ChildOf]
        if branch is True:
           parent.leftTrue = node
        else:
           parent.rightFalse = node
        if parent.leftTrue is not None and parent.rightFalse is not None:
            toDelete = self.leafNodes.index(parent.number)
            del self.leafNodes[toDelete]
        return(node.number)
    def trace(self, x):
        traceRoute = self.rootNode.trace(x)[0]
        return traceRoute
    def eval(self, x):
        traceRoute = self.trace(x)
        y = np.zeros(len(traceRoute))
        for i in range(len(y)):
            y[i] = self.nodes[traceRoute[i][-1]]()           
        return(y)
       
    def weightedPathLength(self, X):
        traceroute = self.trace(X)
        total = 0
        for i in range(len(traceroute)):
            total = total + len(traceroute[i]) - 1
        return(total)
       
    def numberOfLeafs(self):
        return(len(self.leafNodes))
    def print(self, maxlevels=-1):
        ongoingstring = "\\node {"+self.rootNode.conditionString()+" }\n"
        if self.rootNode.leftTrue is not None:
            ongoingstring = self.rootNode.leftTrue.addMyString(ongoingstring, maxlevels, '  ')
        if self.rootNode.rightFalse is not None:
            ongoingstring = self.rootNode.rightFalse.addMyString(ongoingstring, maxlevels, '  ')
        ongoingstring = ongoingstring + " ;"
        return(ongoingstring)
class treeNode:
    def __init__(self, number, value, ChildOf=None, operator='<', varNo=0):
        self.number     = number
        self.childOf    = ChildOf
        self.leftTrue   = None
        self.rightFalse = None
        self.value      = value
        self.varNo      = varNo
        self.operator   = operator
    def __call__(self):
        return(self.value)
    def leafNode(self):
        if self.leftTrue is not None and self.rightFalse is not None:
            return(False)
        else:
            return(True)
    def evalCondition(self, x):
        if self.operator == '=':
            cond = x[:, self.varNo] == self.value
        elif self.operator == '<':
            cond = x[:, self.varNo] < self.value
        else: # case >
            cond = x[:, self.varNo] > self.value
        return cond
    def trace(self, x, index=None, traceRoute=None):
        if index is None:
            index = np.arange(len(x))
        if traceRoute is None:
            traceRoute = [[] for _ in range(len(x))]
        for k in index:
            traceRoute[k].append(self.number)
        if self.leafNode():
            return (traceRoute, index)
        cond = self.evalCondition(x[index])
        trueIndex  = index[cond]
        falseIndex = index[~cond]
        if self.leftTrue is not None and trueIndex.size != 0:
            traceRoute = self.leftTrue.trace(x, trueIndex, traceRoute)[0]
        if self.rightFalse is not None and falseIndex.size != 0:
            traceRoute =  self.rightFalse.trace(x, falseIndex, traceRoute)[0]
        return (traceRoute, index)
    def conditionString(self):
        if not self.leafNode():
            mystring = "$\\tiny %d \\mathrel{||} x[%d] %s %.2f$" % (self.number, self.varNo, self.operator, self.value)
        else:
            mystring = "$\\tiny %d \\mathrel{||} %.2f$" % (self.number, self.value)
        return(mystring)
    def addMyString(self, ongoingstring, levelsleft=-1, indent=''):
        if levelsleft == 0:
            return ongoingstring
        if not self.leafNode():
            ongoingstring = ongoingstring + indent + "child { node {"+self.conditionString()+" }\n"
        else:
            ongoingstring = ongoingstring + indent + "child { node[fill=gray!30] {"+self.conditionString()+" }\n"
        if self.leftTrue is not None:
            ongoingstring = self.leftTrue.addMyString(ongoingstring, levelsleft-1, indent + '  ')
        if self.rightFalse is not None:
            ongoingstring = self.rightFalse.addMyString(ongoingstring, levelsleft-1, indent + '  ')
        ongoingstring = ongoingstring + indent + "}\n"
           
        return(ongoingstring)
       

if __name__ == '__main__':
    np.random.seed(3)
    bicycleTree = tree(0,1,'=')
    No = bicycleTree.addNode(0,False,1,varNo=1,operator='=')
    bicycleTree.addNode(No,False,0)
    bicycleTree.addNode(No,True,1)
    No = bicycleTree.addNode(0,True,1,varNo=2,operator='=')
    bicycleTree.addNode(No,True,0)
    No = bicycleTree.addNode(No,False,1,varNo=3,operator='=')
    bicycleTree.addNode(No,True,0)
    bicycleTree.addNode(No,False,1)
    import time
    x = np.array([True,False,False,False]).reshape(1,4)
    y = bicycleTree.eval(x)
    traceRoute = bicycleTree.trace(x)
    print(traceRoute)
    print(y)
    x = np.random.randint(2, size=(1000000,4))
    t1 = time.perf_counter()  # time.clock() was removed in Python 3.8
    y = bicycleTree.eval(x)
    t2 = time.perf_counter()
    print(t2-t1)
    traceRoute = bicycleTree.trace(x)
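    # A hedged extra (my addition, not in the original listing): the remaining
    # tree methods can be used to inspect the structure that was just built.
    print(bicycleTree.print())                # TikZ source for drawing the tree
    print(bicycleTree.numberOfLeafs())        # number of leaf nodes
    print(bicycleTree.weightedPathLength(x))  # total path length over all samples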

:::::::::::::::::::::::::::::::::::::::::::::
import numpy as np
def weightedSelfInformation( x ):
    y = 0 if x <= 0 else x*np.log2(x)
    return(y)
def CalConditionalEntropy(y,D,Feature):
    sizeDataBase = D.shape[0]
    D = D.astype(bool)
    TrueFeatureDatabase  = np.sum(D[:,Feature])
    FalseFeatureDatabase = sizeDataBase - TrueFeatureDatabase
    PFeatureTrue  = TrueFeatureDatabase/sizeDataBase
    PFeatureFalse = FalseFeatureDatabase/sizeDataBase
   
    Htrue = 0
    if PFeatureTrue>0:
        P_AB_True  = TrueFeatureDatabase - np.sum(np.logical_and(D[:,Feature],y))
        P_AB_False = TrueFeatureDatabase - P_AB_True
        P_AB_True  = P_AB_True/TrueFeatureDatabase
        P_AB_False = P_AB_False/TrueFeatureDatabase
        Htrue      = PFeatureTrue * (weightedSelfInformation(P_AB_False) + weightedSelfInformation(P_AB_True) )
    Hfalse = 0
    if PFeatureFalse>0:
        P_AB_True  = FalseFeatureDatabase - np.sum(np.logical_and(~D[:,Feature],y))
        P_AB_False = FalseFeatureDatabase - P_AB_True
        P_AB_True  = P_AB_True/FalseFeatureDatabase
        P_AB_False = P_AB_False/FalseFeatureDatabase
        Hfalse     = PFeatureFalse * (weightedSelfInformation(P_AB_False) + weightedSelfInformation(P_AB_True) )
   
    H = -Htrue - Hfalse
    return(H) 
   
dataSet = np.array([[ 1  , 0   ,  0  , 0  , 1 ] , [  0  , 0   ,  0  , 0  , 0 ] ,
                    [ 0  , 0   ,  1  , 0  , 1 ] , [  0  , 0   ,  0  , 0  , 0 ] ,
                    [ 1  , 0   ,  1  , 0  , 1 ] , [  0  , 0   ,  1  , 1  , 1 ] ,
                    [ 1  , 0   ,  0  , 1  , 0 ] , [  1  , 1   ,  0  , 0  , 0 ] ,
                    [ 1  , 1   ,  1  , 0  , 0 ] ])
x = dataSet[:,0:4]
y = dataSet[:,4]
for i in range(4):
    H = CalConditionalEntropy(y,x,i)
    print(H)
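
# A small sketch (my addition, not in the original listing): an ID3-style
# feature choice simply takes the feature with the smallest conditional entropy.
H = np.array([CalConditionalEntropy(y, x, i) for i in range(x.shape[1])])
print('best feature:', np.argmin(H))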

  
:::::::::::::::::::::::::::::::::::::::::::::::::
import numpy as np
from binaryTree import tree
class bDecisionTree:
    def _calGiniImpurity(self,y):
        unique, counts = np.unique(y, return_counts=True)
        N = counts/len(y)
        G = 1 - np.sum(N**2)
        return(G)
    def _bestSplit(self,X,y,feature):
        G = 1
        bestSplit = np.inf
        XSort = np.unique(X[:,feature].round(self.xDecimals)) #*\label{code:CARTunique}
        XDiff = (XSort[1:len(XSort)] + XSort[0:len(XSort)-1])/2 #*\label{code:CARTMittelwert}
        for i in range(XDiff.shape[0]):
            index = np.less(X[:,feature], XDiff[i])
            G1 = self._calGiniImpurity(y[index])
            G2 = self._calGiniImpurity(y[~index])
            GSplit = len(y[index])/len(y)*G1 + len(y[~index])/len(y)*G2 #*\label{code:CARTGewichtung}
            if G > GSplit:
                G = GSplit
                bestSplit = XDiff[i]
        return (bestSplit, G)
    def _chooseFeature(self,X,y):
        G         = np.zeros(X.shape[1])
        bestSplit = np.zeros(X.shape[1])
        for i in range(X.shape[1]):
            ( bestSplit[i] , G[i] ) = self._bestSplit(X,y,i)
        smallest = np.argmin(G) #*\label{code:CARTargmin}
        return (G[smallest], bestSplit[smallest],smallest)
    def _ComputeValue(self,y):
        unique, counts = np.unique(y, return_counts=True)
        i = np.argmax(counts)
        return(unique[i])
    def __init__(self,threshold = 0.1, xDecimals = 8, minLeafNodeSize=3):
        self.bTree = None
        self.threshold = threshold
        self.xDecimals = xDecimals
        self.minLeafNodeSize = minLeafNodeSize
    def _GenTree(self,X,y,parentNode,branch):
        commonValue = self._ComputeValue(y)
        initG = self._calGiniImpurity(y)
        if  initG < self.threshold or X.shape[0] <= self.minLeafNodeSize: #*\label{code:CART-B1Start}
            self.bTree.addNode(parentNode,branch,commonValue)
            return()    #*\label{code:CART-B1End}
           
        (G, bestSplit ,chooseA) = self._chooseFeature(X,y)
        if  G  > 0.98*initG :  #*\label{code:CART-B2Start}
            self.bTree.addNode(parentNode,branch,commonValue)
            return()    #*\label{code:CART-B2End}
       
        if parentNode is None:
            self.bTree = tree(chooseA, bestSplit, '<')
            myNo = 0
        else:
            myNo = self.bTree.addNode(parentNode,branch,bestSplit,operator='<',varNo=chooseA)
        index = np.less(X[:,chooseA],bestSplit) #*\label{code:CART-AufteilenStart}
        XTrue  = X[index,:]
        yTrue  = y[index]
        XFalse = X[~index,:]
        yFalse = y[~index] #*\label{code:CART-AufteilenEnd}
               
        if XTrue.shape[0] > self.minLeafNodeSize: #*\label{code:CART-B3True}
            self._GenTree(XTrue,yTrue,myNo,True)
        else:
            commonValue = self._ComputeValue(yTrue)
            self.bTree.addNode(myNo,True,commonValue)
        if XFalse.shape[0] > self.minLeafNodeSize: #*\label{code:CART-B3False}
            self._GenTree(XFalse,yFalse,myNo,False)
        else:
            commonValue = self._ComputeValue(yFalse)
            self.bTree.addNode(myNo,False,commonValue)
        return()
    def fit(self, X,y):
        self._GenTree(X,y,None,None)
   
    def predict(self, X):
        return(self.bTree.eval(X))
   
    def decision_path(self, X):
        return(self.bTree.trace(X))
       
    def weightedPathLength(self,X):
        return(self.bTree.weightedPathLength(X))
       
    def numberOfLeafs(self):
        return(self.bTree.numberOfLeafs())
       
if __name__ == '__main__':       
    fFloat  = open("iris.csv","r")
    dataset = np.loadtxt(fFloat, delimiter=",")
    fFloat.close()
    np.random.seed(42)
    MainSet = np.arange(0,dataset.shape[0])
    Trainingsset = np.random.choice(dataset.shape[0], 120, replace=False)
    Testset = np.delete(MainSet,Trainingsset)
    XTrain = dataset[Trainingsset,:]
    yTrain = dataset[Trainingsset,4]
    XTest = dataset[Testset,:]
    yTest = dataset[Testset,4]
   
    myTree = bDecisionTree()
    myTree.fit(XTrain,yTrain)
   
    yPredict = myTree.predict(XTest)
    print(yPredict - yTest)
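    # A hedged addition (not in the original): summarize the test error as an
    # accuracy and report the size of the learned tree.
    print('Accuracy: %.3f' % np.mean(yPredict == yTest))
    print('Number of leaf nodes:', myTree.numberOfLeafs())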

:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
import numpy as np
from CARTRegressionTree import bRegressionTree
np.random.seed(42)
x = 10*np.random.rand(1000,2)
y = np.zeros(1000)
index = np.flatnonzero(x[:,0]<2)
y[index] = 1
index = np.flatnonzero(np.logical_and(x[:,0] >= 2, x[:,1]<5))
y[index] = 1
MainSet = np.arange(0,1000)
Trainingsset = np.random.choice(1000, 800, replace=False)
Testset = np.delete(MainSet,Trainingsset)
XTrain = x[Trainingsset,:]
yTrain = y[Trainingsset]
XTest = x[Testset,:]
yTest = y[Testset]
smallTree = bRegressionTree()
smallTree.fit(XTrain,yTrain)
noise = 0.1*np.random.rand(1000) - 0.05
y = y + noise
yTrain = y[Trainingsset]
yTest = y[Testset]
complexTree = bRegressionTree()
complexTree.fit(XTrain,yTrain)
yPredict = complexTree.predict(XTest)
error = np.abs(yPredict - yTest)
print(error.mean())
yPredict = smallTree.predict(XTest)
error = np.abs(yPredict - yTest)
print(error.mean())
ValSet = np.random.choice(800, 200, replace=False)
xVal = XTrain[ValSet]
yVal = yTrain[ValSet]
Trainingsset = np.delete(Trainingsset,ValSet)
XTrain = x[Trainingsset,:]
yTrain = y[Trainingsset]
preTree = bRegressionTree(threshold = 2.5*10**-1)
preTree.fit(XTrain,yTrain)
yPredict = preTree.predict(xVal)
error = np.abs(yPredict - yVal)
print(error.mean())
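
# A sketch of validation-driven pre-pruning (my assumption; the original stops
# at a single threshold): sweep a few thresholds and keep the one with the
# smallest error on the validation set.
for threshold in [0.05, 0.1, 0.25, 0.5]:
    candidate = bRegressionTree(threshold=threshold)
    candidate.fit(XTrain, yTrain)
    valError = np.abs(candidate.predict(xVal) - yVal).mean()
    print('threshold %.2f -> validation error %.4f' % (threshold, valError))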

::::::::::::::::::::::::::::::::::::::::::::::::::
import numpy as np
from binaryTree import tree
class bRegressionTree:
    def _calLRSS(self,y):
        yMean = np.sum(y)/len(y)
        L2 = np.sum( (y-yMean)**2)
        return(L2)
    def _bestSplit(self,X,y,feature):
        RSS = np.inf #*\label{code:RSSinit}
        bestSplit = np.inf
        XSort = np.unique(X[:,feature].round(self.xDecimals))
        XDiff = (XSort[1:len(XSort)] + XSort[0:len(XSort)-1])/2
        for i in range(XDiff.shape[0]):
            index = np.less(X[:,feature], XDiff[i])
            if not (np.all(index) or np.all(~index)):
                RSS_1 = self._calLRSS(y[index])
                RSS_2 = self._calLRSS(y[~index])
                RSSSplit = RSS_1 + RSS_2 #*\label{code:CARTJustAdd}
                if RSS > RSSSplit:
                    RSS = RSSSplit
                    bestSplit = XDiff[i]
        return (bestSplit, RSS)
    def _ComputeValue(self,y):
        return(np.sum(y)/len(y))
    def _chooseFeature(self,X,y):
        G         = np.zeros(X.shape[1])
        bestSplit = np.zeros(X.shape[1])
        for i in range(X.shape[1]):
            ( bestSplit[i] , G[i] ) = self._bestSplit(X,y,i)
        smallest = np.argmin(G)
        return (G[smallest], bestSplit[smallest],smallest)
    def __init__(self,threshold = 10**-8, xDecimals = 8, minLeafNodeSize=3):
        self.bTree = None
        self.threshold = threshold
        self.xDecimals = xDecimals
        self.minLeafNodeSize = minLeafNodeSize
    def _GenTree(self,X,y,parentNode,branch):
        commonValue = self._ComputeValue(y)
        initG = self._calLRSS(y)
        if  initG < self.threshold or X.shape[0] <= self.minLeafNodeSize:
            self.bTree.addNode(parentNode,branch,commonValue)
            return()   
           
        (G, bestSplit ,chooseA) = self._chooseFeature(X,y)
        if  G  > initG :
            self.bTree.addNode(parentNode,branch,commonValue)
            return()   
       
        if parentNode is None:
            self.bTree = tree(chooseA, bestSplit, '<')
            myNo = 0
        else:
            myNo = self.bTree.addNode(parentNode,branch,bestSplit,operator='<',varNo=chooseA)
        index = np.less(X[:,chooseA],bestSplit)
        XTrue  = X[index,:]
        yTrue  = y[index]
        XFalse = X[~index,:]
        yFalse = y[~index]
               
        if XTrue.shape[0] > self.minLeafNodeSize:
            self._GenTree(XTrue,yTrue,myNo,True)
        else:
            commonValue = self._ComputeValue(yTrue)
            self.bTree.addNode(myNo,True,commonValue)
        if XFalse.shape[0] > self.minLeafNodeSize:
            self._GenTree(XFalse,yFalse,myNo,False)
        else:
            commonValue = self._ComputeValue(yFalse)
            self.bTree.addNode(myNo,False,commonValue)
        return()
    def fit(self, X,y):
        self._GenTree(X,y,None,None)
   
    def predict(self, X):
        return(self.bTree.eval(X))
   
    def decision_path(self, X):
        return(self.bTree.trace(X))
       
    def weightedPathLength(self,X):
        return(self.bTree.weightedPathLength(X))
       
    def numberOfLeafs(self):
        return(self.bTree.numberOfLeafs())
       
if __name__ == '__main__':       
    np.random.seed(42)
    numberOfSamples = 10000
    X = np.random.rand(numberOfSamples,2)
    Y = ( np.sin(2*np.pi*X[:,0]) + np.cos(np.pi*X[:,1])) * np.exp(1 -X[:,0]**2 -X[:,1]**2 )
   
    MainSet = np.arange(0,X.shape[0])
    Trainingsset = np.random.choice(X.shape[0], int(0.8*X.shape[0]), replace=False)
    Testset = np.delete(MainSet,Trainingsset)
   
    regressionError = np.zeros(5)
    for i in range(5):
        errorRate = 0.05*i #*\label{code:CARTBeispiel3}
        errorFactor = 1 + 2*(np.random.rand(Trainingsset.shape[0]) - 0.5)*errorRate #*\label{code:CARTBeispiel1}
        XTrain = X[Trainingsset,:]
        yTrain = Y[Trainingsset] * errorFactor #*\label{code:CARTBeispiel2}
        XTest = X[Testset,:]
        yTest = Y[Testset]
       
        myTree = bRegressionTree(xDecimals=3)
        myTree.fit(XTrain,yTrain)
        yPredict = myTree.predict(XTest)
        yDiff = np.abs(yPredict - yTest)
        regressionError[i] = np.mean(yDiff)
   
    import matplotlib.pyplot as plt
    fig1 = plt.figure(1)
    ax = fig1.add_subplot(1,1,1)
    x = np.arange(0,0.25,0.05)
    ax.plot(x,regressionError,'o-',c='k')
    ax.set_xlabel('% Noise')
    ax.set_ylabel('Mean Absolute Error')
   
    from mpl_toolkits.mplot3d import Axes3D
   
    fig2 = plt.figure(2)
    ax = fig2.add_subplot(1,1,1, projection='3d')
    ax.scatter(XTest[:,0],XTest[:,1],yPredict,alpha=0.6,c =yPredict, cmap='gray')
    ax.set_xlabel('x[0]')
    ax.set_ylabel('x[1]')
    ax.set_zlabel('yPredict')
    plt.show()


:::::::::::::::::::::::::::::::::::::::::::::
import numpy as np
from binaryTree import tree
class bRegressionTree:
    def _calLRSS(self,y):
        yMean = np.sum(y)/len(y)
        L2 = np.sum( (y-yMean)**2)
        return(L2)
    def _bestSplit(self,X,y,feature):
        RSS = np.inf
        bestSplit = np.inf
        XSort = np.unique(X[:,feature].round(self.xDecimals))
        XDiff = (XSort[1:len(XSort)] + XSort[0:len(XSort)-1])/2
        for i in range(XDiff.shape[0]):
            index = np.less(X[:,feature], XDiff[i])
            if not (np.all(index) or np.all(~index)):
                RSS_1 = self._calLRSS(y[index])
                RSS_2 = self._calLRSS(y[~index])
                RSSSplit = RSS_1 + RSS_2
                if RSS > RSSSplit:
                    RSS = RSSSplit
                    bestSplit = XDiff[i]
        return (bestSplit, RSS)
    def _ComputeValue(self,y):
        return(np.sum(y)/len(y))
    def _chooseFeature(self,X,y):
        G         = np.inf*np.ones(X.shape[1]) #*\label{code:RF:0}
        bestSplit = np.zeros(X.shape[1])
        if self.n == 0: #*\label{code:RF:2}
            feature = np.arange(X.shape[1])
        elif self.n == -1:
            feature = np.random.choice(X.shape[1],int(np.sqrt(X.shape[1])),replace=False)
        else:
            feature = np.random.choice(X.shape[1],self.n,replace=False)
        for i in feature: #*\label{code:RF:3}
            ( bestSplit[i] , G[i] ) = self._bestSplit(X,y,i)
        smallest = np.argmin(G) #*\label{code:RF:4}
        return (G[smallest], bestSplit[smallest],smallest)
    def __init__(self,n = 0, threshold = 10**-8, xDecimals = 8, minLeafNodeSize=3):
        self.n = n
        self.bTree = None
        self.threshold = threshold
        self.xDecimals = xDecimals
        self.minLeafNodeSize = minLeafNodeSize
    def _GenTree(self,X,y,parentNode,branch):
        commonValue = self._ComputeValue(y)
        initG = self._calLRSS(y)
        if  initG < self.threshold or X.shape[0] <= self.minLeafNodeSize:
            self.bTree.addNode(parentNode,branch,commonValue)
            return()   
           
        (G, bestSplit ,chooseA) = self._chooseFeature(X,y)
        if  G  > initG :
            self.bTree.addNode(parentNode,branch,commonValue)
            return()   
       
        if parentNode is None:
            self.bTree = tree(chooseA, bestSplit, '<')
            myNo = 0
        else:
            myNo = self.bTree.addNode(parentNode,branch,bestSplit,operator='<',varNo=chooseA)
        index = np.less(X[:,chooseA],bestSplit)
        XTrue  = X[index,:]
        yTrue  = y[index]
        XFalse = X[~index,:]
        yFalse = y[~index]
               
        if XTrue.shape[0] > self.minLeafNodeSize:
            self._GenTree(XTrue,yTrue,myNo,True)
        else:
            commonValue = self._ComputeValue(yTrue) 
            self.bTree.addNode(myNo,True,commonValue)
        if XFalse.shape[0] > self.minLeafNodeSize:
            self._GenTree(XFalse,yFalse,myNo,False)
        else:
            commonValue = self._ComputeValue(yFalse)
            self.bTree.addNode(myNo,False,commonValue)
        return()
    def fit(self, X,y):
        self._GenTree(X,y,None,None)
   
    def predict(self, X):
        return(self.bTree.eval(X))
   
    def decision_path(self, X):
        return(self.bTree.trace(X))
       
    def weightedPathLength(self,X):
        return(self.bTree.weightedPathLength(X))
       
    def numberOfLeafs(self):
        return(self.bTree.numberOfLeafs())
       
if __name__ == '__main__':       
    np.random.seed(42)
    numberOfSamples = 10000
    X = np.random.rand(numberOfSamples,2)
    Y = ( np.sin(2*np.pi*X[:,0]) + np.cos(np.pi*X[:,1])) * np.exp(1 -X[:,0]**2 -X[:,1]**2 )
   
    MainSet = np.arange(0,X.shape[0])
    Trainingsset = np.random.choice(X.shape[0], int(0.8*X.shape[0]), replace=False)
    Testset = np.delete(MainSet,Trainingsset)
   
    regressionError = np.zeros(5)
    for i in range(5):
        errorRate = 0.05*i
        errorFactor = 1 + 2*(np.random.rand(Trainingsset.shape[0]) - 0.5)*errorRate
        XTrain = X[Trainingsset,:]
        yTrain = Y[Trainingsset] * errorFactor
        XTest = X[Testset,:]
        yTest = Y[Testset]
       
        myTree = bRegressionTree(xDecimals=3)
        myTree.fit(XTrain,yTrain)
        yPredict = myTree.predict(XTest)
        yDiff = np.abs(yPredict - yTest)
        regressionError[i] = np.mean(yDiff)
   
    import matplotlib.pyplot as plt
    fig1 = plt.figure(1)
    ax = fig1.add_subplot(1,1,1)
    x = np.arange(0,0.25,0.05)
    ax.plot(x,regressionError,'o-')
    ax.set_xlabel('% Noise')
    ax.set_ylabel('Mean Absolute Error')
   
    from mpl_toolkits.mplot3d import Axes3D
    from matplotlib import cm
    fig2 = plt.figure(2)
    ax = fig2.add_subplot(1,1,1, projection='3d')
    ax.scatter(XTest[:,0],XTest[:,1],yPredict,alpha=0.6,c =yPredict, cmap=cm.jet)
    ax.set_xlabel('x[0]')
    ax.set_ylabel('x[1]')
    ax.set_zlabel('yPredict')
    plt.show()
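    # A hedged usage sketch (my addition): with n != 0, _chooseFeature only
    # evaluates a random subset of the features at every split; n=-1 samples
    # sqrt(number of features), the usual random-forest default.
    rfTree = bRegressionTree(n=-1, xDecimals=3)
    rfTree.fit(XTrain, yTrain)
    print(np.mean(np.abs(rfTree.predict(XTest) - yTest)))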


::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
import numpy as np
from CARTRegressionTree import bRegressionTree
f = open("hourCleanUp.csv")
header = f.readline().rstrip('\n')  # skip the header
featureNames = header.split(',')
dataset = np.loadtxt(f, delimiter=",")
f.close()
X = dataset[:,0:13]
Y = dataset[:,15]
#X = np.delete(X,6, axis=1)
index = np.flatnonzero(X[:,8]==4)
X = np.delete(X,index, axis=0)
Y = np.delete(Y,index, axis=0)
np.random.seed(42)
MainSet = np.arange(0,X.shape[0])
Trainingsset = np.random.choice(X.shape[0], int(0.8*X.shape[0]), replace=False)
Testset = np.delete(MainSet,Trainingsset)
XTrain = X[Trainingsset,:]
yTrain = Y[Trainingsset]
XTest = X[Testset,:]
yTest = Y[Testset]
myTree = bRegressionTree(minLeafNodeSize=15,threshold=2)
myTree.fit(XTrain,yTrain)
yPredict = np.round(myTree.predict(XTest))
import matplotlib.pyplot as plt
plt.figure(1)
yDiff = yPredict - yTest
plt.hist(yDiff,22,color='gray')
plt.xlim(-200,200)
plt.title('Error on test data')
plt.figure(2)
plt.hist(yTest,22,color='gray')
plt.title('Test data')
print('Mean deviation: %e ' % (np.mean(np.abs(yDiff))))
plt.show()
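
# A hedged baseline (my addition): predicting the training mean for every test
# sample, to put the tree's mean deviation into perspective.
baselineDiff = np.mean(yTrain) - yTest
print('Baseline mean deviation: %e ' % (np.mean(np.abs(baselineDiff))))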

:::::::::::::::::::::::::::::::::::::::::::::::::::
import numpy as np
def weightedSelfInformation( x ):
    y = 0 if x <= 0 else x*np.log2(x)
    return(y)
def CalConditionalEntropy(y,D,Feature):
    sizeDataBase = D.shape[0]
    D = D.astype(bool)
    TrueFeatureDatabase  = np.sum(D[:,Feature])
    FalseFeatureDatabase = sizeDataBase - TrueFeatureDatabase
    PFeatureTrue  = TrueFeatureDatabase/sizeDataBase
    PFeatureFalse = FalseFeatureDatabase/sizeDataBase
   
    Htrue = 0
    if PFeatureTrue>0:
        P_AB_True  = TrueFeatureDatabase - np.sum(np.logical_and(D[:,Feature],y))
        P_AB_False = TrueFeatureDatabase - P_AB_True
        P_AB_True  = P_AB_True/TrueFeatureDatabase
        P_AB_False = P_AB_False/TrueFeatureDatabase
        Htrue      = PFeatureTrue * (weightedSelfInformation(P_AB_False) + weightedSelfInformation(P_AB_True) )
    Hfalse = 0
    if PFeatureFalse>0:
        P_AB_True  = FalseFeatureDatabase - np.sum(np.logical_and(~D[:,Feature],y))
        P_AB_False = FalseFeatureDatabase - P_AB_True
        P_AB_True  = P_AB_True/FalseFeatureDatabase
        P_AB_False = P_AB_False/FalseFeatureDatabase
        Hfalse     = PFeatureFalse * (weightedSelfInformation(P_AB_False) + weightedSelfInformation(P_AB_True) )
   
    H = -Htrue - Hfalse
    return(H) 
   
dataSet = np.array([[ 1  , 0   ,  0  , 0  , 1 ] , [  0  , 0   ,  0  , 0  , 0 ] ,
                    [ 0  , 0   ,  1  , 0  , 1 ] , [  0  , 0   ,  0  , 0  , 0 ] ,
                    [ 1  , 0   ,  1  , 0  , 1 ] , [  0  , 0   ,  1  , 1  , 1 ] ,
                    [ 1  , 0   ,  0  , 1  , 0 ] , [  1  , 1   ,  0  , 0  , 0 ] ,
                    [ 1  , 1   ,  1  , 0  , 0 ] ])
x = dataSet[:,0:4]
y = dataSet[:,4]
for i in range(4):
    H = CalConditionalEntropy(y,x,i)
    print(H)
from binaryTree import tree
class ID3BinaryTree:
    def __init__(self):
        self.bTree = None
    def _chooseFeature(self,X,y):
        # compute the conditional entropy of y given each feature
        H = np.zeros(X.shape[1])
        for i in range(len(H)):
            H[i] = CalConditionalEntropy(y,X,i)
        chooseA = np.argmin(H) # pick the feature with the smallest conditional entropy
        return(chooseA)
    def _GenTree(self,X,y,parentNode,branch,A):
        if parentNode is None: # the root node still has to be created
            A = np.arange(X.shape[1])
        else:
            if len(y) == np.sum(y): # only positive cases left?
                self.bTree.addNode(parentNode,branch,True)
                return()
            elif np.sum(y) == 0: # only negative cases left?
                self.bTree.addNode(parentNode,branch,False)
                return()
            commonValue = True if np.sum(y)>len(y)/2 else False
            if X.shape[1] == 0: # no features left to split on?
                self.bTree.addNode(parentNode,branch,commonValue)
                return()
        chooseA = self._chooseFeature(X,y)

        if parentNode is None: # the root node still has to be created
            self.bTree = tree(chooseA, True, '=')
            myNo = 0
        else: # create a new child node in the tree
            myNo = self.bTree.addNode(parentNode,branch,True, operator='=', varNo=A[chooseA])

        # remove the chosen feature from X and from the feature index list A
        index  = np.flatnonzero(X[:,chooseA])
        X = np.delete(X,chooseA,axis=1)
        A = np.delete(A,chooseA,axis=0)
        # split X into the true and the false branch
        XTrue  = X[index,:]
        yTrue  = y[index]
        XFalse = np.delete(X,index,axis=0)
        yFalse = np.delete(y,index,axis=0)
        if XTrue.shape[0]>0:
            self._GenTree(XTrue,yTrue,myNo,True,A)
        else:
            self.bTree.addNode(myNo,True,commonValue)
        if XFalse.shape[0]>0:
            self._GenTree(XFalse,yFalse,myNo,False,A)
        else:
            self.bTree.addNode(myNo,False,commonValue)
        return()
    def fit(self, X,y):
        self._GenTree(X,y,None,None,None)
   
    def predict(self, X):
        return(self.bTree.eval(X))
   
    def decisionPath(self, X):
        return(self.bTree.trace(X))
       
    def weightedPathLength(self,X):
        return(self.bTree.weightedPathLength(X))
       
    def numberOfLeafs(self):
        return(self.bTree.numberOfLeafs())
       
   

myTree = ID3BinaryTree()
myTree.fit(x,y)
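
# A quick check (my addition, not in the original): evaluate the fitted ID3
# tree on its own training data and emit the TikZ drawing of the tree.
yPredict = myTree.predict(x)
print(yPredict - y)
print(myTree.bTree.print())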

::::::::::::::::::::::::::::::::::::::::::::::::::
import numpy as np
from CARTRegressionTreeRF import bRegressionTree
class randomForestRegression:
    def __init__(self,noOfTrees=10,threshold = 10**-8, xDecimals = 8, minLeafNodeSize=3, perc=1):
        self.perc = perc
        self.threshold = threshold
        self.xDecimals = xDecimals
        self.minLeafNodeSize = minLeafNodeSize
        self.bTree = []
        self.noOfTrees = noOfTrees
        for i in range(noOfTrees):
            tempTree = bRegressionTree(threshold = self.threshold, xDecimals = self.xDecimals , minLeafNodeSize=self.minLeafNodeSize)
            self.bTree.append(tempTree)
           
    def fit(self,X,y):
        self.samples = []
        for i in range(self.noOfTrees):
            bootstrapSample = np.random.randint(X.shape[0],size=int(self.perc*X.shape[0]))
            self.samples.append(bootstrapSample)     #*\label{code:realRF:0}
            bootstrapX = X[bootstrapSample,:]
            bootstrapY = y[bootstrapSample]
            self.bTree[i].fit(bootstrapX,bootstrapY)
   
    def predict(self,X):
        ypredict = np.zeros(X.shape[0])
        for i in range(self.noOfTrees):
            ypredict += self.bTree[i].predict(X)
        ypredict = ypredict/self.noOfTrees
        return(ypredict)
       
if __name__ == '__main__':  
    f = open("hourCleanUp.csv") #*\label{code:realRF:1}
    header = f.readline().rstrip('\n') 
    featureNames = header.split(',')
    dataset = np.loadtxt(f, delimiter=",")
    f.close()
   
    X = dataset[:,0:13]
    Y = dataset[:,15]
   
    index = np.flatnonzero(X[:,8]==4)
    X = np.delete(X,index, axis=0)
    Y = np.delete(Y,index, axis=0)
   
    np.random.seed(42)
    MainSet = np.arange(0,X.shape[0])
    Trainingsset = np.random.choice(X.shape[0], int(0.8*X.shape[0]), replace=False)
    Testset = np.delete(MainSet,Trainingsset)
    XTrain = X[Trainingsset,:]
    yTrain = Y[Trainingsset]
    XTest = X[Testset,:]
    yTest = Y[Testset] #*\label{code:realRF:2}
   
    myForest = randomForestRegression(noOfTrees=24,minLeafNodeSize=5,threshold=2)
    myForest.fit(XTrain,yTrain)
    yPredict = np.round(myForest.predict(XTest))
    yDiff = yPredict - yTest
    print('Mean deviation: %e ' % (np.mean(np.abs(yDiff))))
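    # A hedged comparison (my addition): a single tree with the same settings,
    # to show the averaging effect of the ensemble on the test error.
    singleTree = bRegressionTree(minLeafNodeSize=5, threshold=2)
    singleTree.fit(XTrain, yTrain)
    yDiffSingle = np.round(singleTree.predict(XTest)) - yTest
    print('Single tree mean deviation: %e ' % (np.mean(np.abs(yDiffSingle))))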
               
:::::::::::::::::::::::::::::::::::::::::::::::::



