import numpy as np
from QLearningEnv import boxEnvironment, robotAgent
import keras
from keras.models import Sequential
from keras.layers import Dense
from keras.regularizers import l2
class learningAgent():
def __init__(self,tau=1, memoryMax= 1000000, alpha=0.5, gamma=0.7, updateFeq = 500, badMemory= 3000):
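# Hyperparameters: tau is the softmax temperature for action selection, alpha the
# learning rate, gamma the discount factor, updateFeq the number of steps between
# Q-network updates, and badMemory caps how many strongly negative experiences are
# replayed per update. The discrete action set self.a holds the three angular
# velocities -2, 0 and 2 the agent can choose from.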
self.alpha = alpha
self.gamma = gamma
self.tau = tau
self.a = np.array([-2,0,2])
self.updateFeq = updateFeq
self.memoryMax = memoryMax
self.memoryCounter = -1
self.badMemory = badMemory
self.initPhase = True
def setSensors(self, number=5):
self.distanceSensorValue= np.zeros(max(number,5))
self.dirtSensor = np.array([0])
if self.memoryCounter < 0:
    self.MemoryStates = np.inf*np.ones( (self.memoryMax,number+2) )
    self.MemoryActions = np.inf*np.ones(self.memoryMax)
    self.FeedbackDirt = np.inf*np.ones(self.memoryMax)
    self.FeedbackCol = np.zeros(self.memoryMax)
    self.Rewards = np.zeros(self.memoryMax)
def _dumpSensorsData(self):
if self.memoryCounter < 1:
    dDiff = 0
else:
    dDiff = self.distanceSensorValue[2]/10 - self.MemoryStates[self.memoryCounter-1,2]
    dDiff = dDiff*10
temp = np.hstack( (self.distanceSensorValue/10,self.dirtSensor*10**1,dDiff) )
self.MemoryStates[self.memoryCounter,:] = temp
def _dumpActionData(self,w):
self.MemoryActions[self.memoryCounter] = w
def collisionFeedback(self, yesOrNo):
self.FeedbackCol[self.memoryCounter] = int(yesOrNo)
if np.min(self.MemoryStates[self.memoryCounter,:]) > 0.005:
self.FeedbackCol[self.memoryCounter] = False
def dirtCleaned(self,dirt):
if self.memoryCounter < 0: return
self.FeedbackDirt[self.memoryCounter] = dirt
def calcReward(self):
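# Reward shaping: a small positive base reward per step, a large bonus proportional
# to the dirt cleaned, a penalty for collisions, and exponential penalties for small
# distances straight ahead and on the side towards which the agent is turning.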
self.Rewards[self.memoryCounter] = 0.05
self.Rewards[self.memoryCounter] += 10**4*self.FeedbackDirt[self.memoryCounter]
self.Rewards[self.memoryCounter] -= self.FeedbackCol[self.memoryCounter]*10
self.Rewards[self.memoryCounter] -= 10*np.exp(- 5*self.distanceSensorValue[2])
temp1 = 10*np.exp(- 5*self.distanceSensorValue[4])
temp2 = 10*np.exp(- 5*self.distanceSensorValue[0])
if self.MemoryActions[self.memoryCounter] > 0.001 :
self.Rewards[self.memoryCounter] -= temp1
elif self.MemoryActions[self.memoryCounter] < -0.001:
self.Rewards[self.memoryCounter] -= temp2
else:
self.Rewards[self.memoryCounter] -= (temp1 + temp2)/2
def _initQ(self):
XTrain = np.hstack( (self.MemoryStates[0:self.memoryCounter,:],self.MemoryActions[0:self.memoryCounter,None]))
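# The Q-function is approximated by a small fully connected network that maps the
# concatenated (state, action) vector to a single Q-value; this first fit only
# regresses on the scaled immediate rewards to obtain a rough initial estimate.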
self.Q = Sequential()
self.Q.add(Dense(256, input_dim=XTrain.shape[1], activation='relu',kernel_regularizer=l2(0.0001)))
self.Q.add(Dense(128, activation='relu',kernel_regularizer=l2(0.0001)))
self.Q.add(Dense(64, activation='relu',kernel_regularizer=l2(0.0001)))
self.Q.add(Dense(1, activation='linear'))
clist = [keras.callbacks.EarlyStopping(monitor='loss',patience=5,verbose=0,mode='auto')]
self.Q.compile(loss='mean_squared_error', optimizer='adam')
self.Q.fit(XTrain,self.alpha*self.Rewards[0:XTrain.shape[0]], epochs=50, callbacks=clist, batch_size=5, verbose=False)
def _updateQ(self):
learnSet = np.arange(0,self.memoryCounter-1)
index1 = np.flatnonzero(self.Rewards[learnSet] < -0.9)
if index1.shape[0] > self.badMemory:
    index = np.random.choice(index1.shape[0], self.badMemory, replace=False)
    index1 = index1[index]
samplesleft = min(index1.shape[0]*5,self.memoryCounter-index1.shape[0]-1)
index2 = np.random.choice(self.memoryCounter-1, samplesleft, replace=False)
learnSet = np.hstack( (index1[None,:],index2[None,:]) )
learnSet = np.reshape(learnSet, learnSet.shape[1])
X = np.hstack( (self.MemoryStates[learnSet,:],self.MemoryActions[learnSet,None]))
Qalt = np.squeeze(self.Q.predict(X))
QChooose = np.zeros( (Qalt.shape[0],len(self.a)) )
for i in range(len(self.a)): #*\label{code:qlearning:agent:0}
a = self.a[i]*np.ones_like(self.MemoryActions[learnSet,None])
Xtemp = np.hstack( (self.MemoryStates[learnSet+1,:],a))
QChooose[:,i] = np.squeeze(self.Q.predict(Xtemp)) #*\label{code:qlearning:agent:1}
Qmax = np.max(QChooose,axis=1) #*\label{code:qlearning:agent:2}
r = self.Rewards[learnSet]
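# Q-learning target: blend the old estimate with r + gamma * max_a Q(s', a),
# weighted by the learning rate alpha.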
QNeu = (1-self.alpha)*Qalt+self.alpha*(r + self.gamma*Qmax) #*\label{code:qlearning:agent:3}
self.Q.fit(X,QNeu, epochs=3, batch_size=5, verbose=False) #*\label{code:qlearning:agent:4}
def _softmax(self,Qa):
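# Boltzmann (softmax) exploration: a low temperature tau makes the choice almost
# greedy, a high temperature approaches a uniform random choice.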
return np.exp(Qa/self.tau) / np.sum(np.exp(Qa/self.tau))
def chooseAction(self,Qa):
toChoose = np.arange(0,len(Qa))
pW = self._softmax(Qa)
if np.any(np.isnan(pW)) or np.any(np.isinf(pW)):
choosenA = np.random.randint(0,len(Qa))
else:
choosenA = np.random.choice(toChoose,replace=False, p=pW)
return(choosenA)
def changeAngularVelocity(self):
self.calcReward() #*\label{code:qlearning:agent:112}
self.memoryCounter += 1
if self.memoryCounter%self.updateFeq == 0 and not self.initPhase:
self._updateQ()
if self.memoryCounter > min(500,self.memoryMax-1) and self.initPhase:
self.initPhase = False
self._initQ()
self._updateQ()
self._dumpSensorsData() #*\label{code:qlearning:agent:13}
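# During the initial phase the agent follows a randomised reactive obstacle-avoidance
# rule to fill the replay memory; afterwards actions are chosen from the learned
# Q-network via softmax sampling.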
if self.initPhase: #*\label{code:qlearning:agent:5}
w = 4*(np.random.rand(1) - 0.5)
if self.distanceSensorValue[1] < 0.75 or self.distanceSensorValue[3] < 0.75:
w = 1.0 if self.distanceSensorValue[1] < self.distanceSensorValue[3] else -1.0
if self.distanceSensorValue[0] < 0.5 or self.distanceSensorValue[4] < 0.5:
w = 1.5 if self.distanceSensorValue[0] < self.distanceSensorValue[4] else -1.5
if self.distanceSensorValue[2] < 0.5 : w = -2 #*\label{code:qlearning:agent:6}
else:
Qa = np.zeros(len(self.a))  # float array; np.zeros_like(self.a) would truncate the Q-values to integers
j = self.MemoryStates[self.memoryCounter,:].shape[0]
x = np.zeros( (1, j+1) )
x[0,0:j] = self.MemoryStates[self.memoryCounter,:] #*\label{code:qlearning:agent:7}
for i in range(len(self.a)):
x[0,self.MemoryStates[self.memoryCounter,:].shape[0]] = self.a[i] #*\label{code:qlearning:agent:8}
Qa[i] = self.Q.predict(x)[0] #*\label{code:qlearning:agent:9}
ca = self.chooseAction(Qa) #*\label{code:qlearning:agent:10}
w = self.a[ca] #*\label{code:qlearning:agent:11}
self._dumpActionData(w) #*\label{code:qlearning:agent:12}
return(w)
def CoopData(self,d,theta1,theta2):
pass
# def addOldMemory(self,States,Actions,FeedbackDirt,FeedbackCol,counter):
# self.setSensors(number=States.shape[1]-1)
# if self.memoryCounter < 0 : self.memoryCounter = 0
# self.MemoryStates[self.memoryCounter:self.memoryCounter+counter,:] = np.copy(States[0:counter,:])
# self.MemoryActions[self.memoryCounter:self.memoryCounter+counter] = np.copy(Actions[0:counter])
# self.FeedbackDirt[self.memoryCounter:self.memoryCounter+counter] = np.copy(FeedbackDirt[0:counter])
# self.FeedbackCol[self.memoryCounter:self.memoryCounter+counter] = np.copy(FeedbackCol[0:counter])
# self.memoryCounter += counter-1
# self.calcReward(recomputeAll=True)
if __name__ == '__main__':
np.random.seed(42)
jack = learningAgent(tau=0.3, gamma=0.8, alpha=0.4,badMemory= 10000)
jack.updateFeq = 500
for i in range(7):
print('new world:',i)
myNewBox = boxEnvironment()
myNewBox.addRobot(jack)
myNewBox.startSim(300)
myNewBox.evaluation()
myNewBox.plotScene()
:::::::::::::::::::::::::::::::::::::::::::
import numpy as np
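# Transition matrix of a small Markov chain with nine states: entry delta[i,j] is the
# probability of moving to state i when the chain is in state j, so every column sums
# to one and delta @ s propagates a state distribution s by one step.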
delta = np.zeros( (9,9) )
delta[3,6] = delta[3,0] = delta[2,1] = delta[1,0] = delta[0,1] = delta[0,3] = 0.5
delta[6,3] = delta[5,8] = delta[7,6] = delta[7,8] = 0.5
delta[4,4] = delta[2,2] = 1
delta[8,7] = delta[8,5] = delta[6,7] = delta[4,5] = delta[4,7] = delta[2,5] = 1/3
s = np.zeros( (9,1) )
s[0] = 1
sNew = np.linalg.matrix_power(delta,5)@s
sNew = np.linalg.matrix_power(delta,100)@s
print(sNew)
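# States 2 and 4 are absorbing (delta[2,2] = delta[4,4] = 1), so the limit
# distribution depends on the starting state; starting from state 6 below gives a
# different result than starting from state 0 above.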
s = np.zeros( (9,1) )
s[6] = 1
sNew = np.linalg.matrix_power(delta,100)@s
print(sNew)
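# Sanity check (not part of the original listing): every column of delta should sum
# to one, otherwise delta @ s would not remain a probability distribution.
print(delta.sum(axis=0))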
:::::::::::::::::::::::::::::::::::::::::::::::
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.animation as animation
from matplotlib.patches import Rectangle, Arrow
class robotAgent():
def __init__(self):
pass
def CoopData(self,X,Y,nX,nY):
pass
def collisionFeedback(self, yesOrNo):
pass
def dirtCleaned(self,dirt):
pass
def setSensors(self, number=5):
self.distanceSensorValue= np.zeros(max(number,5))
self.dirtSensor = np.array([0])
def changeAngularVelocity(self):
w = 0 if self.dirtSensor > 0.001 else 2*np.sign(np.random.rand(1)-0.5)
if self.distanceSensorValue[1] < 3.0 or self.distanceSensorValue[3] < 3.0:
w = 1.0 if self.distanceSensorValue[1] < self.distanceSensorValue[3] else -1.0
if self.distanceSensorValue[0] < 1.0 or self.distanceSensorValue[4] < 1.0:
w = 1.5 if self.distanceSensorValue[0] < self.distanceSensorValue[4] else -1.5
if self.distanceSensorValue[2] < 1.25 : w = -2
return(w)
class boxEnvironment():
def __init__(self):
self.rAgents=[] #*\label{code:qlearning:ann:1}
self.rAgentsX=[]; self.rAgentsY=[]
self.rAgentsTheta=[]; self.rAgentsW=[]
self.rAgentsR=[]
self.rAgentsCol=[]
self.rAgentsDirt=[]
self.v = [] #*\label{code:qlearning:ann:2}
self.dt = 0.01 #*\label{code:qlearning:ann:3}
self.xBox = [-4, 4] #*\label{code:qlearning:ann:4}
self.yBox = [-3, 3] #*\label{code:qlearning:ann:5}
self.sensors = np.array([-np.pi/6,-np.pi/3,0,np.pi/3,np.pi/6 ]) #*\label{code:qlearning:ann:sensors}
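# Five distance sensor directions, given as angles relative to the robot's heading;
# index 2 is the sensor pointing straight ahead.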
x = np.arange(self.xBox[0]+0.05,self.xBox[1],0.05) #*\label{code:qlearning:dirt:1}
y = np.arange(self.yBox[0]+0.05,self.yBox[1],0.05)
xv, yv = np.meshgrid(x, y)
self.xv = np.resize(xv,(xv.size,)); self.yv = np.resize(yv,(yv.size,))
self.dirt = np.ones_like(self.xv) #*\label{code:qlearning:dirt:2}
tisch = robotAgent() #*\label{code:qlearning:ann:6}
self.addRobot(tisch, r=1.75)
self.rAgentsX[0] = 0
self.rAgentsY[0] = 0
self.v[0] = 0 #*\label{code:qlearning:ann:7}
def addRobot(self,robot, v=0.5, r=0.25):
robot.setSensors(len(self.sensors))
self.rAgents.append(robot) #*\label{code:qlearning:ann:8}
self.rAgentsR.append(r)
self.v.append(v) #*\label{code:qlearning:ann:11}
self.rAgentsCol.append(0) #*\label{code:qlearning:ann:9}
self.rAgentsY.append( (len(self.rAgents)-1)*0.5 - 3.0 ) #*\label{code:qlearning:ann:10}
lr = 1 if len(self.rAgents)%2 else -1
self.rAgentsX.append(lr*3) #*\label{code:qlearning:ann:12}
self.rAgentsTheta.append(2*np.pi*np.random.rand(1)) #*\label{code:qlearning:ann:62}
self.rAgentsDirt.append(0)
def _moveRobots(self):
for i in range(len(self.rAgents)):
w = np.clip(self.rAgents[i].changeAngularVelocity(),-2,2) #*\label{code:qlearning:ann:13}
self.rAgentsX[i] += self.dt*self.v[i] * np.cos(self.rAgentsTheta[i]) #*\label{code:qlearning:ann:14}
self.rAgentsY[i] += self.dt*self.v[i] * np.sin(self.rAgentsTheta[i])
self.rAgentsTheta[i] += self.dt*w #*\label{code:qlearning:ann:15}
if self.rAgentsTheta[i] > 2*np.pi : self.rAgentsTheta[i] -= 2*np.pi #*\label{code:qlearning:ann:16}
if self.rAgentsTheta[i] < 0 : self.rAgentsTheta[i] += 2*np.pi #*\label{code:qlearning:ann:17}
def _collisionCheck(self):
self._checkWalls()
self._checkRobots()
def _checkWalls(self):
for i in range(len(self.rAgents)):
checkCol = False
if self.rAgentsX[i] - self.rAgentsR[i] < self.xBox[0]:
checkCol = True
self.rAgentsX[i] = self.xBox[0] + self.rAgentsR[i] + 0.01
elif self.rAgentsX[i] + self.rAgentsR[i] > self.xBox[1]:
checkCol = True
self.rAgentsX[i] = self.xBox[1] - self.rAgentsR[i] - 0.01
elif self.rAgentsY[i] - self.rAgentsR[i] < self.yBox[0]:
checkCol = True
self.rAgentsY[i] = self.yBox[0] + self.rAgentsR[i] + 0.01
elif self.rAgentsY[i] + self.rAgentsR[i] > self.yBox[1]:
checkCol = True
self.rAgentsY[i] = self.yBox[1] - self.rAgentsR[i] - 0.01
if checkCol:
self.rAgents[i].collisionFeedback(True) #*\label{code:qlearning:ann:18}
self.rAgentsCol[i] += 1 #*\label{code:qlearning:ann:19}
self.rAgentsTheta[i] += np.pi #*\label{code:qlearning:ann:20}
if self.rAgentsTheta[i] > 2*np.pi : self.rAgentsTheta[i] -= 2*np.pi
def _checkRobots(self):
self.minDist = -1*np.ones(len(self.rAgents), dtype=int) #*\label{code:qlearning:ann:addon:0}
self.minDistVal = np.inf*np.ones(len(self.rAgents)) #*\label{code:qlearning:ann:addon:1}
for i in range(len(self.rAgents)):
for j in range(i+1,len(self.rAgents)): #*\label{code:qlearning:ann:23}
dist = (self.rAgentsX[i] - self.rAgentsX[j])**2 #*\label{code:qlearning:ann:21}
dist += (self.rAgentsY[i] - self.rAgentsY[j])**2 #*\label{code:qlearning:ann:22}
if self.minDistVal[j] > dist and i>0: #*\label{code:qlearning:ann:addon:2}
self.minDistVal[i] = dist
self.minDistVal[j] = dist
self.minDist[i] = j
self.minDist[j] = i
if np.sqrt(dist) < self.rAgentsR[i] + self.rAgentsR[j]:
mylist = [i,j]
for k in mylist:
self.rAgents[k].collisionFeedback(True) #*\label{code:qlearning:ann:24}
self.rAgentsCol[k] += 1
self.rAgentsTheta[k] += 0.75*np.pi
if self.rAgentsTheta[k] > 2*np.pi : self.rAgentsTheta[k] -= 2*np.pi
self.rAgentsX[k] += 2*self.dt*self.v[k] * np.cos(self.rAgentsTheta[k])
self.rAgentsY[k] += 2*self.dt*self.v[k] * np.sin(self.rAgentsTheta[k])
def _setDistanceSensors(self):
for i in range(len(self.rAgents)): #*\label{code:qlearning:ann:27}
self.rAgents[i].distanceSensorValue[:] = np.inf #*\label{code:qlearning:ann:25}
s = np.array([self.rAgentsX[i],self.rAgentsY[i]]) #*\label{code:qlearning:ann:26}
for j in range(len(self.rAgents)): #*\label{code:qlearning:ann:28}
if i == j : continue #*\label{code:qlearning:ann:29}
m = np.array([self.rAgentsX[j],self.rAgentsY[j]]) #*\label{code:qlearning:ann:30}
dist = self._intersectionR2R(s,self.rAgentsTheta[i],m,self.rAgentsR[j]) #*\label{code:qlearning:ann:31}
for k in range(len(dist)): #*\label{code:qlearning:ann:32}
    if dist[k] - self.rAgentsR[i] < self.rAgents[i].distanceSensorValue[k]:
        self.rAgents[i].distanceSensorValue[k] = dist[k] - self.rAgentsR[i]
dist = self._sensorR2Wall(s,self.rAgentsTheta[i])
for k in range(len(dist)):
    if dist[k] - self.rAgentsR[i] < self.rAgents[i].distanceSensorValue[k]:
        self.rAgents[i].distanceSensorValue[k] = dist[k] - self.rAgentsR[i]
def _rotateVector(self,phi):
return [np.cos(phi), np.sin(phi)]
def _sensorR2Wall(self,s,theta):
dist = []
for k in self.sensors:
d = self._rotateVector(theta+k)
if d[0] < 0: tempX = 1/d[0]*(self.xBox[0] - s[0])
elif d[0] > 0: tempX = 1/d[0]*(self.xBox[1] - s[0])
else: tempX = np.inf
if d[1] < 0: tempY = 1/d[1]*(self.yBox[0] - s[1])
elif d[1] > 0: tempY = 1/d[1]*(self.yBox[1] - s[1])
else: tempY = np.inf
dist.append(min(tempX,tempY))
return(dist)
def _intersectionR2R(self,s,theta,m,r):
dist = []
for k in self.sensors:
d = self._rotateVector(theta+k)
a = s[0] - m[0]
b = s[1] - m[1]
term1 = a*d[0]+b*d[1]
Discriminant = term1**2 - (a**2+b**2-r**2)
if Discriminant >= 0:
lambda2 = -term1-np.sqrt(Discriminant)
if lambda2 > 0: dist.append(lambda2)
else: dist.append(np.inf)
else: dist.append(np.inf)
return(dist)
def _setDirtSensor(self):
for i in range(len(self.rAgents)):
d = self._rotateVector(self.rAgentsTheta[i])
x = self.rAgentsX[i] + 0.25*d[0] #*\label{code:qlearning:ann:33}
y = self.rAgentsY[i] + 0.25*d[1] #*\label{code:qlearning:ann:34}
t = (self.xv-x)**2 + (self.yv-y)**2 < self.rAgentsR[i]**2 #*\label{code:qlearning:ann:35}
dirtSum = np.sum(self.dirt[t]) #*\label{code:qlearning:ann:36}
self.rAgents[i].dirtSensor = 10*dirtSum/self.dirt.size #*\label{code:qlearning:ann:37}
def cleanDirt(self):
for i in range(len(self.rAgents)):
t = (self.xv-self.rAgentsX[i])**2 + (self.yv-self.rAgentsY[i])**2 < self.rAgentsR[i]**2
dirtSum = np.sum(0.9*self.dirt[t])  # amount of dirt removed under robot i in this step
self.dirt[t] -= 0.9*self.dirt[t]
self.rAgents[i].dirtCleaned(dirtSum/self.dirt.size)
self.rAgentsDirt[i] += dirtSum/self.dirt.size
def _setCoopData(self):
for i in range(len(self.rAgents)):
j = self.minDist[i]
self.rAgents[i].CoopData(self.rAgentsX[i], self.rAgentsY[i], self.rAgentsX[j], self.rAgentsY[j])
def nextTimeStep(self):
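# One simulation step: read distance and dirt sensors, move all robots, resolve
# collisions, exchange cooperation data, clean the dirt under the robots, and let
# the remaining dirt slowly grow back (capped at 1).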
self._setDistanceSensors()
self._setDirtSensor()
self._moveRobots()
self._collisionCheck()
self._setCoopData()
self.cleanDirt()
self.dirt += self.dt/300; self.dirt = np.minimum(self.dirt,1)
def startSim(self,EndTime):
for i in range(len(self.rAgents)):
    t = (self.xv-self.rAgentsX[i])**2 + (self.yv-self.rAgentsY[i])**2 < self.rAgentsR[i]**2
    self.dirt[t] = 0  # reconstructed: start with a clean patch beneath each robot's footprint
self.EndTime = EndTime
tau = np.arange(0,EndTime,self.dt)
self.protocol = []
for i in range(len(self.rAgents)):
self.protocol.append( np.zeros( (tau.shape[0],5) ) )
self.rAgents[i].dirtSensor=0
self.protocol.append(np.zeros(tau.shape[0]))
for j in range(len(tau)):
self.nextTimeStep()
for i in range(len(self.rAgents)):
self.protocol[i][j,0] = self.rAgentsX[i]
self.protocol[i][j,1] = self.rAgentsY[i]
self.protocol[i][j,2] = self.rAgentsCol[i]
self.protocol[i][j,3] = self.rAgentsTheta[i]
self.protocol[i][j,4] = self.rAgentsDirt[i]
self.protocol[len(self.rAgents)][j] = np.sum(self.dirt)/self.dirt.size
def evaluation(self):
lines = ["-","--","-.",":"]
tau = np.arange(0,self.EndTime,self.dt)
fig = plt.figure()
ax = fig.add_subplot(1,1,1)
for i in range(1,len(self.rAgents)):
mystr = 'Agent No.'+str(i)
ax.plot(tau,self.protocol[i][:,4],label=mystr,linestyle=lines[i%4],color='k')
ax.plot(tau,self.protocol[len(self.rAgents)],label='Dirt (total)', color='r', lw=2.0)
handles, labels = ax.get_legend_handles_labels()
ax.legend(handles, labels)
ax.set_title('Dirtying')
fig = plt.figure()
ax = fig.add_subplot(1,1,1)
for i in range(1,len(self.rAgents)):
mystr = 'Agent No.'+str(i)
ax.plot(tau,self.protocol[i][:,2],label=mystr,linestyle=lines[i%4],color='k')
handles, labels = ax.get_legend_handles_labels()
ax.legend(handles, labels)
ax.set_title('Collisions')
from matplotlib import cm
fig = plt.figure()
ax = fig.add_subplot(111)
x = np.arange(self.xBox[0]+0.05,self.xBox[1],0.05)
y = np.arange(self.yBox[0]+0.05,self.yBox[1],0.05)
xv, yv = np.meshgrid(x, y)
z = np.resize(self.dirt, xv.shape)
ax.pcolormesh(xv, yv, z, cmap=cm.Greys)
plt.show()
def _initScene(self):
self.toDraw =[]
borders = Rectangle((self.xBox[0], self.yBox[0]), self.xBox[1] - self.xBox[0],
self.yBox[1] - self.yBox[0], fill=True, facecolor='white', edgecolor='k' )
self.toDraw.append(borders)
table = plt.Circle((self.rAgentsX[0],self.rAgentsY[0]), self.rAgentsR[0], fc='k')
self.toDraw.append(table)
for i in range(1,len(self.rAgents)):
agents = plt.Circle((self.rAgentsX[i],self.rAgentsY[i]), self.rAgentsR[i], fc=np.random.rand(3))
self.toDraw.append(agents)
return []
def _updateScene(self, i):
i = min(i*20 + self.startFrame, len(self.protocol[0][:,0]) - 1)
dirtyTrick = Rectangle((-4.5,-3.5), 9, 7, fill=True, facecolor='white', edgecolor=None )
self.ax.add_patch(dirtyTrick)
self.ax.add_patch(self.toDraw[0])
for j in range(1,len(self.rAgents)+1):
x = self.protocol[j-1][i,0]; y = self.protocol[j-1][i,1]
self.toDraw[j].center = (x,y)
self.ax.add_patch(self.toDraw[j])
for j in range(2,len(self.rAgents)+1):
x = self.protocol[j-1][i,0]; y = self.protocol[j-1][i,1]
sensors = np.array([-np.pi/6,-np.pi/3,0,np.pi/3,np.pi/6 ])
for k in sensors:
d = self._rotateVector(self.protocol[j-1][i,3]+k)
lookDir = Arrow(x,y,0.5*d[0],0.5*d[1], width=0.2)
self.ax.add_patch(lookDir)
return []
def plotScene(self, startFrame=0):
self.startFrame = startFrame
self.fig = plt.figure()
self.ax = plt.axes(xlim=(-4.5, 4.5), ylim=(-3.5, 3.5))
self.fig.show()
self.anim = animation.FuncAnimation(self.fig, self._updateScene,
init_func=self._initScene, frames=len(self.protocol[0][:,0]),
interval=20, repeat=False)
self.fig.show()
if __name__ == '__main__':
np.random.seed(42)
myNewBox = boxEnvironment()
for i in range(1):
reaktiverAgent = robotAgent()
myNewBox.addRobot(reaktiverAgent)
from time import perf_counter  # time.clock was removed in Python 3.8
t1 = perf_counter()
myNewBox.startSim(300)
t2 = perf_counter()
print(t2-t1)
myNewBox.evaluation()
#myNewBox.plotScene()
::::::::::::::::::::::::::::::::::::::::::::::
import numpy as np
class environment:
def __init__(self):
self.envData = -1*np.ones( (5,4) ) #*\label{code:robocliff:1}
self.envData[1,3] = self.envData[2,3] = self.envData[3,3] = -1000 #*\label{code:robocliff:2}
self.envData[4,3] = 1000 #*\label{code:robocliff:3}
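# 5x4 grid world: every step costs -1, the cliff cells (1,3), (2,3), (3,3) cost
# -1000, the goal cell (4,3) yields +1000, and trying to leave the grid costs -10.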
self.dogX = 0; self.dogY = 3 #*\label{code:robocliff:4}
def setDog(self, x=0, y=3):
self.dogX = x; self.dogY = y #*\label{code:robocliff:8}
def move(self,a):
if a == 0 : self.dogY -= 1 # a = 0 east
if a == 1 : self.dogX += 1 # a = 1 south
if a == 2 : self.dogY += 1 # a = 2 west
if a == 3 : self.dogX -= 1 # a = 3 north
if self.dogY > 3 or self.dogY < 0 or self.dogX > 4 or self.dogX < 0: #*\label{code:robocliff:5}
    r = -10
else:
    r = self.envData[self.dogX,self.dogY]
if self.dogY < 0: self.dogY = 0 #*\label{code:robocliff:6}
if self.dogX < 0: self.dogX = 0
if self.dogX > 4: self.dogX = 4
if self.dogY > 3: self.dogY = 3 #*\label{code:robocliff:7}
return(r)
def sensors(self):
return(self.dogX,self.dogY) #*\label{code:robocliff:8}
class learningDog:
def __init__(self, env,vareps):
self.hatQ = np.zeros( (5,4,4) ) #*\label{code:robocliff:9}
self.env = env
self.vareps = vareps
def _chooseAction(self,x,y):
RonnieSoak = np.random.rand(1)
if RonnieSoak < self.vareps:
a = np.random.randint(4)
else:
localQ = self.hatQ[x,y,:]
a = np.argmax(localQ)
return(a)
def learn(self, gamma, trainCycles):
for i in range(trainCycles):
(x,y) = self.env.sensors()
a = self._chooseAction(x,y)
r = self.env.move(a)
(xN,yN) = self.env.sensors()
localQ = self.hatQ[xN,yN,:]
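# Off-policy Q-learning update (with an implicit learning rate of 1): the target
# uses the maximum Q-value of the next state, regardless of which action the
# epsilon-greedy policy will actually take there.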
self.hatQ[x,y,a] = r + gamma * np.max(localQ) #*\label{code:robocliff:53}
if r == 1000: self.env.setDog() #*\label{code:robocliff:10}
def show(self, x=0, y=3):
self.env.setDog(x,y)
r = -1
counter = 0
while r != 1000 and counter < 100:
counter += 1
(x,y) = self.env.sensors()
a = self._chooseAction(x,y)
r = self.env.move(a)
print(a,r)
if __name__ == '__main__':
np.random.seed(42)
myenv = environment()
robohund = learningDog(myenv,0.1)
robohund.learn(0.4,2000)
print('Start at 0,3')
robohund.show()
print('Start at 3,0')
robohund.show(x=3,y=0)
:::::::::::::::::::::::::::::::::::::::::::::::::
import numpy as np
class environment:
def __init__(self):
self.envData = -1*np.ones( (5,4) )
self.envData[1,3] = self.envData[2,3] = self.envData[3,3] = -1000
self.envData[4,3] = 1000
self.dogX = 0; self.dogY = 3
def setDog(self, x=0, y=3):
self.dogX = x; self.dogY = y
def move(self,a):
if a == 0 : self.dogY -= 1 # a = 0 east
if a == 1 : self.dogX += 1 # a = 1 south,
if a == 2 : self.dogY += 1 # a = 2 west
if a == 3 : self.dogX -= 1 # a = 3 north
if self.dogY > 3 or self.dogY < 0 or self.dogX > 4 or self.dogX < 0:
    r = -10
else:
    r = self.envData[self.dogX,self.dogY]
if self.dogY < 0: self.dogY = 0
if self.dogX < 0: self.dogX = 0
if self.dogX > 4: self.dogX = 4
if self.dogY > 3: self.dogY = 3
return(r)
def sensors(self):
return(self.dogX,self.dogY)
class learningDogSARSA:
def __init__(self, env,vareps):
self.hatQ = np.zeros( (5,4,4) ) # initialise hatQ(s,a)
self.env = env
self.vareps = vareps
def _chooseAction(self,x,y):
RonnieSoak = np.random.rand(1)
if RonnieSoak < self.vareps:
a = np.random.randint(4)
else:
localQ = self.hatQ[x,y,:]
a = np.argmax(localQ)
return(a)
def learn(self, gamma, trainCycles):
for i in range(trainCycles):
(x,y) = self.env.sensors()
a = self._chooseAction(x,y)
r = self.env.move(a)
(xN,yN) = self.env.sensors()
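# On-policy SARSA target: instead of the maximum over all actions, the Q-value of
# the action actually selected by the epsilon-greedy policy in the next state is
# used (again with an implicit learning rate of 1).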
rNext = self.hatQ[xN,yN,self._chooseAction(xN,yN)]
self.hatQ[x,y,a] = r + gamma * rNext
if r == 1000: self.env.setDog()
def show(self, x=0, y=3):
self.env.setDog(x,y)
r = -1
counter = 0
while r != 1000 and counter < 100:
counter += 1
(x,y) = self.env.sensors()
a = self._chooseAction(x,y)
r = self.env.move(a)
print(a,r)
if __name__ == '__main__':
np.random.seed(42)
myenv = environment()
robohund = learningDogSARSA(myenv,0.1)
robohund.learn(0.4,2000)
print('Start at 0,3')
robohund.show()
print('Start at 3,0')
robohund.show(x=3,y=0)
:::::::::::::::::::::::::::::::::::::::::
import numpy as np
import random
def softmax(Q, tau):
return np.exp(Q/tau) / sum(np.exp(Q/tau))
Q=np.array([-10, 2.1, 1.9])
print( softmax(Q,0.1) )
print( softmax(Q,1.0) )
print( softmax(Q,100) )
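# The three printouts illustrate the temperature: for tau = 0.1 the probability mass
# concentrates on the two good actions with a clear preference for the best one, for
# tau = 1 they are weighted almost evenly, and for tau = 100 the distribution is
# close to uniform over all three actions.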
def chooseAction(Q, tau=1):
    p = softmax(Q,tau)  # use the passed temperature instead of a hard-coded 1.0
    toChoose = np.arange(0,len(Q))
    a = int(random.choices(toChoose,weights=p,k=1)[0])
    return(a)
random.seed(42)
counter = np.zeros(len(Q))
for i in range(10000):
counter[chooseAction(Q)] += 1
print(counter)
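# A quick additional check (not in the original listing): with a lower temperature
# the selection becomes much greedier, so most of the 10000 draws fall on the best
# action and the worst action is practically never chosen.
counterGreedy = np.zeros(len(Q))
for i in range(10000):
    counterGreedy[chooseAction(Q, tau=0.1)] += 1
print(counterGreedy)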
:::::::::::::::::::::::::::::::::::::