xxxxxxxxxx6
1
# Load the RL framework, Flux (optimizers), Statistics and Plots.
# (Extraction residue — trailing per-line counters such as
# `ReinforcementLearning3` — has been stripped; those digits were not
# part of the package names.)
begin
    using ReinforcementLearning
    using Flux
    using Statistics
    using Plots
end

To describe the Grid World in Example 3.5, we'll create a distributional environment model. Here "distributional" means that, given a state-action pair, we can predict the possible next state, reward, termination info and the corresponding probability.
xxxxxxxxxx35
1
begin
    """
        nextstep(s::CartesianIndex{2}, a::CartesianIndex{2})

    Distributional transition function for the 5×5 Grid World of
    Example 3.5 (Sutton & Barto). Return a vector of
    `(reward, is_terminated, next_state_index) => probability` pairs —
    here always a single pair with probability 1.0, since the
    environment is deterministic.

    Special states: from (1, 2) every action yields reward 10 and
    teleports to (5, 2); from (1, 4) every action yields reward 5 and
    teleports to (3, 4). Moving off the grid yields reward -1 and
    leaves the state unchanged; any other move yields reward 0.
    """
    function nextstep(s::CartesianIndex{2}, a::CartesianIndex{2})
        if s == CartesianIndex(1, 2)
            r, s′ = 10., CartesianIndex(5, 2)
        elseif s == CartesianIndex(1, 4)
            r, s′ = 5., CartesianIndex(3, 4)
        else
            s′ = s + a
            if 1 ≤ s′[1] ≤ 5 && 1 ≤ s′[2] ≤ 5
                r = 0.
            else
                # Off-grid move: penalize and stay in place.
                r = -1.
                s′ = s
            end
        end
        # Episodes never terminate in this task, hence `false`;
        # states are exposed to the agent as linear indices 1..25.
        [(r, false, LinearIndices((5, 5))[s′]) => 1.0]
    end

    # The four grid moves, expressed as Cartesian offsets.
    ACTIONS = (
        CartesianIndex(-1, 0),
        CartesianIndex(1, 0),
        CartesianIndex(0, 1),
        CartesianIndex(0, -1),
    )

    # Stateless distributional environment model; all dynamics live in
    # `nextstep`.
    struct GridWorldModel <: AbstractEnvironmentModel
    end

    # Functor interface: map the linear state index and action index
    # back to Cartesian form and delegate to `nextstep`.
    function (m::GridWorldModel)(s, a)
        nextstep(CartesianIndices((5, 5))[s], ACTIONS[a])
    end

    RLBase.state_space(m::GridWorldModel) = Base.OneTo(5 * 5)
    RLBase.action_space(m::GridWorldModel) = Base.OneTo(length(ACTIONS))
end
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
1.0
xxxxxxxxxx1
1
V = TabularVApproximator(;n_state=25, opt=Descent(1.0))3.30943
1.52199
0.0512075
-0.973217
-1.85733
8.78964
2.99266
0.738502
-0.435172
-1.34491
4.42795
2.25045
0.673411
-0.354592
-1.22898
5.32267
1.90786
0.358465
-0.585334
-1.42265
1.49249
0.54769
-0.40287
-1.18281
-1.97492
1.0
xxxxxxxxxx6
1
# Iterative policy evaluation of the uniform random policy (4 equally
# likely actions) under the GridWorldModel, discount factor γ = 0.9.
# (Extraction residue had fused a trailing counter onto the last
# keyword, reading `γ=0.96`; the intended value from the book's
# Example 3.5 is 0.9.)
policy_evaluation!(
    V = V,
    π = RandomPolicy(Base.OneTo(4)),
    model = GridWorldModel(),
    γ = 0.9,
)
3.30943 8.78964 4.42795 5.32267 1.49249
1.52199 2.99266 2.25045 1.90786 0.54769
0.0512075 0.738502 0.673411 0.358465 -0.40287
-0.973217 -0.435172 -0.354592 -0.585334 -1.18281
-1.85733 -1.34491 -1.22898 -1.42265 -1.97492xxxxxxxxxx1
1
table = reshape(V.table, 5, 5)xxxxxxxxxx1
1
heatmap(table,yflip=true)