# Chapter 8.2 Dyna: Integrated Planning, Acting, and Learning
To demonstrate the flexibility of ReinforcementLearning.jl, a `DynaAgent` is also included in the package, and in this notebook we'll explore its performance.
```julia
using ReinforcementLearning
```

## The Maze Environment
In this chapter, the authors introduce a specific maze environment, so let's define it by implementing the interfaces of ReinforcementLearning.jl.
```julia
const LRUD = [
    CartesianIndex(0, -1),  # left
    CartesianIndex(0, 1),   # right
    CartesianIndex(-1, 0),  # up
    CartesianIndex(1, 0),   # down
]
```
```julia
begin
    mutable struct MazeEnv <: AbstractEnv
        walls::Set{CartesianIndex{2}}
        position::CartesianIndex{2}
        start::CartesianIndex{2}
        goal::CartesianIndex{2}
        NX::Int
        NY::Int
    end

    function MazeEnv()
        walls = Set([
            [CartesianIndex(i, 3) for i = 2:4]
            CartesianIndex(5, 6)
            [CartesianIndex(j, 8) for j = 1:3]
        ])
        start = CartesianIndex(3, 1)
        goal = CartesianIndex(1, 9)
        MazeEnv(walls, start, start, goal, 6, 9)
    end

    function (env::MazeEnv)(a::Int)
        p = env.position + LRUD[a]
        if p == env.goal
            env.position = env.goal
        elseif !(p ∈ env.walls)
            # Clamp the move so the agent stays inside the NX × NY grid.
            env.position = CartesianIndex(min(max(p[1], 1), env.NX), min(max(p[2], 1), env.NY))
        end
        nothing
    end
end
```
```julia
RLBase.state_space(env::MazeEnv) = Base.OneTo(env.NX * env.NY)

RLBase.action_space(env::MazeEnv) = Base.OneTo(length(LRUD))

RLBase.reward(env::MazeEnv) = Float64(env.position == env.goal)

# Linearize the 2D position into a single integer state (column-major order).
RLBase.state(env::MazeEnv) = (env.position[2] - 1) * env.NX + env.position[1]

RLBase.is_terminated(env::MazeEnv) = env.position == env.goal

RLBase.reset!(env::MazeEnv) = env.position = env.start
```
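With the interface methods defined, the environment can be sanity-checked before we bring in the Dyna agent. Here is a minimal smoke test, assuming the definitions above and the generic `run(policy, env, stop_condition, hook)` loop with a uniformly random policy:

```julia
# Run a random policy for a few episodes and record the episode lengths (sketch).
env = MazeEnv()
hook = StepsPerEpisode()
run(RandomPolicy(action_space(env)), env, StopAfterEpisode(10), hook)
hook.steps  # steps taken in each of the 10 episodes
```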
```julia
begin
    import Base: *

    # Expand a single cell into an n × n block of cells.
    function extend(p::CartesianIndex{2}, n::Int)
        x, y = Tuple(p)
        [CartesianIndex(n * (x - 1) + i, n * (y - 1) + j) for i = 1:n for j = 1:n]
    end

    # Map a cell of the original maze to the corresponding cell of the scaled maze.
    function remap(p::CartesianIndex{2}, n::Int)
        x, y = Tuple(p)
        CartesianIndex((x - 1) * n + 1, (y - 1) * n + 1)
    end

    # Scale the maze resolution by a factor of n.
    function *(env::MazeEnv, n::Int)
        walls = Set{CartesianIndex{2}}(ww for w in env.walls for ww in extend(w, n))
        start, position, goal = remap(env.start, n), remap(env.position, n), remap(env.goal, n)
        NX, NY = env.NX * n, env.NY * n
        MazeEnv(walls, position, start, goal, NX, NY)
    end
end
```
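The `*` method scales up the maze resolution and will be used later in Example 8.4 to study how the planning cost grows with maze size. A quick check of the scaling, assuming the definitions above:

```julia
# Doubling the resolution turns the 6 × 9 maze into a 12 × 18 one.
big = MazeEnv() * 2
(big.NX, big.NY)          # (12, 18)
length(state_space(big))  # 216 states instead of 54
```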
```julia
x = MazeEnv()
```

Inspecting the instance shows a single-agent, sequential environment with state space `Base.OneTo(54)` (6 × 9 cells), action space `Base.OneTo(4)`, and current state `3`, corresponding to the start position `CartesianIndex(3, 1)`.

## Figure 8.2
```julia
using Flux
```
`plan_step(n)` trains a `DynaAgent` with `n` planning steps per real step for 50 episodes on the maze and returns the number of steps taken in each episode:

```julia
function plan_step(n)
    env = MazeEnv()
    ns = length(state_space(env))
    na = length(action_space(env))

    agent = DynaAgent(
        policy=QBasedPolicy(
            learner=TDLearner(
                approximator=TabularQApproximator(
                    n_state=ns,
                    n_action=na,
                    opt=Descent(0.1)
                ),
                γ=0.1,
                method=:SARS
            ),
            explorer=EpsilonGreedyExplorer(0.1; is_break_tie=true)
        ),
        model=ExperienceBasedSamplingModel(),
        trajectory=VectorSARTTrajectory(),
        plan_step=n
    )

    hook = StepsPerEpisode()
    run(agent, env, StopAfterEpisode(50), hook)
    hook.steps
end
```
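The `plan_step` keyword controls how many simulated updates the agent performs after each real environment step. Conceptually, tabular Dyna-Q interleaves direct Q-learning updates with updates on transitions replayed from a learned model. Below is a minimal sketch of that planning loop, with hypothetical names and a plain `Dict` model, independent of how `DynaAgent` is implemented internally:

```julia
# Sketch of the tabular Dyna-Q planning loop (hypothetical helper, not DynaAgent internals).
# `Q` is an (n_state, n_action) value table; `model` maps an observed (s, a) to its (r, s′).
function dyna_q_planning!(Q::Matrix{Float64}, model::Dict, n::Int; γ=0.95, α=0.1)
    for _ in 1:n
        (s, a), (r, s′) = rand(model)                        # replay a previously seen transition
        Q[s, a] += α * (r + γ * maximum(Q[s′, :]) - Q[s, a]) # one-step Q-learning update
    end
    Q
end
```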
```julia
using Plots
using Statistics
```
```julia
begin
    fig_8_2 = plot(legend=:topright)
    for n in [0, 5, 50]
        plot!(fig_8_2, mean(plan_step(n) for _ in 1:30), label="plan_step = $n")
    end
    fig_8_2
end
```

## Figure 8.4
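Figure 8.4 reproduces the blocking-maze experiment: the agent is trained for `nstep1` steps, the wall layout then changes, and training continues for `nstep2` more steps. We compare Dyna-Q (`ExperienceBasedSamplingModel`) with Dyna-Q+ (`TimeBasedSamplingModel`). In the book, Dyna-Q+ encourages re-exploration by performing planning updates as if a transition that has not been tried for $\tau$ time steps yields the bonus reward

$$r + \kappa \sqrt{\tau},$$

where $\kappa$ is a small constant (presumably the `κ` keyword of `TimeBasedSamplingModel`).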
`cumulative_dyna_reward` trains an agent for `nstep1` steps, applies `change` to the walls, trains for `nstep2` more steps, and returns the (approximate) cumulative reward at each time step:

```julia
function cumulative_dyna_reward(model, walls, nstep1, change, nstep2)
    env = MazeEnv(
        walls,
        CartesianIndex(6, 4),
        CartesianIndex(6, 4),
        CartesianIndex(1, 9),
        6,
        9
    )
    ns = length(state_space(env))
    na = length(action_space(env))
    agent = DynaAgent(
        policy=QBasedPolicy(
            learner=TDLearner(
                approximator=TabularQApproximator(
                    n_state=ns,
                    n_action=na,
                    opt=Descent(1.)
                ),
                γ=0.95,
                method=:SARS
            ),
            explorer=EpsilonGreedyExplorer(0.1; is_break_tie=true)
        ),
        model=model,
        trajectory=VectorSARTTrajectory(),
        plan_step=10
    )

    hook = StepsPerEpisode()
    run(agent, env, StopAfterStep(nstep1; is_show_progress=false), hook)
    change(env.walls)
    run(agent, env, StopAfterStep(nstep2; is_show_progress=false), hook)

    # Convert steps-per-episode into the (approximate) cumulative reward at each step:
    # during episode i, the number of completed episodes so far is roughly i.
    cumulative_reward = []
    for (i, n) in enumerate(hook.steps)
        for _ in 1:n
            push!(cumulative_reward, i)
        end
    end
    # Pad to the full nstep1 + nstep2 length in case the last episode is unfinished.
    for _ in (nstep1+nstep2):-1:length(cumulative_reward)
        push!(cumulative_reward, length(hook.steps))
    end
    cumulative_reward
end
```
In the blocking maze the wall initially spans columns 1–8 of row 4, leaving an opening on the right; `change_walls` moves that opening to the left:

```julia
walls() = Set([CartesianIndex(4, j) for j in 1:8])

function change_walls(walls)
    pop!(walls, CartesianIndex(4, 1))
    push!(walls, CartesianIndex(4, 9))
end
```
```julia
begin
    fig_8_4 = plot(legend=:topleft)
    plot!(fig_8_4, mean(cumulative_dyna_reward(ExperienceBasedSamplingModel(), walls(), 1000, change_walls, 2000) for _ in 1:30), label="Dyna-Q")
    plot!(fig_8_4, mean(cumulative_dyna_reward(TimeBasedSamplingModel(;n_actions=4), walls(), 1000, change_walls, 2000) for _ in 1:30), label="Dyna-Q+")
    fig_8_4
end
```

## Figure 8.5
For the shortcut maze of Figure 8.5, the wall initially spans columns 2–9 (open only on the left); removing `CartesianIndex(4, 9)` later opens a shortcut on the right:

```julia
new_walls() = Set([CartesianIndex(4, j) for j in 2:9])

function new_change_walls(walls)
    pop!(walls, CartesianIndex(4, 9))
end
```
```julia
begin
    fig_8_5 = plot(legend=:topleft)
    plot!(fig_8_5, mean(cumulative_dyna_reward(ExperienceBasedSamplingModel(), new_walls(), 3000, new_change_walls, 3000) for _ in 1:50), label="Dyna-Q")
    plot!(fig_8_5, mean(cumulative_dyna_reward(TimeBasedSamplingModel(n_actions=4, κ = 1e-3), new_walls(), 3000, new_change_walls, 3000) for _ in 1:50), label="Dyna-Q+")
    fig_8_5
end
```

## Example 8.4
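Example 8.4 compares Dyna with uniform sampling from the model against prioritized sweeping on mazes of increasing size (`MazeEnv() * ratio`). In prioritized sweeping, simulated updates are ordered by how much they are expected to change the value estimates: a state–action pair is queued only if its priority

$$P = \bigl| r + \gamma \max_a Q(s', a) - Q(s, a) \bigr|$$

exceeds a small threshold, and after each update the predecessors of the updated state are re-queued with their own priorities. This is what `PrioritizedSweepingSamplingModel` is expected to implement here. `run_once` below trains an agent until an episode is completed in at most `14 * ratio * 1.2` steps (14 being the optimal episode length of the base maze in the book) and returns `model.sample_count`, the number of model samples used.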
```julia
function run_once(model, ratio=1)
    env = MazeEnv() * ratio
    ns = length(state_space(env))
    na = length(action_space(env))
    agent = DynaAgent(
        policy=QBasedPolicy(
            learner=TDLearner(
                approximator=TabularQApproximator(
                    n_state=ns,
                    n_action=na,
                    opt=Descent(0.5)),
                γ=0.95,
                method=:SARS
            ),
            explorer=EpsilonGreedyExplorer(0.1; is_break_tie=true)
        ),
        model=model,
        trajectory=VectorSARTTrajectory(),
        plan_step=5
    )
    hook = StepsPerEpisode()
    # Stop as soon as an episode finishes within 120% of the optimal length (14 * ratio steps).
    run(agent, env, (args...) -> length(hook.steps) > 0 && hook.steps[end] <= 14 * ratio * 1.2, hook)
    model.sample_count
end
```
```julia
begin
    p = plot(legend=:topleft)
    plot!(mean([run_once(ExperienceBasedSamplingModel(), ratio) for ratio in 1:6] for _ in 1:5), label="Dyna", yscale=:log10)
    plot!(mean([run_once(PrioritizedSweepingSamplingModel(), ratio) for ratio in 1:6] for _ in 1:5), label="Prioritized", yscale=:log10)
    p
end
```