# Chapter 6.2 Random Walk
```julia
begin
    using ReinforcementLearning
    using Flux
    using Statistics
    using Plots
end
```

In this section, we'll use the `RandomWalk1D` environment provided in ReinforcementLearning.jl.
```julia
env = RandomWalk1D(; rewards=0.0 => 1.0)
```

Displaying `env` gives a summary of the environment:

# RandomWalk1D

## Traits

| Trait Type        | Value                                          |
|:----------------- | ----------------------------------------------:|
| NumAgentStyle     | ReinforcementLearningBase.SingleAgent()        |
| DynamicStyle      | ReinforcementLearningBase.Sequential()         |
| InformationStyle  | ReinforcementLearningBase.PerfectInformation() |
| ChanceStyle       | ReinforcementLearningBase.Deterministic()      |
| RewardStyle       | ReinforcementLearningBase.TerminalReward()     |
| UtilityStyle      | ReinforcementLearningBase.GeneralSum()         |
| ActionStyle       | ReinforcementLearningBase.MinimalActionSet()   |
| StateStyle        | ReinforcementLearningBase.Observation{Int64}() |
| DefaultStateStyle | ReinforcementLearningBase.Observation{Int64}() |

## Is Environment Terminated?

No

## State Space

`Base.OneTo(7)`

## Action Space

`Base.OneTo(2)`

## Current State

```
4
```
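Before estimating any values, we can step through the environment by hand. Below is a quick sketch using the generic `RLBase` interface (`reset!`, calling the environment with an action, `is_terminated`); the uniformly random action choice mirrors the policy we'll use later:

```julia
reset!(env)                          # back to the middle state
while !is_terminated(env)
    env(rand(action_space(env)))     # action 1 steps left, action 2 steps right
end
state(env), reward(env)              # terminal state (1 or 7) and its reward (0.0 or 1.0)
```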
```julia
# Number of states (7, including the two terminal ones) and number of actions (2).
NS, NA = length(state_space(env)), length(action_space(env))
```

As explained in the book, the true values of states A to E are:
`0.166667, 0.333333, 0.5, 0.666667, 0.833333`

Under the random policy, each state's value is simply the probability of eventually terminating on the right.
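These values can be verified by solving the Bellman equations for the random policy directly. Here is a small sketch of our own (not from the book's code), using `SymTridiagonal` from the LinearAlgebra standard library:

```julia
using LinearAlgebra

# Bellman equations for the five non-terminal states under the random policy:
#   v(i) = 0.5 v(i-1) + 0.5 v(i+1),  with boundaries v(0) = 0 and v(6) = 1.
A = SymTridiagonal(fill(1.0, 5), fill(-0.5, 4))
b = [0.0, 0.0, 0.0, 0.0, 0.5]   # the 0.5 * v(6) term moves to the right-hand side
A \ b                            # ≈ [1/6, 2/6, 3/6, 4/6, 5/6]
```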
```julia
true_values = [i/6 for i in 1:5]
```

To estimate the state values, we'll use a `VBasedPolicy` with a random action generator.
```julia
create_TD_agent(α) = Agent(
    policy = VBasedPolicy(
        learner = TDLearner(
            approximator = TabularApproximator(fill(0.5, NS), Descent(α)),
            method = :SRS,
            γ = 1.0,
            n = 0,
        ),
        mapping = (env, V) -> rand(1:NA),  # random policy: ignore V and pick a random action
    ),
    trajectory = VectorSARTTrajectory(),
)
```
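For reference, the update performed by this `TDLearner` after every step is the tabular TD(0) rule (Sutton & Barto, Equation 6.2):

$$V(S_t) \leftarrow V(S_t) + \alpha \left[ R_{t+1} + \gamma V(S_{t+1}) - V(S_t) \right]$$

Running the agent for an increasing number of episodes reproduces the left panel of Figure 6.2: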
```julia
begin
    p_6_2_left = plot(; legend=:bottomright)
    for i in [1, 10, 100]
        agent = create_TD_agent(0.1)
        run(agent, env, StopAfterEpisode(i))
        plot!(
            p_6_2_left,
            agent.policy.learner.approximator.table[2:end-1],  # drop the two terminal states
            label="episode = $i",
        )
    end
    plot!(p_6_2_left, true_values, label="true value")
    p_6_2_left
end
```

To calculate the RMS error, we first need to define a hook that records it.
```julia
Base.@kwdef struct RecordRMS <: AbstractHook
    rms::Vector{Float64} = []
end
```
```julia
# After each episode, record the RMS error between the current estimates
# (excluding the two terminal states) and the true values.
(f::RecordRMS)(::PostEpisodeStage, agent, env) = push!(
    f.rms,
    sqrt(mean((agent.policy.learner.approximator.table[2:end-1] - true_values) .^ 2)),
)
```

Now let's take a look at the performance of `TDLearner` under different values of α.
```julia
begin
    p_6_2_right = plot()

    for α in [0.05, 0.1, 0.15]
        rms = []
        for _ in 1:100
            agent = create_TD_agent(α)
            hook = RecordRMS()
            run(agent, env, StopAfterEpisode(100), hook)
            push!(rms, hook.rms)
        end
        # Average the per-episode RMS curves over the 100 independent runs.
        plot!(p_6_2_right, mean(rms), label="TD alpha=$α", linestyle=:dashdot)
    end
    p_6_2_right
end
```

Then we can compare the differences between `TDLearner` and `MonteCarloLearner`.
```julia
create_MC_agent(α) = Agent(
    policy = VBasedPolicy(
        learner = MonteCarloLearner(
            approximator = TabularApproximator(fill(0.5, NS), Descent(α)),
            kind = EVERY_VISIT,
        ),
        mapping = (env, V) -> rand(1:NA),  # the same random policy as before
    ),
    trajectory = VectorSARTTrajectory(),
)
```
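For comparison, a `MonteCarloLearner` backed by a `TabularApproximator` with a `Descent(α)` optimizer amounts to the constant-α MC update applied at the end of each episode (Sutton & Barto, Equation 6.1):

$$V(S_t) \leftarrow V(S_t) + \alpha \left[ G_t - V(S_t) \right]$$

where $G_t$ is the actual return following time $t$. Sweeping over several step sizes adds the MC curves to the same plot: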
```julia
for α in [0.01, 0.02, 0.03, 0.04]
    rms = []
    for _ in 1:100
        agent = create_MC_agent(α)
        hook = RecordRMS()
        run(agent, env, StopAfterEpisode(100), hook)
        push!(rms, hook.rms)
    end
    plot!(p_6_2_right, mean(rms), label="MC alpha=$α")
end
```
```julia
p_6_2_right
```
```julia
begin
    fig_6_2 = plot()

    rms = []
    for _ in 1:100
        agent = create_TD_agent(0.1)
        hook = RecordRMS()
        run(agent, env, StopAfterEpisode(100), hook)
        push!(rms, hook.rms)
    end
    plot!(fig_6_2, mean(rms), label="TD alpha=0.1", linestyle=:dashdot)

    rms = []
    for _ in 1:100
        agent = create_MC_agent(0.1)
        hook = RecordRMS()
        run(agent, env, StopAfterEpisode(100), hook)
        push!(rms, hook.rms)
    end
    plot!(fig_6_2, mean(rms), label="MC alpha=0.1")

    fig_6_2
end
```

**Warning**
Some of you might have noticed that the figure above is not the same as Figure 6.2 in the book. That's because we are not doing **batch training** here: the `trajectory` is emptied at the end of each episode. We leave implementing batch updating as an exercise for readers who want to practice developing new, customized algorithms with `ReinforcementLearning.jl`. 😉
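If you'd like a starting point for that exercise, here is a minimal, self-contained sketch of batch TD(0) on this environment, written in plain Julia rather than against the `Agent`/`Trajectory` machinery above. All helper names (`collect_episode`, `batch_td!`) are our own, not part of the library's API:

```julia
# Collect one episode of (s, r, s′) transitions under the uniformly random policy.
# NOTE: collect_episode is an illustrative helper, not a library function.
function collect_episode(env)
    reset!(env)
    ep = Tuple{Int,Float64,Int}[]
    while !is_terminated(env)
        s = state(env)
        env(rand(action_space(env)))
        push!(ep, (s, reward(env), state(env)))
    end
    ep
end

# Batch TD(0): sweep over ALL stored transitions, accumulate the TD increments,
# apply them in one step, and repeat until the value estimates stop moving.
function batch_td!(V, episodes; α=0.001, γ=1.0, tol=1e-5)
    while true
        Δ = zeros(length(V))
        for ep in episodes, (s, r, s′) in ep
            Δ[s] += r + γ * V[s′] - V[s]
        end
        V .+= α .* Δ
        maximum(abs, α .* Δ) < tol && return V
    end
end

episodes = [collect_episode(env) for _ in 1:100]  # a fixed batch of experience
V = fill(0.5, NS)
V[1] = V[NS] = 0.0               # terminal states contribute no bootstrapped value
batch_td!(V, episodes)[2:end-1]  # estimates for states A–E; compare with true_values
```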