Chapter 9 On-policy Prediction with Approximation
In this notebook, we'll focus on the linear approximation methods.
xxxxxxxxxx7
1
begin
    using ReinforcementLearning
    using Flux
    using Statistics
    using Plots
    using SparseArrays
end
# Figure 9.1
We've discussed the RandomWalk1D environment before. In the previous example, the state space was relatively small (1:7). Here we expand it to 1:1000 and see how the LinearVApproximator works.
-100
-99
-98
-97
-96
-95
-94
-93
-92
-91
-90
-89
-88
-87
-86
-85
-84
-83
-82
-81
91
92
93
94
95
96
97
98
99
100
xxxxxxxxxx1
1
# Action i moves the agent by ACTIONS[i] states: -100..-1 (left) and 1..100 (right).
ACTIONS = collect(Iterators.flatten((-100:-1, 1:100)))
1
# Number of actions.
NA = length(ACTIONS)
1
# Number of states: 1000 interior states plus the two terminal states (1 and 1002).
NS = 1002
# First, let's roll out a large experiment to calculate the true state values of each state:
0.0
-0.902616
-0.90916
-0.90618
-0.906309
-0.914173
-0.89853
-0.911785
-0.882043
-0.892313
-0.899347
-0.890821
-0.885008
-0.878395
-0.893046
-0.874078
-0.89532
-0.885626
-0.878013
-0.883827
0.904209
0.890995
0.909652
0.901835
0.904345
0.903078
0.913111
0.917742
0.923114
0.0
xxxxxxxxxx15
1
# Estimate the ground-truth state values of the 1002-state random walk by running
# tabular TD learning with a uniformly random policy for 10^5 episodes.
TRUE_STATE_VALUES = begin
    env = RandomWalk1D(N=NS, actions=ACTIONS)
    agent = Agent(
        policy=VBasedPolicy(
            learner=TDLearner(
                approximator=TabularVApproximator(; n_state=NS, opt=Descent(0.01)),
                method=:SRS,
            ),
            # behave randomly; we only care about the value estimates
            mapping=(env, V) -> rand(action_space(env))
        ),
        trajectory=VectorSARTTrajectory()
    )
    run(agent, env, StopAfterEpisode(10^5))
    agent.policy.learner.approximator.table
end
1
# Plot the interior (non-terminal) state values.
plot(TRUE_STATE_VALUES[2:end-1])
# Next, we define a preprocessor to map adjacent states into groups.
10xxxxxxxxxx1
1
# Default number of aggregation groups for the interior states.
N_GROUPS = 10
1
# State aggregation: map the `n` raw states onto `n_groups + 2` indices
# (index 1 and `n_groups + 2` are reserved for the two terminal states).
# NOTE(review): the export dropped the macro name after `Base.`; the field
# defaults only compile under `Base.@kwdef`.
Base.@kwdef struct GroupMapping
    n::Int
    n_groups::Int = N_GROUPS
    n_per_group::Int = div(n, N_GROUPS)
end

# Map a raw state `x` to its group index.
function (p::GroupMapping)(x::Int)
    if x == 1
        res = 1                               # left terminal
    elseif x == p.n
        res = p.n_groups + 2                  # right terminal
    else
        res = div(x - 2, p.n_per_group) + 2   # interior states, n_per_group per bucket
    end
    res
end
1
# Visualize how raw states are bucketed into groups.
plot([GroupMapping(; n=NS)(i) for i in 1:NS], legend=nothing)
# To count the frequency of each state, we need to write a hook.
xxxxxxxxxx4
1
# Hook that counts how often each *raw* state is visited.
struct CountStates <: AbstractHook
    counts::Vector{Int}
    CountStates(n) = new(zeros(Int, n))
end

# `env.env` is the wrapped (inner) env, so we count raw states, not group indices.
(f::CountStates)(::PreActStage, agent, env, action) = f.counts[state(env.env)] += 1
# Now let's kick off our experiment.
Agent
├─ policy => VBasedPolicy
│ ├─ learner => MonteCarloLearner
│ │ ├─ approximator => TabularApproximator
│ │ │ ├─ table => 12-element Array{Float64,1}
│ │ │ └─ optimizer => Descent
│ │ │ └─ eta => 2.0e-5
│ │ ├─ γ => 1.0
│ │ ├─ kind => ReinforcementLearningZoo.EveryVisit
│ │ └─ sampling => ReinforcementLearningZoo.NoSampling
│ └─ mapping => Main.var"#3#4"
└─ trajectory => Trajectory
└─ traces => NamedTuple
├─ state => 0-element Array{Int64,1}
├─ action => 0-element Array{Int64,1}
├─ reward => 0-element Array{Float32,1}
└─ terminal => 0-element Array{Bool,1}
xxxxxxxxxx10
1
# Gradient Monte Carlo with state aggregation (Figure 9.1).
agent_1 = Agent(
    policy=VBasedPolicy(
        learner=MonteCarloLearner(
            approximator=TabularVApproximator(n_state=N_GROUPS+2, opt=Descent(2e-5)),
            kind=EVERY_VISIT,  # this is very important!
        ),
        mapping=(env, V) -> rand(action_space(env))
    ),
    trajectory=VectorSARTTrajectory()
)
## Traits
| Trait Type | Value |
|:----------------- | ----------------------------------------------:|
| NumAgentStyle | ReinforcementLearningBase.SingleAgent() |
| DynamicStyle | ReinforcementLearningBase.Sequential() |
| InformationStyle | ReinforcementLearningBase.PerfectInformation() |
| ChanceStyle | ReinforcementLearningBase.Deterministic() |
| RewardStyle | ReinforcementLearningBase.TerminalReward() |
| UtilityStyle | ReinforcementLearningBase.GeneralSum() |
| ActionStyle | ReinforcementLearningBase.MinimalActionSet() |
| StateStyle | ReinforcementLearningBase.Observation{Int64}() |
| DefaultStateStyle | ReinforcementLearningBase.Observation{Int64}() |
## Is Environment Terminated?
No
## State Space
`Base.OneTo(1002)`
## Action Space
`Base.OneTo(200)`
## Current State
```
6
```
xxxxxxxxxx4
1
# Wrap the random walk so that the agent observes group indices instead of raw states.
env_1 = StateOverriddenEnv(
    RandomWalk1D(N=NS, actions=ACTIONS),
    GroupMapping(n=NS)
)
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
xxxxxxxxxx1
1
hook = CountStates(NS)
1186
1198
1304
1358
1339
1274
1371
1339
1373
1332
1406
1439
1486
1465
1506
1538
1572
1600
1646
1374
1334
1262
1305
1254
1311
1292
1250
1193
0
xxxxxxxxxx1
1
run(agent_1, env_1, StopAfterEpisode(10^5), hook)
1
# Figure 9.1: state-visit distribution (left axis) vs. learned / true values (right axis).
begin
    fig_9_1 = plot(legend=:topleft)
    fig_9_1_right = twinx(fig_9_1)
    plot!(fig_9_1, hook.counts ./ sum(hook.counts), color=:gray, label="state distribution")
    plot!(fig_9_1_right, agent_1.policy.learner.approximator.(env_1.f(s) for s in 2:NS-1), label="MC Learner", legend=:bottomright)
    plot!(fig_9_1_right, TRUE_STATE_VALUES[2:end-1], label="true values", legend=:bottomright)
end
# Figure 9.2
Agent
├─ policy => VBasedPolicy
│ ├─ learner => TDLearner
│ │ ├─ approximator => TabularApproximator
│ │ │ ├─ table => 12-element Array{Float64,1}
│ │ │ └─ optimizer => Descent
│ │ │ └─ eta => 0.0002
│ │ ├─ γ => 1.0
│ │ ├─ method => SRS
│ │ └─ n => 0
│ └─ mapping => Main.var"#5#6"
└─ trajectory => Trajectory
└─ traces => NamedTuple
├─ state => 0-element Array{Int64,1}
├─ action => 0-element Array{Int64,1}
├─ reward => 0-element Array{Float32,1}
└─ terminal => 0-element Array{Bool,1}
xxxxxxxxxx10
1
# Semi-gradient TD(0) with state aggregation (Figure 9.2, left).
agent_2 = Agent(
    policy=VBasedPolicy(
        learner=TDLearner(
            approximator=TabularVApproximator(n_state=N_GROUPS+2, opt=Descent(2e-4)),
            method=:SRS,
        ),
        mapping=(env, V) -> rand(action_space(env))
    ),
    trajectory=VectorSARTTrajectory()
)
1
run(agent_2, env_1, StopAfterEpisode(10^5))
1
# Figure 9.2 (left): TD-learned group values vs. the true state values.
begin
    fig_9_2_left = plot(legend=:bottomright)
    plot!(fig_9_2_left, agent_2.policy.learner.approximator.(env_1.f(s) for s in 2:NS-1), label="TD Learner", legend=:bottomright)
    plot!(fig_9_2_left, TRUE_STATE_VALUES[2:end-1], label="true values", legend=:bottomright)
    fig_9_2_left
end
# Figure 9.2 right
xxxxxxxxxx4
1
# Hook recording, after every episode, the RMS error between the learned values
# (evaluated through the env's state mapping `env.f`) and TRUE_STATE_VALUES.
struct RecordRMS <: AbstractHook
    rms::Vector{Float64}
    RecordRMS() = new([])
end

function (f::RecordRMS)(::PostEpisodeStage, agent, env)
    push!(f.rms, sqrt(mean((agent.policy.learner.approximator.(env.f.(2:(NS-1))) - TRUE_STATE_VALUES[2:end-1]).^2)))
end
1
# Number of groups for the n-step TD study (Figure 9.2, right).
n_groups = 20
1
# Run one n-step TD experiment with `n_groups`-way state aggregation and step
# size `α`; return the RMS error averaged over 10 episodes.
function run_once(n, α)
    env = StateOverriddenEnv(
        RandomWalk1D(N=NS, actions=ACTIONS),
        # NOTE(review): pass `n_groups` explicitly. The original wrote
        # `GroupMapping(n=NS)`, which silently fell back to N_GROUPS (10)
        # groups while the approximator below is sized for `n_groups` (20).
        GroupMapping(n=NS, n_groups=n_groups, n_per_group=div(NS, n_groups))
    )
    agent = Agent(
        policy=VBasedPolicy(
            learner=TDLearner(
                approximator=TabularVApproximator(;
                    n_state=n_groups+2,
                    opt=Descent(α)
                ),
                method=:SRS,
                n=n  # n-step TD
            ),
            mapping=(env, V) -> rand(action_space(env))
        ),
        trajectory=VectorSARTTrajectory()
    )

    hook = RecordRMS()
    run(agent, env, StopAfterEpisode(10), hook)
    mean(hook.rms)
end
1
# Figure 9.2 (right): RMS error vs. step size α for several n-step TD lengths,
# each curve averaged over 100 independent runs.
begin
    A = [0., 0.03, 0.06, 0.1:0.1:1...]
    fig_9_2_right = plot(legend=:bottomright, ylim=[0.25, 0.55])
    for n in [2^i for i in 0:9]
        plot!(
            fig_9_2_right,
            A,
            mean(
                [run_once(n, α) for α in A]
                for _ in 1:100
            ),
            label="n = $n")
    end
    fig_9_2_right
end
# Figure 9.5
xxxxxxxxxx3
1
# Fourier basis features: s ↦ [cos(iπs) for i = 0..order].
struct FourierPreprocessor
    order::Int
end

(fp::FourierPreprocessor)(s::Number) = [cos(i * π * s) for i = 0:fp.order]
1
# Polynomial basis features: s ↦ [s^i for i = 0..order].
struct PolynomialPreprocessor
    order::Int
end

(pp::PolynomialPreprocessor)(s::Number) = [s^i for i = 0:pp.order]
1
# Run gradient MC with a linear approximator over `preprocessor` features of
# dimension `order + 1` and step size `α`; return the per-episode RMS errors.
function run_once_MC(preprocessor, order, α)
    env = StateOverriddenEnv(
        RandomWalk1D(N=NS, actions=ACTIONS),
        preprocessor
    )
    agent = Agent(
        policy=VBasedPolicy(
            learner=MonteCarloLearner(
                approximator=RLZoo.LinearVApproximator(; n=order+1, opt=Descent(α)),
                kind=EVERY_VISIT,
            ),
            mapping=(env, V) -> rand(1:NA)
        ),
        # states are feature vectors here, so the trajectory stores Vector{Float64}
        trajectory=VectorSARTTrajectory(; state=Vector{Float64})
    )

    hook = RecordRMS()
    run(agent, env, StopAfterEpisode(5000; is_show_progress=false), hook)
    hook.rms
end
1
# Figure 9.5: learning curves for Fourier vs. polynomial bases of several orders,
# each averaged over 5 independent runs (states are rescaled to [0, 1] via x/NS).
begin
    fig_9_5 = plot(legend=:topright)

    for order in [5, 10, 20]
        plot!(
            fig_9_5,
            mean(
                run_once_MC(
                    x -> FourierPreprocessor(order)(x/NS),
                    order,
                    0.00005
                )
                for _ in 1:5
            ),
            label="Fourier $order",
            linestyle=:dash
        )

        plot!(
            fig_9_5,
            mean(
                run_once_MC(
                    x -> PolynomialPreprocessor(order)(x/NS),
                    order,
                    0.0001
                )
                for _ in 1:5
            ),
            label="Polynomial $order",
            linestyle=:solid
        )
    end

    fig_9_5
end
# Figure 9.10
Implementing the tile encoding in Julia is quite easy!😀
Tilingxxxxxxxxxx11
1
begin
    # A tiling partitions each dimension with break points from a range; a point
    # is identified by the linear index of the cell it falls into.
    struct Tiling{N,Tr<:AbstractRange}
        ranges::NTuple{N,Tr}
        inds::LinearIndices{N,NTuple{N,Base.OneTo{Int}}}
    end

    # A range with k break points defines k - 1 cells per dimension.
    Tiling(ranges...) = Tiling(
        ranges,
        LinearIndices(Tuple(length(r) - 1 for r in ranges))
    )
end

# Total number of cells (product over dimensions).
Base.length(t::Tiling) = reduce(*, (length(r) - 1 for r in t.ranges))

# 1-based index of the cell of `range` that contains `x`.
encode(range::AbstractRange, x) = floor(Int, div(x - range[1], step(range)) + 1)

# Linear index of the multi-dimensional cell containing point `xs`.
encode(t::Tiling, xs) = t.inds[CartesianIndex(Tuple(map(encode, t.ranges, xs)))]
1
2
3
4
5
6
xxxxxxxxxx1
1
# A single tiling covering the 1002 states with 6 cells of width 200.
t = Tiling(range(1, step=200, length=7))
1
2
3
4
5
6
-3:200:1197
1
2
3
4
5
6
-7:200:1193
1
2
3
4
5
6
-11:200:1189
1
2
3
4
5
6
-15:200:1185
1
2
3
4
5
6
-19:200:1181
1
2
3
4
5
6
-23:200:1177
1
2
3
4
5
6
-27:200:1173
1
2
3
4
5
6
-31:200:1169
1
2
3
4
5
6
-35:200:1165
1
2
3
4
5
6
-39:200:1161
1
2
3
4
5
6
-43:200:1157
1
2
3
4
5
6
-47:200:1153
1
2
3
4
5
6
-51:200:1149
1
2
3
4
5
6
-55:200:1145
1
2
3
4
5
6
-59:200:1141
1
2
3
4
5
6
-63:200:1137
1
2
3
4
5
6
-67:200:1133
1
2
3
4
5
6
-71:200:1129
1
2
3
4
5
6
-75:200:1125
1
2
3
4
5
6
-159:200:1041
1
2
3
4
5
6
-163:200:1037
1
2
3
4
5
6
-167:200:1033
1
2
3
4
5
6
-171:200:1029
1
2
3
4
5
6
-175:200:1025
1
2
3
4
5
6
-179:200:1021
1
2
3
4
5
6
-183:200:1017
1
2
3
4
5
6
-187:200:1013
1
2
3
4
5
6
-191:200:1009
1
2
3
4
5
6
-195:200:1005
1
2
3
4
5
6
xxxxxxxxxx1
1
# 50 tilings, each offset by 4 states from the previous one.
tt = [Tiling(range(1 - 4 * (i - 1), step=200, length=7)) for i in 1:50]
1
# Run gradient MC with a linear approximator of feature dimension `n` over
# tile-coded features produced by `preprocessor`; return per-episode RMS errors.
function run_once_MC_tiling(preprocessor, α, n)
    env = StateOverriddenEnv(
        RandomWalk1D(N=NS, actions=ACTIONS),
        preprocessor
    )
    agent = Agent(
        policy=VBasedPolicy(
            learner=MonteCarloLearner(
                approximator=RLZoo.LinearVApproximator(; n=n, opt=Descent(α)),
                kind=EVERY_VISIT,
            ),
            mapping=(env, V) -> rand(1:NA)
        ),
        trajectory=VectorSARTTrajectory(; state=Vector{Float64})
    )

    hook = RecordRMS()
    run(agent, env, StopAfterEpisode(10000; is_show_progress=true), hook)
    hook.rms
end
1
# Figure 9.10: learning curves with 50 overlapping tilings vs. a single tiling.
begin
    fig_9_10 = plot()

    plot!(
        fig_9_10,
        run_once_MC_tiling(
            # one active cell per tiling → a sparse 7×50 one-hot matrix, flattened
            x -> sparse([encode(t, x) for t in tt], 1:50, ones(50), 7, 50) |> vec,
            1e-4/50,  # scale the step size by the number of tilings
            7*50
        ),
        label="50 tilings"
    )

    plot!(
        fig_9_10,
        run_once_MC_tiling(
            x -> Flux.onehot(encode(t, x), 1:7),
            1e-4,
            7
        ),
        label = "one tiling"
    )

    fig_9_10
end
# Feel free to make a PR if you can improve the speed of generating this figure. ❤