# Chapter 10 Mountain Car
```julia
begin
    using ReinforcementLearning
    using Flux
    using Statistics
    using Plots
    using SparseArrays
end
```

The `MountainCarEnv` is already provided in ReinforcementLearning.jl, so we can use it directly here. Note that by default this environment terminates after at most 200 steps, while the example in the book has no such restriction.
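The cap can be lifted through the `max_steps` keyword, the same one we pass later in `create_env_agent`. A minimal sketch (the variable name here is ours, for illustration only):

```julia
# Hypothetical example: lift the default 200-step cap via `max_steps`.
env_no_cap = MountainCarEnv(; max_steps=10000)
```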
# MountainCarEnv
## Traits
| Trait Type | Value |
|:----------------- | ------------------------------------------------:|
| NumAgentStyle | ReinforcementLearningBase.SingleAgent() |
| DynamicStyle | ReinforcementLearningBase.Sequential() |
| InformationStyle | ReinforcementLearningBase.ImperfectInformation() |
| ChanceStyle | ReinforcementLearningBase.Stochastic() |
| RewardStyle | ReinforcementLearningBase.StepReward() |
| UtilityStyle | ReinforcementLearningBase.GeneralSum() |
| ActionStyle | ReinforcementLearningBase.MinimalActionSet() |
| StateStyle | ReinforcementLearningBase.Observation{Any}() |
| DefaultStateStyle | ReinforcementLearningBase.Observation{Any}() |
## Is Environment Terminated?
No
## State Space
`ReinforcementLearningBase.Space{Array{IntervalSets.Interval{:closed,:closed,Float64},1}}(IntervalSets.Interval{:closed,:closed,Float64}[-1.2..0.6, -0.07..0.07])`
## Action Space
`Base.OneTo(3)`
## Current State
```
[-0.5834095103051787, 0.0]
```
```julia
env = MountainCarEnv()
```

```julia
S = state_space(env)
```

The state space is the product of two closed intervals: the position in `-1.2..0.6` and the velocity in `-0.07..0.07`.

First let's define a `Tiling` structure to encode the state.
```julia
begin
    struct Tiling{N,Tr<:AbstractRange}
        ranges::NTuple{N,Tr}
        inds::LinearIndices{N,NTuple{N,Base.OneTo{Int}}}
    end

    Tiling(ranges::AbstractRange...) = Tiling(
        ranges,
        LinearIndices(Tuple(length(r) - 1 for r in ranges))
    )

    # Number of tiles: a range with n points delimits n - 1 bins.
    Base.length(t::Tiling) = reduce(*, (length(r) - 1 for r in t.ranges))

    # Shifting a tiling by an offset yields a new, displaced tiling.
    function Base.:-(t::Tiling, xs)
        Tiling((r .- x for (r, x) in zip(t.ranges, xs))...)
    end

    # Bin index of a scalar within a range.
    encode(range::AbstractRange, x) = floor(Int, div(x - range[1], step(range)) + 1)

    # Linear tile index of a state within the tiling's grid.
    encode(t::Tiling, xs) = t.inds[CartesianIndex(Tuple(map(encode, t.ranges, xs)))]
end
```
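To get a feel for `encode`, here is a hypothetical sanity check; the tiling and the state below are made up for illustration and are not part of the original notebook:

```julia
# Hypothetical example: one 2-D tiling with 9 × 9 = 81 tiles.
t = Tiling(range(-1.2, 0.6; length=10), range(-0.07, 0.07; length=10))
length(t)                  # 81
encode(t.ranges[1], -0.5)  # bin index of the position component alone
encode(t, [-0.5, 0.0])     # linear tile index of the full state, in 1:81
```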
```julia
begin
    ntilings = 8
    ntiles = 8
    tiling = Tiling(
        (
            range(r.left, step=(r.right - r.left) / ntiles, length=ntiles + 2)
            for r in S
        )...
    )
    offset = map(x -> x.right - x.left, S) ./ (ntiles * ntilings)
    tilings = [tiling - offset .* (i - 1) for i in 1:ntilings]
end
```

This builds 8 tilings, each a 9×9 grid of 81 tiles over position × velocity, with each successive tiling offset by one eighth of a tile width.
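The state override in the next cell turns a raw `(position, velocity)` pair into a sparse binary feature vector with one active tile per tiling. A hypothetical check of that encoding in isolation:

```julia
# Hypothetical example: the 8 active tiles for one state.
s = [-0.5, 0.0]
active = map(t -> encode(t, s), tilings)        # one index in 1:81 per tiling
x = sparse(active, 1:8, ones(8), 81, 8) |> vec  # 648-element sparse vector
sum(x)                                          # 8.0: one active tile per tiling
```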
The remaining parts are simple: we initialize the agent and the environment, then roll out the experiments:

```julia
function create_env_agent(α=2e-4, n=0)
    env = StateOverriddenEnv(
        MountainCarEnv(; max_steps=10000),
        # Encode the raw state as a sparse binary vector of length 81 * 8,
        # with one active tile per tiling.
        s -> sparse(map(t -> encode(t, s), tilings), 1:8, ones(8), 81, 8) |> vec
    )

    agent = Agent(
        policy=QBasedPolicy(
            learner=TDLearner(
                approximator=LinearQApproximator(
                    n_state=81 * 8,
                    n_action=3,
                    opt=Descent(α)
                ),
                method=:SARSA,
                n=n
            ),
            explorer=GreedyExplorer()
        ),
        trajectory=VectorSARTTrajectory(; state=Vector{Int})
    )

    env, agent
end
```
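As a quick, hypothetical smoke test (assuming `state` from ReinforcementLearningBase returns the overridden state), the wrapped environment should now expose tile-coded features instead of the raw `(position, velocity)` pair:

```julia
# Hypothetical smoke test of the wrapped environment.
env, agent = create_env_agent()
s = state(env)  # sparse feature vector of length 648
sum(s)          # 8.0: exactly one active tile per tiling
```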
To visualize the learned action values, we lay a 40×40 grid over the state space:

```julia
X = range(S[1].left, stop=S[1].right, length=40)
```

```julia
Y = range(S[2].left, stop=S[2].right, length=40)
```
```julia
function show_approximation(n)
    env, agent = create_env_agent()
    run(agent, env, StopAfterEpisode(n))
    # Evaluate the maximal action value on the grid;
    # env.f is the state-encoding function of the wrapped environment.
    [
        agent.policy.learner.approximator(env.f([p, v])) |> maximum
        for p in X, v in Y
    ]
end
```
```julia
n = 10
```

```julia
plot(X, Y, -show_approximation(n), linetype=:wireframe)
```

The wireframe shows the learned cost-to-go function (the negative of the maximal action value) after `n` episodes, corresponding to Figure 10.1 in the book.
Reproducing Figure 10.2: average steps per episode (log scale) for three step sizes:

```julia
begin
    fig_10_2 = plot(legend=:topright)
    n_runs = 5  # quite slow here, need revisit
    for α in [0.1/8, 0.2/8, 0.5/8]
        avg_steps_per_episode = zeros(500)
        for _ in 1:n_runs
            env, agent = create_env_agent(α)
            hook = StepsPerEpisode()
            run(agent, env, StopAfterEpisode(500; is_show_progress=false), hook)
            avg_steps_per_episode .+= hook.steps
        end
        plot!(fig_10_2, avg_steps_per_episode ./ n_runs, yscale=:log10, label="α=$α")
    end
    fig_10_2
end
```
Figure 10.3 compares one-step SARSA (α = 0.5/8) with 8-step SARSA (α = 0.3/8):

```julia
begin
    function run_once(α, n; is_reduce=true, n_episode=50)
        env, agent = create_env_agent(α, n)
        hook = StepsPerEpisode()
        run(agent, env, StopAfterEpisode(n_episode; is_show_progress=false), hook)
        is_reduce ? mean(hook.steps) : hook.steps
    end

    fig_10_3 = plot()
    plot!(fig_10_3, mean(run_once(0.5/8, 1; is_reduce=false, n_episode=500) for _ in 1:10), yscale=:log10)
    plot!(fig_10_3, mean(run_once(0.3/8, 8; is_reduce=false, n_episode=500) for _ in 1:10), yscale=:log10)
    fig_10_3
end
```
Figure 10.4 shows the interplay between the step size α and the bootstrapping step n:

```julia
begin
    fig_10_4 = plot(legend=:topright)
    for (A, n) in [(0.4:0.1:1.7, 1), (0.3:0.1:1.6, 2), (0.2:0.1:1.4, 4), (0.2:0.1:0.9, 8), (0.2:0.1:0.7, 16)]
        plot!(fig_10_4, A, [mean(run_once(α/8, n) for _ in 1:5) for α in A], label="n = $n")
    end
    fig_10_4
end
```