Chapter 6.7 Maximization Bias and Double Learning
In Example 6.7, the authors introduce an MDP to compare the performance of Q-learning and Double Q-learning. This environment is a bit special compared to the environments we have seen before: in the first step, only LEFT and RIGHT are allowed, but in the second step, if LEFT was chosen previously, there are 10 valid actions. We say this kind of environment has the FULL_ACTION_SET action style.
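Why does plain Q-learning struggle here? The greedy target max_a Q(B, a) is a maximum over noisy estimates, and such a maximum is an upward-biased estimate of the true maximum: even though every action in B has expected reward -0.1, some estimate will look positive early on, which makes LEFT appear attractive from A. The standalone snippet below (not part of the original example; the helper `estimated_max` is purely illustrative) demonstrates the bias with a quick Monte Carlo check:

```
using Statistics

# Ten actions whose true expected rewards are all -0.1, as in state B.
# Each value estimate is the average of a few noisy samples; taking the
# maximum of those estimates systematically overestimates the true maximum.
function estimated_max(n_actions = 10, n_samples = 5)
    estimates = [mean(randn(n_samples) .- 0.1) for _ in 1:n_actions]
    maximum(estimates)
end

mean(estimated_max() for _ in 1:10_000)   # clearly above the true maximum of -0.1
```

Double Q-learning avoids this by using one set of estimates to pick the maximizing action and an independent set to evaluate it.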
begin
    using ReinforcementLearning
    using Statistics
    using Flux
    using Plots
end
"""
states:
    1: A
    2: B
    3: terminal
actions:
    1: left
    2: right
"""
Base.@kwdef mutable struct MaximizationBiasEnv <: AbstractEnv
    position::Int = 1
    reward::Float64 = 0.0
end
RLBase.state_space(env::MaximizationBiasEnv) = Base.OneTo(3)

RLBase.action_space(env::MaximizationBiasEnv) = Base.OneTo(10)

RLBase.ActionStyle(env::MaximizationBiasEnv) = FULL_ACTION_SET

const LEFT = 1

const RIGHT = 2
function RLBase.legal_action_space(env::MaximizationBiasEnv)
    if env.position == 1
        (LEFT, RIGHT)
    else
        Base.OneTo(10)
    end
end

function RLBase.legal_action_space_mask(env::MaximizationBiasEnv)
    m = fill(false, 10)
    if env.position == 1
        m[LEFT] = true
        m[RIGHT] = true
    else
        m .= true
    end
    m
end
function (env::MaximizationBiasEnv)(a::Int)
    if env.position == 1
        if a == LEFT
            # A --LEFT--> B, reward 0
            env.position = 2
            env.reward = 0.0
        else
            # A --RIGHT--> terminal, reward 0
            env.position = 3
            env.reward = 0.0
        end
    elseif env.position == 2
        # every action in B terminates the episode with a reward drawn from N(-0.1, 1)
        env.position = 3
        env.reward = randn() - 0.1
    end
    nothing
end
function RLBase.reset!(env::MaximizationBiasEnv)
    env.position = 1
    env.reward = 0.0
    nothing
end

RLBase.reward(env::MaximizationBiasEnv) = env.reward

RLBase.is_terminated(env::MaximizationBiasEnv) = env.position == 3

RLBase.state(env::MaximizationBiasEnv) = env.position

Now the environment is well defined.
# MaximizationBiasEnv
## Traits
| Trait Type | Value |
|:----------------- | ------------------------------------------------:|
| NumAgentStyle | ReinforcementLearningBase.SingleAgent() |
| DynamicStyle | ReinforcementLearningBase.Sequential() |
| InformationStyle | ReinforcementLearningBase.ImperfectInformation() |
| ChanceStyle | ReinforcementLearningBase.Stochastic() |
| RewardStyle | ReinforcementLearningBase.StepReward() |
| UtilityStyle | ReinforcementLearningBase.GeneralSum() |
| ActionStyle | ReinforcementLearningBase.FullActionSet() |
| StateStyle | ReinforcementLearningBase.Observation{Any}() |
| DefaultStateStyle | ReinforcementLearningBase.Observation{Any}() |
## Is Environment Terminated?
No
## State Space
`Base.OneTo(3)`
## Action Space
`Base.OneTo(10)`
## Current State
```
1
```
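Before attaching any agent, we can sanity-check the interface by hand. Below is a small sketch (not part of the original notebook) that walks one episode down the LEFT branch using only the functions defined above:

```
env = MaximizationBiasEnv()
reset!(env)
legal_action_space(env)            # (LEFT, RIGHT) while in state A
env(LEFT)                          # A -> B, reward 0.0
state(env), reward(env)            # (2, 0.0)
env(3)                             # any of the 10 actions in B ends the episode
is_terminated(env), reward(env)    # (true, a sample from N(-0.1, 1))
```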
world = MaximizationBiasEnv()
NS, NA = length(state_space(world)), length(action_space(world))

To calculate the percentage of choosing the LEFT action in the first step, we'll create a customized hook:
Base.@kwdef mutable struct CountOfLeft <: AbstractHook
    counts::Vector{Bool} = []
end
function (f::CountOfLeft)(::PreActStage, agent, env, action)
    # record whether LEFT was selected whenever the agent is about to act in state A
    if state(env) == 1
        push!(f.counts, action == LEFT)
    end
end

Next, we create two agent factories, one for Q-learning and one for Double Q-learning.
create_double_Q_agent() = Agent(
    policy=QBasedPolicy(
        learner=DoubleLearner(
            L1=TDLearner(
                approximator=TabularQApproximator(;
                    n_state=NS,
                    n_action=NA,
                    opt=Descent(0.1),
                ),
                method=:SARS,
                γ=1.0,
                n=0,
            ),
            L2=TDLearner(
                approximator=TabularQApproximator(;
                    n_state=NS,
                    n_action=NA,
                    opt=Descent(0.1),
                ),
                method=:SARS,
                γ=1.0,
                n=0,
            ),
        ),
        explorer=EpsilonGreedyExplorer(0.1; is_break_tie=true),
    ),
    trajectory=VectorSARTTrajectory(),
)
create_Q_agent() = Agent(
    policy=QBasedPolicy(
        learner=TDLearner(
            approximator=TabularQApproximator(; n_state=NS, n_action=NA, opt=Descent(0.1)),
            method=:SARS,
            γ=1.0,
            n=0,
        ),
        explorer=EpsilonGreedyExplorer(0.1; is_break_tie=true),
    ),
    trajectory=VectorSARTTrajectory(),
)
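Conceptually, the `DoubleLearner` keeps two independent action-value tables; on each step it randomly picks one table to update, uses that table to select the greedy action in the next state, but evaluates the chosen action with the other table. Below is a rough, simplified sketch of that update rule (an illustration only, with the hypothetical helper `double_q_update!`, not the actual `DoubleLearner` implementation):

```
# One tabular Double Q-learning update for a transition (s, a, r, s′).
# Q1 and Q2 are (n_action × n_state) matrices; α is the step size, γ the discount.
function double_q_update!(Q1, Q2, s, a, r, s′, terminal; α = 0.1, γ = 1.0)
    if rand() < 0.5
        Q1, Q2 = Q2, Q1                              # update the other table half of the time
    end
    a_star = argmax(Q1[:, s′])                       # select the greedy action with the table being updated...
    target = terminal ? r : r + γ * Q2[a_star, s′]   # ...but evaluate it with the other one
    Q1[a, s] += α * (target - Q1[a, s])
    nothing
end
```

Because the action that looks best under one table is evaluated with an independent table, its estimation noise does not inflate the target, which is what removes the maximization bias.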
begin
    DQ_stats = []
    for _ in 1:1000
        hook = CountOfLeft()
        run(create_double_Q_agent(), world, StopAfterEpisode(300), hook)
        push!(DQ_stats, hook.counts)
    end
    plot(mean(DQ_stats), legend=:topright, label="double q")

    Q_stats = []
    for _ in 1:1000
        hook = CountOfLeft()
        run(create_Q_agent(), world, StopAfterEpisode(300), hook)
        push!(Q_stats, hook.counts)
    end
    plot!(mean(Q_stats), legend=:topright, label="q")

    # an optimal ε-greedy agent (ε = 0.1) picks LEFT only when exploring, i.e. ε/2 = 5% of the time
    hline!([0.05], linestyle=:dash, label="optimal")
end
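If the run matches Example 6.7 (Figure 6.5) in the book, the Q-learning curve starts far above the dashed 5% line, because the noisy rewards from B make max_a Q(B, a) look positive and LEFT therefore appears better than RIGHT, while the Double Q-learning curve stays near the optimal rate almost from the start.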