## Example 6.5: Windy Gridworld
First, let's define this environment by implementing the interfaces defined in `RLBase`.
begin
    using ReinforcementLearning
    using Flux

    # Grid dimensions: 7 rows (x) by 10 columns (y); 70 states total,
    # matching the `Base.OneTo(70)` state space shown in the output below.
    const NX = 7
    const NY = 10
    # Upward wind strength per column (negative x offset pushes the agent up).
    const Wind = [CartesianIndex(w, 0) for w in [0, 0, 0, -1, -1, -1, -2, -2, -1, 0]]
    const StartPosition = CartesianIndex(4, 1)
    const Goal = CartesianIndex(4, 8)
    const ACTION = [
        CartesianIndex(0, -1), # left
        CartesianIndex(0, 1),  # right
        CartesianIndex(-1, 0), # up
        CartesianIndex(1, 0),  # down
    ]

    # Maps a CartesianIndex position to a linear state id in 1:NX*NY.
    const LinearInds = LinearIndices((NX, NY))

    # `@kwdef` generates a keyword constructor so the field default applies:
    # WindyGridWorldEnv() starts at StartPosition.
    Base.@kwdef mutable struct WindyGridWorldEnv <: AbstractEnv
        position::CartesianIndex{2} = StartPosition
    end

    RLBase.state_space(env::WindyGridWorldEnv) = Base.OneTo(length(LinearInds))
    RLBase.action_space(env::WindyGridWorldEnv) = Base.OneTo(length(ACTION))

    # Acting on the environment: apply the column's wind plus the chosen
    # action, then clamp the result to stay inside the grid.
    function (env::WindyGridWorldEnv)(a::Int)
        p = env.position + Wind[env.position[2]] + ACTION[a]
        p = CartesianIndex(min(max(p[1], 1), NX), min(max(p[2], 1), NY))
        env.position = p
        nothing
    end

    RLBase.state(env::WindyGridWorldEnv) = LinearInds[env.position]
    RLBase.is_terminated(env::WindyGridWorldEnv) = env.position == Goal
    # -1 per step until the goal is reached, so return-maximization
    # minimizes episode length.
    RLBase.reward(env::WindyGridWorldEnv) = env.position == Goal ? 0.0 : -1.0

    RLBase.reset!(env::WindyGridWorldEnv) = env.position = StartPosition
end
## Traits
| Trait Type | Value |
|:----------------- | ------------------------------------------------:|
| NumAgentStyle | ReinforcementLearningBase.SingleAgent() |
| DynamicStyle | ReinforcementLearningBase.Sequential() |
| InformationStyle | ReinforcementLearningBase.ImperfectInformation() |
| ChanceStyle | ReinforcementLearningBase.Stochastic() |
| RewardStyle | ReinforcementLearningBase.StepReward() |
| UtilityStyle | ReinforcementLearningBase.GeneralSum() |
| ActionStyle | ReinforcementLearningBase.MinimalActionSet() |
| StateStyle | ReinforcementLearningBase.Observation{Any}() |
| DefaultStateStyle | ReinforcementLearningBase.Observation{Any}() |
## Is Environment Terminated?
No
## State Space
`Base.OneTo(70)`
## Action Space
`Base.OneTo(4)`
## Current State
```
4
```
# Instantiate the environment at its default start position.
world = WindyGridWorldEnv()
├─ policy => QBasedPolicy
│ ├─ learner => TDLearner
│ │ ├─ approximator => TabularApproximator
│ │ │ ├─ table => 4×70 Array{Float64,2}
│ │ │ └─ optimizer => Descent
│ │ │ └─ eta => 0.5
│ │ ├─ γ => 1.0
│ │ ├─ method => SARSA
│ │ └─ n => 0
│ └─ explorer => EpsilonGreedyExplorer
│ ├─ ϵ_stable => 0.1
│ ├─ ϵ_init => 1.0
│ ├─ warmup_steps => 0
│ ├─ decay_steps => 0
│ ├─ step => 1
│ ├─ rng => Random._GLOBAL_RNG
│ └─ is_training => true
└─ trajectory => Trajectory
└─ traces => NamedTuple
├─ state => 0-element Array{Int64,1}
├─ action => 0-element Array{Int64,1}
├─ reward => 0-element Array{Float32,1}
└─ terminal => 0-element Array{Bool,1}
# SARSA agent: tabular Q-values (4 actions × 70 states) trained with
# step size 0.5, acting ϵ-greedily with ϵ = 0.1.
agent = Agent(
    policy=QBasedPolicy(
        learner=TDLearner(
            approximator=TabularQApproximator(
                ;n_state=length(state_space(world)),
                n_action=length(action_space(world)),
                opt=Descent(0.5)
            ),
            method=:SARSA
        ),
        explorer=EpsilonGreedyExplorer(0.1)
    ),
    trajectory=VectorSARTTrajectory()
)
# Records the number of steps taken in each episode (printed below).
hook = StepsPerEpisode()
383
353
235
116
60
67
109
57
26
99
65
79
52
56
24
99
52
164
39
22
17
17
16
16
18
16
20
19
26
7
# Train for 8000 environment steps, collecting per-episode lengths in `hook`.
run(agent, world, StopAfterStep(8000), hook)
using Plots
# Reproduce the book's figure: episode index vs. cumulative time steps —
# each episode index is repeated once per step it took.
plot([i for (i, x) in enumerate(hook.steps) for _ in 1:x])