-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathQ_lambda.py
More file actions
98 lines (82 loc) · 2.95 KB
/
Q_lambda.py
File metadata and controls
98 lines (82 loc) · 2.95 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
import gym
import numpy as np
import time  # NOTE(review): appears unused in this file — confirm before removing
# Fixed seed so every run of the lambda sweep below is reproducible.
np.random.seed(1234)
# If vanilla_Q is true the algorithm implements the vanilla q lambda algorithm described in sutton
# otherwise the watkins version is implemented
vanilla_Q = False
# Generate a frozen lake environment without skid
# this is easier to solve for a tabular RL
from gym.envs.registration import register, spec
#env = gym.make('FrozenLake-v0')
# NOTE(review): the id says 8x8 but the registered map is 4x4 — confirm which was intended.
MY_ENV_NAME = 'FrozenLakeNonskid8x8-v0'
try:
    # Only register when the id is unknown; re-registering an existing id
    # raises. Catch gym's own error type instead of a bare `except:` so
    # KeyboardInterrupt/SystemExit are not silently swallowed.
    spec(MY_ENV_NAME)
except gym.error.Error:
    register(
        id=MY_ENV_NAME,
        entry_point='gym.envs.toy_text:FrozenLakeEnv',
        kwargs={'map_name': '4x4', 'is_slippery': False},
        timestep_limit=100,  # NOTE(review): deprecated kwarg; newer gym uses max_episode_steps
        reward_threshold=0.8196,  # optimum = .8196
    )
env = gym.make(MY_ENV_NAME)
# Constants
num_action = env.action_space.n  # number of discrete actions in FrozenLake
num_states = env.observation_space.n  # number of grid cells (16 for the 4x4 map, presumably)
epochs = 3000  # training episodes per lambda setting
gamma = 0.9 # discount factor
# Start above 1.0 on purpose: the sweep below subtracts 0.1 BEFORE each
# test, so the first run actually uses lambda = 1.0 and the last 0.0.
value_lambda = 1.1
alpha = 0.25 # learning rate
# variables
test = 0
# Sweep lambda from 1.0 down to 0.0 in steps of 0.1, training a fresh
# agent (new Q and eligibility tables) for each setting.
# (Renamed the loop variables: the original reused `i` for both the sweep
# and the episode loop, shadowing the outer index.)
for sweep in range(11):
    test = test + 1
    value_lambda = value_lambda - 0.1
    epsilon = 0.7  # exploration rate, annealed per episode with a 0.1 floor
    wins = 0
    losses = 0
    Q_table = np.zeros((num_states, num_action))  # action-value estimates Q(s,a)
    E_table = np.zeros((num_states, num_action))  # eligibility traces e(s,a)
    for episode in range(epochs):
        state = env.reset()
        done = False
        reward = 0.
        # Linear epsilon decay, floored at 0.1.
        epsilon = np.maximum(epsilon - 0.0007, 0.1)
        step = 0
        while not done:
            step = step + 1
            if np.random.rand() < epsilon:  # choose random action
                action = np.random.randint(0, num_action)
                flag_max = False  # exploratory action: Watkins cuts the traces
            else:  # choose best action from Q(s,a) values
                action = np.argmax(Q_table[state])
                flag_max = True  # greedy action: traces are carried forward
            new_state, reward, done, info = env.step(action)
            if not done:  # Non-terminal state: one-step TD error.
                next_q = np.max(Q_table[new_state])
                target = reward + (gamma * next_q) - Q_table[state][action]
            else:
                # Terminal state: reward shaping — a win (reward == 1) gets
                # a +9 bonus to speed up credit assignment.
                if reward == 1.:
                    target = reward + 9.
                else:
                    target = reward
            # Accumulating trace for the visited (state, action) pair.
            E_table[state][action] = E_table[state][action] + 1.
            # Whole-table update, vectorized: this replaces the original
            # O(S*A) per-step Python double loop with identical arithmetic
            # (each (s,a) entry was updated independently).
            Q_table += alpha * target * E_table
            if vanilla_Q or flag_max:
                # Vanilla Q(lambda) always decays the traces; Watkins only
                # does so after a greedy action.
                E_table *= gamma * value_lambda
            else:
                E_table[:] = 0.  # Watkins: exploratory action zeroes all traces
            state = new_state
            if done:
                # State 15 is the goal cell (bottom-right of the 4x4 grid).
                if new_state == 15:
                    wins = wins + 1
                else:
                    losses = losses + 1
    print('test', test,'lambda', round(value_lambda,3) ,'wins', wins, 'losses', losses, 'efficiency', round(100.*wins/(losses+wins),2))