# 1. TensorFlow 2.x, version 1
import gym
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
# Environment and hyperparameters for the single-layer Q-network.
# 'FrozenLake-v1' is the id the other scripts in this file use; recent gym
# releases no longer register 'FrozenLake-v0'.
env = gym.make('FrozenLake-v1')
learning_rate = 0.1
input_size = env.observation_space.n  # width of the one-hot state vector
output_size = env.action_space.n  # one Q-value per action
dis = .99  # discount applied to the best Q-value of the next state
num_episodes = 2000
# One dense layer mapping a one-hot state vector to one Q-value per action.
# Kernel weights start as small uniform values between 0 and 0.01.
model = tf.keras.Sequential([
    tf.keras.layers.Dense(
        output_size,
        input_shape=[input_size],
        kernel_initializer=tf.random_uniform_initializer(minval=0, maxval=0.01),
    )
])
# tf.train.GradientDescentOptimizer is not exposed under `tf.train` in
# TensorFlow 2.x; Keras' SGD optimizer is the plain gradient-descent equivalent
# that `model.compile` expects.
model.compile(
    optimizer=tf.keras.optimizers.SGD(learning_rate=learning_rate),
    loss='mse',
)
model.summary()
# Lookup table whose row i is the one-hot encoding of state i.
STATE_ONE_HOT = np.identity(input_size)


def one_hot(x):
    """Return state index ``x`` as a (1, input_size) one-hot row vector.

    The training loop below calls this helper, but this script never defined
    it, so it is provided here.
    """
    return STATE_ONE_HOT[x:x + 1]


# Total reward collected in each episode.
rList = []
for i in range(num_episodes):
    s = env.reset()
    # Exploration rate: 0.1 for the first episode, decaying as i grows.
    e = 1.0 / ((i / 50) + 10)
    rAll = 0
    done = False
    while not done:
        # Q-values predicted for the current state, shape (1, output_size).
        Qs = model.predict(one_hot(s))
        # Epsilon-greedy action selection.
        if np.random.rand(1) < e:
            a = env.action_space.sample()
        else:
            a = np.argmax(Qs)
        s1, reward, done, _ = env.step(a)
        if done:
            # Terminal transition: there is no next state to bootstrap from.
            Qs[0, a] = reward
        else:
            Qs1 = model.predict(one_hot(s1))
            Qs[0, a] = reward + dis * np.max(Qs1)
        # Fit on the single (state, target) pair; only entry `a` of the target
        # differs from the network's own prediction.
        model.fit(x=one_hot(s), y=Qs)
        rAll += reward
        s = s1
    rList.append(rAll)
# sum(rList) / num_episodes is a ratio, not a percentage; scale it by 100 so
# the printed number agrees with the "%" suffix.
print("Percent of successful episode: " + str(100 * sum(rList) / num_episodes) + "%")
# One bar per episode showing that episode's total reward.
plt.bar(range(len(rList)), rList, color='blue')
plt.show()
# 2. TensorFlow 2.x, version 2
import tensorflow as tf
import numpy as np
from tensorflow import keras
from tensorflow.keras import layers
import gym
import matplotlib.pyplot as plt
def SumSqureError(y_true, y_pred):
    """Return the sum of squared differences between ``y_true`` and ``y_pred``.

    The result is a single scalar tensor: squared errors are summed over all
    elements rather than averaged.
    """
    return tf.reduce_sum(tf.square(y_true - y_pred))
def my_init(shape, dtype=None):
    """Return an array of the given ``shape`` drawn uniformly from [0, 0.01).

    ``dtype`` is accepted but not used; presumably it is there so the function
    matches the ``(shape, dtype)`` call signature of a Keras initializer.
    """
    return np.random.uniform(0, 0.01, shape)
env = gym.make("FrozenLake-v1")
# Row i of the identity matrix is the one-hot encoding of state i.
INPUT_ONE_HOT = np.identity(env.observation_space.n)

# One dense layer mapping a one-hot state vector to one Q-value per action.
model = keras.Sequential()
model.add(
    layers.Dense(
        env.action_space.n,
        # `input_shape` excludes the batch axis, so a one-hot state is (n,);
        # the previous (None, n) declared an extra leading axis that the
        # (1, n) inputs fed to predict/fit do not have.
        input_shape=(env.observation_space.n,),
        kernel_initializer=my_init,
    )
)
model.summary()
model.compile(
    optimizer=tf.keras.optimizers.SGD(learning_rate=0.1),
    loss=SumSqureError,
    metrics=['accuracy'],
)

dis = 0.99  # discount applied to the best Q-value of the next state
episodes = 2000
rList = []  # final-step reward of each finished episode
def one_hot(x):
    """Return row ``x`` of ``INPUT_ONE_HOT`` as a 2-D array with one row.

    Slicing with ``x:x + 1`` (instead of indexing with ``x``) keeps the leading
    axis, so the result can be passed to the model as a batch of one.
    """
    return INPUT_ONE_HOT[x:x + 1]
# Running sum of final-step rewards over all episodes.
rAll = 0
for i in range(episodes):
    state = env.reset()
    # Exploration rate: 0.1 for the first episode, decaying as i grows.
    e = 1.0 / ((i / 50) + 10)
    Done = False
    while not Done:
        # Q-values predicted for the current state.
        Qs = model.predict(one_hot(state))
        # Epsilon-greedy action selection.
        if np.random.rand(1) < e:
            action = env.action_space.sample()
        else:
            action = np.argmax(Qs)
        new_state, reward, Done, _ = env.step(action)
        # env.render()
        if Done:
            rList.append(reward)
            rAll += reward
            # Terminal transition: there is no next state to bootstrap from.
            Qs[0, action] = reward
        else:
            Qs[0, action] = reward + dis * np.max(model.predict(one_hot(new_state)))
        # Fit on the single (state, target) pair.
        model.fit(one_hot(state), Qs)
        state = new_state
    if (i % 200 == 0) and (i > 0):
        # i + 1 episodes have finished at this point, so that is the
        # denominator of the running success rate.
        print("Prgress : ", (i/episodes), " ==> ", (rAll / (i + 1)))
print("Success rate: " + str(rAll / episodes))
plt.bar(range(len(rList)), rList, color="blue")
plt.show()
env.close()
# 3. TensorFlow 1.x (via tensorflow.compat.v1)
import tensorflow.compat.v1 as tf
import numpy as np
import gym
import matplotlib.pyplot as plt
# Turn off TF2 behaviour first: the code below relies on placeholders and an
# explicit Session.
tf.disable_v2_behavior()

# --- Environment ------------------------------------------------------------
env = gym.make('FrozenLake-v1')
input_size, output_size = env.observation_space.n, env.action_space.n
# Identity matrix used as a lookup table: row i is the one-hot row for state i.
ONE_HOT = np.identity(input_size)

# --- Training settings and bookkeeping ---------------------------------------
episodes, dis = 2000, 0.99  # episode count, discount factor
rAll, rList = 0, []  # running reward total, per-episode final rewards
Done = False
# Network input: a single one-hot state row.
X = tf.placeholder(shape=[1, input_size], dtype=tf.float32)
# Weights start as small uniform values between 0 and 0.01
# (alternative initialiser: tf.random_normal([input_size, output_size])).
W = tf.Variable(tf.random_uniform([input_size, output_size], 0, 0.01))
# Target Q-values for the state fed through X.
Y = tf.placeholder(shape=[1, output_size], dtype=tf.float32)

# Predicted Q-values: a linear map of the state, with no bias term.
Qpred = tf.matmul(X, W)
# Sum of squared differences between target and prediction.
cost = tf.reduce_sum(tf.square(Y - Qpred))
optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.1)
train = optimizer.minimize(cost)

sess = tf.Session()
sess.run(tf.global_variables_initializer())
def one_hot(x):
    """Return row ``x`` of ``ONE_HOT`` as a 2-D array with one row.

    Slicing with ``x:x + 1`` (instead of indexing with ``x``) keeps the leading
    axis, giving the (1, input_size) shape the ``X`` placeholder is declared
    with.
    """
    return ONE_HOT[x:x + 1]
for i in range(episodes):
    state = env.reset()
    Done = False
    # Exploration rate: 0.1 for the first episode, decaying as i grows.
    e = 1. / ((i / 50) + 10)
    while not Done:
        # Q-values predicted for the current state.
        Qs = sess.run(Qpred, feed_dict={X: one_hot(state)})
        # Epsilon-greedy action selection.
        if np.random.rand(1) < e:
            action = env.action_space.sample()
        else:
            action = np.argmax(Qs)
        new_state, reward, Done, _ = env.step(action)
        # env.render()
        if Done:
            rAll += reward
            rList.append(reward)
            # Update Q, and no Qs+1, since it's a terminal state
            Qs[0, action] = reward
        else:
            # Obtain the Q_s1 values by feeding the new state through our
            # network
            Qs1 = sess.run(Qpred, feed_dict={X: one_hot(new_state)})
            # Update Q
            Qs[0, action] = reward + dis * np.max(Qs1)
        # One gradient-descent step towards the updated target.
        sess.run(train, feed_dict={X: one_hot(state), Y: Qs})
        state = new_state
    if (i % 200 == 0) and (i > 0):
        # i + 1 episodes have finished at this point, so that is the
        # denominator of the running success rate.
        print("Prgress : ", (i/episodes), " ==> ", (rAll / (i + 1)))
# rAll / episodes is a ratio; scale by 100 so the value matches the "Percent"
# label.
print("Percent of success of episodes : ", (100 * rAll / episodes))
plt.bar(range(len(rList)), rList, color="blue")
plt.show()
env.close()
# Release the TensorFlow session's resources once training is finished.
sess.close()