1. TensorFlow 2.x (version 1)
# Q-network for FrozenLake: a single Dense layer maps a one-hot encoded state
# to one Q-value per action and is trained online, one transition at a time.
#
# NOTE(review): the loop uses the classic Gym API (reset() returns the
# observation, step() returns a 4-tuple). Newer gym/gymnasium releases changed
# these signatures -- confirm against the installed version.
import gym
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt

# 'FrozenLake-v1' matches the other scripts in this file.
env = gym.make('FrozenLake-v1')

learning_rate = 0.1
input_size = env.observation_space.n
output_size = env.action_space.n
dis = .99  # discount factor applied to the best next-state Q-value
num_episodes = 2000

# Row k of the identity matrix is the one-hot encoding of state k.
ONE_HOT = np.identity(input_size)


def one_hot(x):
    """Return state index ``x`` as a ``(1, input_size)`` one-hot row."""
    return ONE_HOT[x:x + 1]


model = tf.keras.Sequential([
    tf.keras.layers.Dense(
        output_size,
        input_shape=[input_size],
        kernel_initializer=tf.random_uniform_initializer(minval=0, maxval=0.01),
    )
])
# tf.train.GradientDescentOptimizer is not part of the TF 2.x namespace;
# the Keras SGD optimizer is the plain gradient-descent equivalent.
model.compile(
    optimizer=tf.keras.optimizers.SGD(learning_rate=learning_rate),
    loss='mse',
)
model.summary()

rList = []  # total reward collected in each episode
for i in range(num_episodes):
    s = env.reset()
    e = 1.0 / ((i / 50) + 10)  # exploration probability, decays with episode
    rAll = 0
    done = False

    while not done:
        # verbose=0: otherwise Keras prints a progress bar on every step.
        Qs = model.predict(one_hot(s), verbose=0)

        # Epsilon-greedy action selection.
        if np.random.rand() < e:
            a = env.action_space.sample()
        else:
            a = np.argmax(Qs)

        s1, reward, done, _ = env.step(a)

        if done:
            # Terminal transition: there is no next state to bootstrap from.
            Qs[0, a] = reward
        else:
            Qs1 = model.predict(one_hot(s1), verbose=0)
            Qs[0, a] = reward + dis * np.max(Qs1)

        # One gradient step towards the updated target for state s.
        model.fit(x=one_hot(s), y=Qs, verbose=0)

        rAll += reward
        s = s1

    rList.append(rAll)

# The message ends in "%", so report a percentage rather than a 0-1 fraction.
print("Percent of successful episode: " + str(100 * sum(rList) / num_episodes) + "%")
plt.bar(range(len(rList)), rList, color='blue')
plt.show()
2. TensorFlow 2.x (version 2)
# Q-network for FrozenLake using Keras with a custom sum-of-squares loss and
# a custom kernel initializer. The network is a single Dense layer mapping a
# one-hot state to one Q-value per action, trained one transition at a time.
#
# NOTE(review): the loop uses the classic Gym API (reset() returns the
# observation, step() returns a 4-tuple) -- confirm against the installed gym.
import tensorflow as tf
import numpy as np
from tensorflow import keras
from tensorflow.keras import layers
import gym
import matplotlib.pyplot as plt


def SumSqureError(y_true, y_pred):
    """Return the sum of squared differences between target and prediction."""
    return tf.reduce_sum(tf.square(y_true - y_pred))


def my_init(shape, dtype=None):
    """Kernel initializer: uniform random values in [0, 0.01).

    ``dtype`` is accepted because Keras passes it, but it is not used.
    """
    return np.random.uniform(0, 0.01, shape)


env = gym.make("FrozenLake-v1")

# Row k of the identity matrix is the one-hot encoding of state k.
INPUT_ONE_HOT = np.identity(env.observation_space.n)


def one_hot(x):
    """Return state index ``x`` as a ``(1, n_states)`` one-hot row."""
    return INPUT_ONE_HOT[x:x + 1]


model = keras.Sequential()
# input_shape excludes the batch axis. The inputs fed below are (1, n_states)
# rows, so the per-sample shape is (n_states,), not (None, n_states).
model.add(layers.Dense(env.action_space.n,
                       input_shape=(env.observation_space.n,),
                       kernel_initializer=my_init))
model.summary()
# No 'accuracy' metric: the targets are real-valued Q-values, not class labels.
model.compile(optimizer=tf.keras.optimizers.SGD(learning_rate=0.1),
              loss=SumSqureError)

dis = 0.99  # discount factor applied to the best next-state Q-value
episodes = 2000
rList = []  # terminal reward of each episode
rAll = 0    # running sum of terminal rewards

for i in range(episodes):
    state = env.reset()
    e = 1.0 / ((i / 50) + 10)  # exploration probability, decays with episode
    Done = False

    while not Done:
        # verbose=0: otherwise Keras prints a progress bar on every step.
        Qs = model.predict(one_hot(state), verbose=0)

        # Epsilon-greedy action selection.
        if np.random.rand() < e:
            action = env.action_space.sample()
        else:
            action = np.argmax(Qs)

        new_state, reward, Done, _ = env.step(action)

        if Done:
            rList.append(reward)
            rAll += reward
            # Terminal transition: there is no next state to bootstrap from.
            Qs[0, action] = reward
        else:
            next_qs = model.predict(one_hot(new_state), verbose=0)
            Qs[0, action] = reward + dis * np.max(next_qs)

        # One gradient step towards the updated target for the current state.
        model.fit(one_hot(state), Qs, verbose=0)
        state = new_state

    if (i % 200 == 0) and (i > 0):
        print("Prgress : ", (i/episodes), " ==> ", (rAll / i))

print("Success rate: " + str(rAll / episodes))
plt.bar(range(len(rList)), rList, color="blue")
plt.show()
env.close()
3. TensorFlow 1.x (via tensorflow.compat.v1)
# Q-network for FrozenLake written against the TensorFlow 1.x graph API
# (through tensorflow.compat.v1). A single weight matrix W maps a one-hot
# state to one Q-value per action; it is trained one transition at a time.
#
# NOTE(review): the loop uses the classic Gym API (reset() returns the
# observation, step() returns a 4-tuple) -- confirm against the installed gym.
import tensorflow.compat.v1 as tf
import numpy as np
import gym
import matplotlib.pyplot as plt

tf.disable_v2_behavior()

env = gym.make('FrozenLake-v1')
input_size = env.observation_space.n
output_size = env.action_space.n

episodes = 2000
rAll = 0    # running sum of terminal rewards
rList = []  # terminal reward of each episode
dis = 0.99  # discount factor applied to the best next-state Q-value

# Row k of the identity matrix is the one-hot encoding of state k.
ONE_HOT = np.identity(input_size)


def one_hot(x):
    """Return state index ``x`` as a ``(1, input_size)`` one-hot row."""
    return ONE_HOT[x:x + 1]


# Graph definition: Qpred = X @ W, trained on the sum of squared errors.
X = tf.placeholder(shape=[1, input_size], dtype=tf.float32)
W = tf.Variable(tf.random_uniform([input_size, output_size], 0, 0.01))
Y = tf.placeholder(shape=[1, output_size], dtype=tf.float32)

Qpred = tf.matmul(X, W)
cost = tf.reduce_sum(tf.square(Y - Qpred))
train = tf.train.GradientDescentOptimizer(learning_rate=0.1).minimize(cost)

# The context manager closes the session (and frees its resources) even if
# the training loop raises.
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())

    for i in range(episodes):
        state = env.reset()
        Done = False
        e = 1. / ((i / 50) + 10)  # exploration probability, decays with episode

        while not Done:
            Qs = sess.run(Qpred, feed_dict={X: one_hot(state)})

            # Epsilon-greedy action selection.
            if np.random.rand() < e:
                action = env.action_space.sample()
            else:
                action = np.argmax(Qs)

            new_state, reward, Done, _ = env.step(action)

            if Done:
                rAll += reward
                rList.append(reward)
                # Terminal transition: there is no next state to bootstrap from.
                Qs[0, action] = reward
            else:
                # Bootstrap from the network's estimate for the next state.
                Qs1 = sess.run(Qpred, feed_dict={X: one_hot(new_state)})
                Qs[0, action] = reward + dis * np.max(Qs1)

            # One gradient step towards the updated target for this state.
            sess.run(train, feed_dict={X: one_hot(state), Y: Qs})
            state = new_state

        if (i % 200 == 0) and (i > 0):
            print("Prgress : ", (i/episodes), " ==> ", (rAll / i))

# The label says "Percent", so report a percentage rather than a 0-1 fraction.
print("Percent of success of episodes : ", (100 * rAll / episodes))
plt.bar(range(len(rList)), rList, color="blue")
plt.show()
env.close()
