1. Tensorflow 2.X Ver#1

import gym
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt

env = gym.make('FrozenLake-v1')


learning_rate = 0.1
input_size = env.observation_space.n
output_size = env.action_space.n
dis = .99
num_episodes = 2000

# one-hot encoding of the discrete state index into a (1, 16) row vector
ONE_HOT = np.identity(input_size)

def one_hot(x):
    return ONE_HOT[x:x + 1]


model = tf.keras.Sequential([
    tf.keras.layers.Dense(output_size, input_shape=[input_size],
                         kernel_initializer=tf.random_uniform_initializer(minval=0, maxval=0.01))
])
model.compile(optimizer=tf.keras.optimizers.SGD(learning_rate=learning_rate), loss='mse')
model.summary()


rList=[]
for i in range(num_episodes):
    s = env.reset()
    e = 1.0 / ((i/50)+10)
    rAll = 0
    done = False

    while not done:
        Qs = model.predict(one_hot(s), verbose=0)
        if np.random.rand(1) < e:
            a = env.action_space.sample()
        else:
            a = np.argmax(Qs)

        s1, reward, done, _ = env.step(a)
        if done:
            Qs[0, a] = reward
        else:
            Qs1 = model.predict(one_hot(s1), verbose=0)
            Qs[0, a] = reward + dis * np.max(Qs1)

        # one gradient step toward the updated Q target
        model.fit(x=one_hot(s), y=Qs, verbose=0)

        rAll += reward
        s = s1

    rList.append(rAll)

print("Percent of successful episodes: " + str(100 * sum(rList) / num_episodes) + "%")
plt.bar(range(len(rList)), rList, color='blue')
plt.show()
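
For reference, the update performed inside the loop above is the standard one-step Q-learning target: copy the predicted Q-values, overwrite the chosen action's entry with reward on a terminal step or with reward + dis * max Q(s1, .) otherwise, and fit the network toward that row. A minimal standalone sketch of that rule (the helper name q_target is my own and does not appear in the listing):

import numpy as np

def q_target(q_values, action, reward, q_next, done, dis=0.99):
    # q_values: (1, n_actions) prediction for the current state
    # q_next:   (1, n_actions) prediction for the next state (ignored when done)
    target = np.copy(q_values)
    target[0, action] = reward if done else reward + dis * np.max(q_next)
    return target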

2. Tensorflow 2.X Ver#2

import tensorflow as tf
import numpy as np
from tensorflow import keras
from tensorflow.keras import layers
import gym
import matplotlib.pyplot as plt



# sum-of-squares loss used in place of the built-in MSE
def SumSquareError(y_true, y_pred):
    return tf.reduce_sum(tf.square(y_true - y_pred))


# custom initializer: small uniform weights in [0, 0.01)
def my_init(shape, dtype=None):
    return np.random.uniform(0, 0.01, shape)



env = gym.make("FrozenLake-v1")
INPUT_ONE_HOT = np.identity(env.observation_space.n)

model = keras.Sequential()
model.add(layers.Dense(env.action_space.n, input_shape=(env.observation_space.n,), kernel_initializer=my_init))
model.summary()
model.compile(optimizer=tf.keras.optimizers.SGD(learning_rate=0.1), loss=SumSquareError)


dis = 0.99
episodes = 2000
rList = []


def one_hot(x):
    return INPUT_ONE_HOT[x:x + 1]


rAll = 0

for i in range(episodes):
    state = env.reset()

    e = 1.0 / ((i / 50) + 10)
    Done = False

    while not Done :
        Qs = model.predict(one_hot(state), verbose=0)
        
        if np.random.rand(1) < e:
            action = env.action_space.sample()
        else:
            action = np.argmax( Qs )

        new_state, reward, Done, _ = env.step(action)
        #env.render()


        if Done:
            rList.append(reward)
            rAll += reward
            Qs[0,action] = reward
        else :
            Qs[0,action] = reward + dis * np.max(model.predict(one_hot(new_state), verbose=0))
 
        
        model.fit(one_hot(state), Qs, verbose=0)
        state = new_state

    if (i % 200 == 0) and (i > 0):
        print("Progress : ", (i / episodes), "  ==> ", (rAll / i))



print("Success rate: " + str(rAll / episodes))
plt.bar(range(len(rList)), rList, color="blue")
plt.show()

env.close()
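
Calling model.predict and model.fit once per environment step is by far the slowest part of this version. As a rough sketch only (train_step is a name I am introducing; it assumes the model, SGD optimizer, and sum-of-squares loss built above), the same per-step update can be written directly with tf.GradientTape:

optimizer = tf.keras.optimizers.SGD(learning_rate=0.1)

@tf.function
def train_step(x, y):
    # x: (1, 16) one-hot state, y: (1, 4) row of updated Q targets
    with tf.GradientTape() as tape:
        pred = model(x, training=True)
        loss = tf.reduce_sum(tf.square(y - pred))
    grads = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(grads, model.trainable_variables))
    return loss

Inside the episode loop, model.predict(one_hot(state)) would then become model(one_hot(state)).numpy() and model.fit(one_hot(state), Qs) would become train_step(one_hot(state), Qs).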

3. Tensorflow 1.X

import tensorflow.compat.v1 as tf
import numpy as np
import gym
import matplotlib.pyplot as plt

tf.disable_v2_behavior()




env = gym.make('FrozenLake-v1')
input_size = env.observation_space.n
output_size = env.action_space.n

episodes = 2000
rAll = 0
rList = []
dis = 0.99
Done = False
ONE_HOT = np.identity(input_size)


X = tf.placeholder(shape=[1, input_size], dtype=tf.float32)
#W = tf.Variable(tf.random_normal([input_size, output_size]))
W = tf.Variable(tf.random_uniform([input_size, output_size], 0, 0.01))
Y = tf.placeholder(shape=[1, output_size], dtype=tf.float32)

Qpred = tf.matmul(X, W)
cost = tf.reduce_sum(tf.square(Y - Qpred))
train = tf.train.GradientDescentOptimizer(learning_rate = 0.1).minimize(cost)

sess = tf.Session()
sess.run(tf.global_variables_initializer())

def one_hot(x):
    return ONE_HOT[x:x + 1]

for i in range(episodes):
    
    state = env.reset()
    Done = False
    e = 1. / ((i / 50) + 10)
    
    while not Done :
        Qs = sess.run(Qpred, feed_dict={X: one_hot(state)})
        
        if np.random.rand(1) < e:
            action = env.action_space.sample()
        else:
            action = np.argmax(Qs)
            
        new_state, reward, Done, _ = env.step(action)
        #env.render()
        
        if Done:
            rAll += reward
            rList.append(reward)
            # Update Q, and no Qs+1, since it's a terminal state
            Qs[0, action] = reward
        else:
            # Obtain the Q_s1 values by feeding the new state through our
            # network
            Qs1 = sess.run(Qpred, feed_dict={X: one_hot(new_state)})
            # Update Q
            Qs[0, action] = reward + dis * np.max(Qs1)
            
        sess.run(train, feed_dict={X: one_hot(state), Y: Qs})
        state = new_state
    
    if (i % 200 == 0) and (i > 0):
        print("Progress : ", (i / episodes), "  ==> ", (rAll / i))

print("Success rate of episodes : ", (rAll / episodes))
plt.bar(range(len(rList)), rList, color="blue")
plt.show()

env.close()
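
Because the input is a one-hot state vector and the network is a single matmul with no bias, W is in effect the Q-table itself: row s of W holds Q(s, .). A small added sketch for inspecting the learned table after training (it relies on sess still being open, which it is, since the session is never closed):

learned_Q = sess.run(W)                             # shape (16, 4): one row per state
print(np.argmax(learned_Q, axis=1).reshape(4, 4))   # greedy action for each cell of the 4x4 grid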
