1. Implementation of Q-learning with the Bellman equation in a linear grid environment
# Q-learning with the Bellman update on a 1-D corridor of 5 states.
# State 4 is the goal; actions are 0 = left, 1 = right.
import numpy as np

# Environment (grid world)
n_states = 5                     # states 0, 1, 2, 3, 4
actions = [0, 1]                 # 0 = left, 1 = right
rewards = [-1, -1, -1, -1, 10]   # -1 per non-goal state, +10 at the goal
goal_state = 4

# Hyperparameters
alpha = 0.1      # learning rate
gamma = 0.9      # discount factor
epsilon = 0.1    # exploration probability
n_episodes = 1000

np.random.seed(0)  # reproducible runs (original was unseeded)

# Q-table: one row per state, one column per action.
q_table = np.zeros((n_states, len(actions)))

for episode in range(n_episodes):
    state = 0  # every episode starts at the left end
    while state != goal_state:
        # Epsilon-greedy action selection.
        if np.random.rand() < epsilon:
            action = np.random.choice(actions)
        else:
            action = np.argmax(q_table[state])
        # Deterministic transition; moving left from state 0 stays at 0.
        new_state = state + 1 if action == 1 else max(0, state - 1)
        reward = rewards[new_state]
        # Q-learning (Bellman) update.
        q_table[state, action] += alpha * (
            reward + gamma * np.max(q_table[new_state]) - q_table[state, action]
        )
        state = new_state

# Greedy rollout of the learned policy.
state = 0
path = [state]
while state != goal_state:
    action = np.argmax(q_table[state])
    state = state + 1 if action == 1 else max(0, state - 1)
    path.append(state)
print("Policy is", path)
-------------------------------------------------------------------------------
2. Creating a Q-table and finding the optimal path using Q-learning on a 4x4 grid
# Q-learning on a 4x4 grid: train a Q-table, then read off the greedy path.
import numpy as np
import random

grid_size = 4
goal_size = (3, 3)      # kept for backward compatibility (misnamed in original)
goal_state = goal_size  # goal cell (row, col)
actions = ['up', 'down', 'left', 'right']
action_map = {'up': (-1, 0), 'down': (1, 0), 'left': (0, -1), 'right': (0, 1)}

# Parameters
alpha = 0.1      # learning rate
gamma = 0.9      # discount factor
epsilon = 0.1    # exploration rate
episodes = 1000

random.seed(0)   # reproducible runs (original was unseeded)

# Q-table: one (row, col) cell per state, one entry per action.
q_table = np.zeros((grid_size, grid_size, len(actions)))


def get_next_state(state, action):
    """Apply `action` to `state`; moves off the grid leave the state unchanged."""
    r, c = state
    dr, dc = action_map[action]
    new_r, new_c = r + dr, c + dc
    if 0 <= new_r < grid_size and 0 <= new_c < grid_size:
        return (new_r, new_c)
    return state  # out of bounds: stay in the same state


def get_reward(state):
    """+10 for being at the goal, 0 everywhere else."""
    return 10 if state == goal_state else 0


# Q-learning training loop.
for episode in range(episodes):
    state = (0, 0)
    while state != goal_state:
        # Epsilon-greedy action selection.
        if random.uniform(0, 1) < epsilon:
            action = random.choice(actions)                           # explore
        else:
            action = actions[np.argmax(q_table[state[0], state[1]])]  # exploit
        # Take action, observe reward.
        next_state = get_next_state(state, action)
        reward = get_reward(next_state)
        # Bellman update of the Q-value.
        action_index = actions.index(action)
        best_next_q = np.max(q_table[next_state[0], next_state[1]])
        q_table[state[0], state[1], action_index] += alpha * (
            reward + gamma * best_next_q
            - q_table[state[0], state[1], action_index]
        )
        state = next_state

print("Trained Q-table:")
print(q_table)

# Greedy rollout of the learned policy.
state = (0, 0)
path = [state]
while state != goal_state:
    action = actions[np.argmax(q_table[state[0], state[1]])]
    state = get_next_state(state, action)
    path.append(state)
print("Optimal path:")
print(path)
--------------------------------------------------------------------------
3. Solving a game problem (coin, enemy) using an MDP — model-based approach with value iteration
# Model-based solution of a tiny coin/enemy game as an MDP (value iteration).
import numpy as np

states = ['Start', 'Coin', 'Enemy']
actions = ['Go_Coin', 'Go_Enemy']

# Immediate reward for taking an action in a state ('Coin'/'Enemy' are terminal).
transition_rewards = {
    'Start': {'Go_Coin': 10, 'Go_Enemy': -5},
    'Coin': {},
    'Enemy': {},
}
# Deterministic transition model: P(next_state | state, action).
transition_probs = {
    'Start': {'Go_Coin': {'Coin': 1.0}, 'Go_Enemy': {'Enemy': 1.0}},
    'Coin': {},
    'Enemy': {},
}

gamma = 0.9    # discount factor
theta = 0.001  # convergence threshold

value_function = {state: 0 for state in states}

# Value iteration: sweep until the largest single-state update is below theta.
while True:
    delta = 0
    new_value_function = value_function.copy()
    for state in states:
        if state in ['Coin', 'Enemy']:
            continue  # terminal states keep value 0
        state_values = []
        for action in actions:
            value = 0
            for next_state, prob in transition_probs[state][action].items():
                reward = transition_rewards[state][action]
                value += prob * (reward + gamma * value_function[next_state])
            state_values.append(value)
        new_value_function[state] = max(state_values)
        delta = max(delta, abs(new_value_function[state] - value_function[state]))
    value_function = new_value_function
    if delta < theta:
        break

# Greedy policy extraction; terminal states get the marker 'T'.
policy = {}
for state in states:
    if state in ['Coin', 'Enemy']:
        policy[state] = 'T'
        continue
    action_values = {}
    for action in actions:
        value = 0
        for next_state, prob in transition_probs[state][action].items():
            reward = transition_rewards[state][action]
            value += prob * (reward + gamma * value_function[next_state])
        action_values[action] = value
    policy[state] = max(action_values, key=action_values.get)

print("Optimal value function:", value_function)
print("Optimal policy:", policy)
-----------------------------------------------------------------------------------------
4. Creating a value table for the grid problem using MDP (value iteration)
# Value iteration (MDP) on a 4x4 grid: compute the optimal state-value table.
import numpy as np
import random

grid_size = 4
goal_size = (3, 3)      # kept for backward compatibility (misnamed in original)
goal_state = goal_size

actions = ['up', 'down', 'left', 'right']
action_map = {'up': (-1, 0), 'down': (1, 0), 'left': (0, -1), 'right': (0, 1)}

# Parameters
theta = 1e-4   # convergence threshold (original comment mislabelled it "learning rate")
gamma = 0.9    # discount factor

rewards = np.zeros((grid_size, grid_size))
rewards[goal_state] = 10  # reward only when entering the goal cell


def get_next_state(state, action):
    """Apply `action` to `state`; moves off the grid leave the state unchanged."""
    r, c = state
    dr, dc = action_map[action]
    new_r, new_c = r + dr, c + dc
    if 0 <= new_r < grid_size and 0 <= new_c < grid_size:
        return (new_r, new_c)
    return state  # out of bounds: stay in the same state


value_table = np.zeros((grid_size, grid_size))

# Sweep until the largest single-state update falls below theta.
while True:
    delta = 0
    new_value_table = np.copy(value_table)
    for r in range(grid_size):
        for c in range(grid_size):
            state = (r, c)
            if state == goal_state:
                continue  # terminal state keeps value 0
            # Bellman optimality backup over all four actions.
            state_values = []
            for action in actions:
                next_state = get_next_state(state, action)
                reward = rewards[next_state]
                state_values.append(reward + gamma * value_table[next_state])
            new_value_table[state] = max(state_values)
            delta = max(delta, abs(new_value_table[state] - value_table[state]))
    value_table = new_value_table
    if delta < theta:
        break

print("optimal value table:")
print(value_table)
--------------------------------------------------------------------------
5. Using Q-learning together with MDP-style value estimates on the grid problem
import numpy as np
import random
# Environment setup
grid_size = 4
goal_state = (3,3)
actions = ['up', 'down', 'left', 'right']
action_map = {'up': (-1, 0), 'down': (1, 0), 'left': (0, -1), 'right': (0, 1)} # Positional changes due to action
print(action_map)
# Parameters
alpha = 0.1 # learning factor
gamma = 0.9 # discount factor (for future rewards)
epsilon = 0.1 # exploration factor
episodes = 1000 # no of training episodes/iterations
theta = 1e-4 # Convergence threshold for value iteration
# Reward and table initialization
rewards = np.zeros((grid_size, grid_size))
rewards[goal_state] = 10
q_table = np.zeros((grid_size, grid_size, len(actions))) # (3D) 16 X 16
value_table = np.zeros((grid_size, grid_size)) # 4 X 4
# Helper functions
def get_next_state(state, action):
row, col = state
d_row, d_col = action_map[action]
new_row = row + d_row
new_col = col + d_col
if 0<=new_row<grid_size and 0<=new_col<grid_size:
return (new_row, new_col)
else:
return state # if out of bounds, stay in the same state
# Q-learning with Value Iteration Influence
for episode in range(episodes):
state = (0,0) # start state
while state != goal_state:
# choose action (epsilon-greedy)
if random.uniform(0,1) < epsilon:
action = random.choice(actions) # Explore
else:
action = actions[np.argmax(q_table[state[0], state[1]])] # Exploit
# Take action
next_state = get_next_state(state, action)
reward = rewards[next_state]
# Q-learning update ( Bellman equation influences here)
action_index = actions.index(action)
best_next_q = np.max(q_table[next_state[0], next_state[1]])
q_table[state[0], state[1], action_index] += alpha * (reward + gamma *
best_next_q - q_table[state[0], state[1], action_index]) # Formula for Q-value
# Update Value Table using Bellman Equation
state_values = []
for a in actions:
next_s = get_next_state(state, a)
state_values.append(rewards[next_s] + gamma * np.max(q_table[next_s[0], next_s[1]])) # Bellman Equation using Q-value
value_table[state] = max(state_values)
# Move to next state
state = next_state
# Derive Policy from Q-learning
policy = np.zeros((grid_size, grid_size), dtype=str)
for r in range(grid_size):
for c in range(grid_size):
state = (r, c)
if state == goal_state:
policy[state] = 'G' # Goal
continue
action_index = np.argmax(q_table[state[0], state[1]])
policy[state] = actions[action_index][0].upper()
print('Q-learning Q-Table:')
print(q_table)
print('\n Value Table (Updated with Bellman Influence): ')
print(value_table)
print('\n Optimal Policy from Q-learning: ')
for row in policy:
print(' '.join(row))
---------------------------------------------------------------------------------------------
6. Demonstrate solving a 3x3 grid problem using the SARSA approach
# SARSA (on-policy TD control) on a 3x3 grid; the goal is cell (0, 2).
import random

ROWS, COLS = 3, 3
ACTIONS = ["UP", "DOWN", "LEFT", "RIGHT"]

# Q-table: every cell maps each action to its current value estimate.
Q = {}
for r in range(ROWS):
    for c in range(COLS):
        Q[(r, c)] = {a: 0.0 for a in ACTIONS}
print(Q)

alpha = 0.1      # learning rate
gamma = 0.9      # discount factor
epsilon = 0.2    # exploration rate
episodes = 100

random.seed(0)   # reproducible runs (original was unseeded)


def choose_action(state):
    """Epsilon-greedy action selection from the current Q-table."""
    if random.random() < epsilon:
        return random.choice(ACTIONS)
    return max(Q[state], key=Q[state].get)


def step(state, action):
    """Apply `action`; return (next_state, reward, done). Goal (0, 2) pays +10, else -1."""
    r, c = state
    if action == "UP" and r > 0:
        r -= 1
    elif action == "DOWN" and r < ROWS - 1:
        r += 1
    elif action == "LEFT" and c > 0:
        c -= 1
    elif action == "RIGHT" and c < COLS - 1:
        c += 1
    next_state = (r, c)
    if next_state == (0, 2):
        return next_state, 10, True
    else:
        return next_state, -1, False


# SARSA: update towards the action actually taken next (on-policy).
for ep in range(episodes):
    state = (0, 0)
    action = choose_action(state)
    done = False
    while not done:
        next_state, reward, done = step(state, action)
        if not done:
            next_action = choose_action(next_state)
            next_q = Q[next_state][next_action]
        else:
            next_action = None
            next_q = 0  # terminal state has no future value
        Q[state][action] += alpha * (reward + gamma * next_q - Q[state][action])
        state = next_state
        action = next_action

print("Learned Q-values:\n")
for state in Q:
    print(state, Q[state])

# Greedy policy per state.
policy = {}
for state in Q:
    policy[state] = max(Q[state], key=Q[state].get)
print("\nPolicy Table(Best action per state):")
for state in sorted(policy):
    print(state, "->", policy[state])
---------------------------------------------------------------------------
7. Demonstration of the Monte Carlo method on a grid problem
# Every-visit Monte Carlo evaluation on a 3x3 grid.
# Only 'down'/'right' moves are allowed, so every episode reaches (2, 2).
# Q(s, a) estimates the expected number of steps remaining to the goal.
import random
from collections import defaultdict

num_simulations = 10000
grid_size = (3, 3)

Q = defaultdict(lambda: defaultdict(float))
returns = defaultdict(lambda: defaultdict(list))
actions = {"down": (1, 0), "right": (0, 1)}

random.seed(0)  # reproducible runs (original was unseeded)

for _ in range(num_simulations):
    # Pick a random non-goal start cell.
    start_x = random.randint(0, grid_size[0] - 1)
    start_y = random.randint(0, grid_size[1] - 1)
    current_position = (start_x, start_y)
    while current_position == (2, 2):
        start_x = random.randint(0, grid_size[0] - 1)
        start_y = random.randint(0, grid_size[1] - 1)
        current_position = (start_x, start_y)

    # Roll out a uniformly random episode until the goal is reached.
    episode = []
    path_length = 0
    while current_position != (2, 2):
        valid_actions = []
        for action, (dx, dy) in actions.items():
            new_x, new_y = current_position[0] + dx, current_position[1] + dy
            if 0 <= new_x < grid_size[0] and 0 <= new_y < grid_size[1]:
                valid_actions.append(action)
        chosen_action = random.choice(valid_actions)
        dx, dy = actions[chosen_action]
        next_position = current_position[0] + dx, current_position[1] + dy
        episode.append((current_position, chosen_action))
        current_position = next_position
        path_length += 1

    # Credit each (state, action) with the steps REMAINING from that point.
    # (The original credited the whole episode length to every pair and also
    # skipped appending duplicate return values, which biased the averages.)
    for i, (state, action) in enumerate(episode):
        G = path_length - i
        returns[state][action].append(G)
        Q[state][action] = sum(returns[state][action]) / len(returns[state][action])

# Report (renamed loop variable: the original shadowed the `actions` map).
for state, action_values in Q.items():
    for action, value in action_values.items():
        print(f"state: {state},Action: {action}:{value:.2f}")
-----------------------------------------------------------------------------------
Practical 11 – Demonstrate TD Learning [TD(0)]
# Single-episode TD(0) demo on the chain A -> B -> C -> Goal.
import numpy as np

# States and initial value estimates.
states = ["A", "B", "C", "Goal"]
V = {s: 0 for s in states}  # all values start at 0
V["Goal"] = 1               # terminal state pinned to 1
# NOTE(review): with V["Goal"] = 1 AND a +1 reward on (C, Goal), the goal is
# effectively counted twice; conventionally a terminal state has value 0.

# Parameters
alpha = 0.5  # learning rate
gamma = 1    # no discounting

# One simulated episode and its only non-zero reward.
episode = [("A", "B"), ("B", "C"), ("C", "Goal")]
rewards = {("C", "Goal"): 1}

# TD(0) update: V(s) <- V(s) + alpha * (r + gamma * V(s') - V(s))
for (s, s_next) in episode:
    reward = rewards.get((s, s_next), 0)
    td_error = reward + gamma * V[s_next] - V[s]
    V[s] += alpha * td_error

print("TD(0) Updated Values:", V)
# Multi-episode TD(0) on the fixed chain A -> B -> C -> Goal.
import numpy as np

# States and initial value estimates.
states = ["A", "B", "C", "Goal"]
V = {s: 0 for s in states}
V["Goal"] = 1  # terminal state pinned to 1
# NOTE(review): a +1 reward on (C, Goal) plus V["Goal"] = 1 double-counts the
# goal; conventionally terminal states are valued 0.

# Parameters
alpha = 0.1
gamma = 0.8
n_episodes = 5

# Deterministic transitions and the single non-zero reward.
transitions = {"A": "B", "B": "C", "C": "Goal"}
rewards = {("C", "Goal"): 1}

for episodes in range(n_episodes):
    state = "A"
    while state != "Goal":
        next_state = transitions[state]
        # TD(0) update: V(s) <- V(s) + alpha * (r + gamma * V(s') - V(s))
        reward = rewards.get((state, next_state), 0)
        td_error = reward + gamma * V[next_state] - V[state]
        V[state] += alpha * td_error
        state = next_state
    print(V)

print("TD(0) Updated Values:", n_episodes, ":", V)