Not a member of gistpad yet?
Sign Up,
it unlocks many cool features!
- 1. Implementation of Q-learning with the Bellman equation in a linear grid environment
import numpy as np

# --- Environment: a 1-D corridor of 5 states with the goal at the right end ---
n_states = 5                      # states 0..4
actions = [0, 1]                  # 0 = step left, 1 = step right
rewards = [-1, -1, -1, -1, 10]    # -1 per step, +10 for reaching the goal
goal_state = 4

# --- Hyper-parameters ---
alpha = 0.1      # learning rate
gamma = 0.9      # discount factor
epsilon = 0.1    # exploration probability

# Q-table: one row per state, one column per action.
q_table = np.zeros((n_states, len(actions)))

n_episodes = 1000
for episode in range(n_episodes):
    state = 0  # every episode starts at the leftmost cell
    while state != goal_state:
        # Epsilon-greedy action selection.
        if np.random.rand() < epsilon:
            action = np.random.choice(actions)   # explore
        else:
            action = np.argmax(q_table[state])   # exploit

        # Deterministic transition: right adds 1, left subtracts 1 (floored at 0).
        next_state = state + 1 if action == 1 else max(0, state - 1)
        reward = rewards[next_state]

        # Q-learning (Bellman) update toward the one-step TD target.
        td_target = reward + gamma * np.max(q_table[next_state])
        q_table[state, action] += alpha * (td_target - q_table[state, action])

        state = next_state

# Greedy rollout from the start state to read off the learned policy.
state = 0
path = [state]
while state != goal_state:
    action = np.argmax(q_table[state])
    state = state + 1 if action == 1 else max(0, state - 1)
    path.append(state)
print("Policy is", path)
- -------------------------------------------------------------------------------
- 2. Creating a Q-table and finding the optimal path using Q-learning on a 4x4 grid
import numpy as np
import random

# --- 4x4 grid world with the goal in the bottom-right corner ---
grid_size = 4
goal_size = (3, 3)        # kept for compatibility; same cell as goal_state
goal_state = goal_size
actions = ['up', 'down', 'left', 'right']
action_map = {'up': (-1, 0), 'down': (1, 0), 'left': (0, -1), 'right': (0, 1)}

# --- Hyper-parameters ---
alpha = 0.1      # learning rate
gamma = 0.9      # discount factor
epsilon = 0.1    # exploration rate
episodes = 1000

# Q-table indexed by (row, col, action).
q_table = np.zeros((grid_size, grid_size, len(actions)))


def get_next_state(state, action):
    """Apply `action` to `state`; stay in place when the move leaves the grid."""
    row, col = state
    d_row, d_col = action_map[action]
    candidate = (row + d_row, col + d_col)
    if 0 <= candidate[0] < grid_size and 0 <= candidate[1] < grid_size:
        return candidate
    return state


def get_reward(state):
    """Return +10 for the goal cell and 0 everywhere else."""
    return 10 if state == goal_state else 0


# --- Q-learning training loop ---
for episode in range(episodes):
    state = (0, 0)
    while state != goal_state:
        # Epsilon-greedy action selection.
        if random.uniform(0, 1) < epsilon:
            action = random.choice(actions)                           # explore
        else:
            action = actions[np.argmax(q_table[state[0], state[1]])]  # exploit

        next_state = get_next_state(state, action)
        reward = get_reward(next_state)

        # Bellman update for the chosen (state, action) entry.
        action_index = actions.index(action)
        best_next_q = np.max(q_table[next_state[0], next_state[1]])
        current_q = q_table[state[0], state[1], action_index]
        q_table[state[0], state[1], action_index] = current_q + alpha * (
            reward + gamma * best_next_q - current_q
        )
        state = next_state

print("Trained Q-table:")
print(q_table)

# --- Greedy rollout from the top-left corner ---
state = (0, 0)
path = [state]
while state != goal_state:
    action = actions[np.argmax(q_table[state[0], state[1]])]
    state = get_next_state(state, action)
    path.append(state)
print("Optimal path:")
print(path)
- --------------------------------------------------------------------------
- 3. Solving a game problem (coin, enemy) with an MDP using value iteration (model-based approach)
import numpy as np

# --- MDP definition: tiny game with a coin (good) and an enemy (bad) ---
states = ['Start', 'Coin', 'Enemy']
actions = ['Go_Coin', 'Go_Enemy']

# Immediate reward for taking an action in a state ('Coin'/'Enemy' are terminal).
transition_rewards = {
    'Start': {'Go_Coin': 10, 'Go_Enemy': -5},
    'Coin': {},
    'Enemy': {}
}
# Deterministic transition probabilities P(next_state | state, action).
transition_probs = {
    'Start': {'Go_Coin': {'Coin': 1.0}, 'Go_Enemy': {'Enemy': 1.0}},
    'Coin': {},
    'Enemy': {}
}

gamma = 0.9    # discount factor
theta = 0.001  # convergence threshold for value iteration

value_function = {state: 0 for state in states}

# --- Value iteration: V(s) = max_a sum_s' P(s'|s,a) * (R(s,a) + gamma * V(s')) ---
# Fix vs. original: the per-iteration debug prints (reward, prob, partial sums,
# "s_v"/"n_v"/"v_f") were removed; they cluttered the output without adding value.
while True:
    delta = 0
    new_value_function = value_function.copy()
    for state in states:
        if state in ['Coin', 'Enemy']:
            continue  # terminal states keep their value (0)
        state_values = []
        for action in actions:
            value = 0
            for next_state, prob in transition_probs[state][action].items():
                reward = transition_rewards[state][action]
                value += prob * (reward + gamma * value_function[next_state])
            state_values.append(value)
        new_value_function[state] = max(state_values)
        delta = max(delta, abs(new_value_function[state] - value_function[state]))
    value_function = new_value_function
    if delta < theta:
        break

# --- Greedy policy extraction from the converged value function ---
policy = {}
for state in states:
    if state in ['Coin', 'Enemy']:
        policy[state] = 'T'  # terminal marker
        continue
    action_values = {}
    for action in actions:
        value = 0
        for next_state, prob in transition_probs[state][action].items():
            reward = transition_rewards[state][action]
            value += prob * (reward + gamma * value_function[next_state])
        action_values[action] = value
    policy[state] = max(action_values, key=action_values.get)

print("Optimal value function:", value_function)
print("Optimal policy:", policy)
- -----------------------------------------------------------------------------------------
- 4. Computing the optimal value table for a grid problem using MDP value iteration
import numpy as np
import random  # present in the original script; not used below

# --- 4x4 grid world with the goal in the bottom-right corner ---
grid_size = 4
goal_size = (3, 3)        # kept for compatibility; same cell as goal_state
goal_state = goal_size
actions = ['up', 'down', 'left', 'right']
action_map = {'up': (-1, 0), 'down': (1, 0), 'left': (0, -1), 'right': (0, 1)}

# --- Parameters ---
theta = 1e-4  # convergence threshold for value iteration
gamma = 0.9   # discount factor

# Reward grid: +10 on the goal cell, 0 elsewhere.
rewards = np.zeros((grid_size, grid_size))
rewards[goal_state] = 10


def get_next_state(state, action):
    """Apply `action` to `state`; stay in place when the move leaves the grid."""
    r, c = state
    dr, dc = action_map[action]
    nr, nc = r + dr, c + dc
    in_bounds = 0 <= nr < grid_size and 0 <= nc < grid_size
    return (nr, nc) if in_bounds else state


value_table = np.zeros((grid_size, grid_size))

# --- Value iteration: sweep every state until the largest change < theta ---
while True:
    delta = 0
    new_value_table = np.copy(value_table)
    for r in range(grid_size):
        for c in range(grid_size):
            state = (r, c)
            if state == goal_state:
                continue  # goal is terminal; its value stays 0
            # Bellman backup: best one-step lookahead over all actions.
            state_values = []
            for action in actions:
                nxt = get_next_state(state, action)
                state_values.append(rewards[nxt] + gamma * value_table[nxt])
            new_value_table[state] = max(state_values)
            delta = max(delta, abs(new_value_table[state] - value_table[state]))
    value_table = new_value_table
    if delta < theta:
        break

print("optimal value table:")
print(value_table)
- --------------------------------------------------------------------------
- 5. Using Q-learning together with MDP value updates on a grid problem
import numpy as np
import random

# --- Environment setup: 4x4 grid, goal in the bottom-right corner ---
grid_size = 4
goal_state = (3, 3)
actions = ['up', 'down', 'left', 'right']
action_map = {'up': (-1, 0), 'down': (1, 0), 'left': (0, -1), 'right': (0, 1)}  # row/col deltas per action
print(action_map)

# --- Hyper-parameters ---
alpha = 0.1      # learning rate
gamma = 0.9      # discount factor for future rewards
epsilon = 0.1    # exploration probability
episodes = 1000  # number of training episodes
theta = 1e-4     # convergence threshold (carried over from the value-iteration variant)

# --- Reward grid and tables ---
rewards = np.zeros((grid_size, grid_size))
rewards[goal_state] = 10
q_table = np.zeros((grid_size, grid_size, len(actions)))  # Q(s, a): 4 x 4 x 4
value_table = np.zeros((grid_size, grid_size))            # V(s): 4 x 4


def get_next_state(state, action):
    """Apply `action` to `state`; stay in place when the move leaves the grid."""
    row, col = state
    d_row, d_col = action_map[action]
    nxt = (row + d_row, col + d_col)
    if 0 <= nxt[0] < grid_size and 0 <= nxt[1] < grid_size:
        return nxt
    return state


# --- Q-learning, refreshing V(s) with a Bellman backup after every step ---
for episode in range(episodes):
    state = (0, 0)  # start state
    while state != goal_state:
        # Epsilon-greedy action selection.
        if random.uniform(0, 1) < epsilon:
            action = random.choice(actions)                           # explore
        else:
            action = actions[np.argmax(q_table[state[0], state[1]])]  # exploit

        next_state = get_next_state(state, action)
        reward = rewards[next_state]

        # Q-learning update (Bellman equation on Q-values).
        action_index = actions.index(action)
        best_next_q = np.max(q_table[next_state[0], next_state[1]])
        q_table[state[0], state[1], action_index] += alpha * (
            reward + gamma * best_next_q
            - q_table[state[0], state[1], action_index]
        )

        # Refresh V(state) from the current Q-table via one-step lookahead.
        state_values = []
        for a in actions:
            next_s = get_next_state(state, a)
            state_values.append(
                rewards[next_s] + gamma * np.max(q_table[next_s[0], next_s[1]])
            )
        value_table[state] = max(state_values)

        state = next_state

# --- Derive a greedy policy (one letter per cell) from the Q-table ---
policy = np.zeros((grid_size, grid_size), dtype=str)
for r in range(grid_size):
    for c in range(grid_size):
        state = (r, c)
        if state == goal_state:
            policy[state] = 'G'  # goal marker
            continue
        action_index = np.argmax(q_table[state[0], state[1]])
        policy[state] = actions[action_index][0].upper()

print('Q-learning Q-Table:')
print(q_table)
print('\n Value Table (Updated with Bellman Influence): ')
print(value_table)
print('\n Optimal Policy from Q-learning: ')
for row in policy:
    print(' '.join(row))
- ---------------------------------------------------------------------------------------------
- 6. Demonstrate solving a 3x3 grid problem using the SARSA approach
import random

# --- 3x3 grid; the episode ends at cell (0, 2), which pays +10 ---
ROWS, COLS = 3, 3
ACTIONS = ["UP", "DOWN", "LEFT", "RIGHT"]

# Q-table as a nested dict: Q[state][action] -> value.
Q = {}
for r in range(ROWS):
    for c in range(COLS):
        Q[(r, c)] = {a: 0.0 for a in ACTIONS}
print(Q)

alpha = 0.1    # learning rate
gamma = 0.9    # discount factor
epsilon = 0.2  # exploration rate
episodes = 100


def choose_action(state):
    """Epsilon-greedy: random action with prob epsilon, else the best-known one."""
    if random.random() < epsilon:
        return random.choice(ACTIONS)
    return max(Q[state], key=Q[state].get)


def step(state, action):
    """Move within the grid and return (next_state, reward, done)."""
    r, c = state
    if action == "UP" and r > 0:
        r -= 1
    elif action == "DOWN" and r < ROWS - 1:
        r += 1
    elif action == "LEFT" and c > 0:
        c -= 1
    elif action == "RIGHT" and c < COLS - 1:
        c += 1
    next_state = (r, c)
    if next_state == (0, 2):
        return next_state, 10, True  # reached the terminal cell
    return next_state, -1, False     # -1 per step encourages short paths


# --- SARSA: on-policy update with the action actually taken next ---
for ep in range(episodes):
    state = (0, 0)
    action = choose_action(state)
    done = False
    while not done:
        next_state, reward, done = step(state, action)
        if not done:
            next_action = choose_action(next_state)
            next_q = Q[next_state][next_action]
        else:
            next_action = None
            next_q = 0  # no future value beyond the terminal state
        Q[state][action] += alpha * (reward + gamma * next_q - Q[state][action])
        state, action = next_state, next_action

print("Learned Q-values:\n")
for state in Q:
    print(state, Q[state])

# Greedy policy: the best-known action in each state.
policy = {}
for state in Q:
    policy[state] = max(Q[state], key=Q[state].get)
print("\nPolicy Table(BEst action per state):")
for state in sorted(policy):
    print(state, "->", policy[state])
- ---------------------------------------------------------------------------
- 7. Demonstration of the Monte Carlo method on a grid problem
import random
from collections import defaultdict

# --- Monte Carlo evaluation on a 3x3 grid with goal (2, 2) ---
# Only "down" and "right" moves exist, so every random episode terminates.
num_simulations = 10000
grid_size = (3, 3)

# Q[state][action]: average sampled return for that state-action pair.
Q = defaultdict(lambda: defaultdict(float))
# returns[state][action]: list of sampled returns for that pair.
returns = defaultdict(lambda: defaultdict(list))
actions = {"down": (1, 0), "right": (0, 1)}

for _ in range(num_simulations):
    # Draw a random non-goal start cell.
    start_x = random.randint(0, grid_size[0] - 1)
    start_y = random.randint(0, grid_size[1] - 1)
    current_position = (start_x, start_y)
    while current_position == (2, 2):
        start_x = random.randint(0, grid_size[0] - 1)
        start_y = random.randint(0, grid_size[1] - 1)
        current_position = (start_x, start_y)

    # Roll out a uniformly random episode until the goal is reached.
    episode = []
    path_length = 0
    while current_position != (2, 2):
        valid_actions = []
        for action, (dx, dy) in actions.items():
            new_x, new_y = current_position[0] + dx, current_position[1] + dy
            if 0 <= new_x < grid_size[0] and 0 <= new_y < grid_size[1]:
                valid_actions.append(action)
        chosen_action = random.choice(valid_actions)
        dx, dy = actions[chosen_action]
        next_position = current_position[0] + dx, current_position[1] + dy
        episode.append((current_position, chosen_action))
        current_position = next_position
        path_length += 1

    # First-visit Monte Carlo update.
    # Fix vs. original: the old code tested `if G not in returns[state][action]`,
    # which deduplicates equal return VALUES (biasing the averages) instead of
    # recording the return once per (state, action) pair visited in the episode.
    # NOTE(review): G is the total episode length for every pair, as in the
    # original; textbook MC would use the reward-to-go from each visit instead.
    G = path_length
    seen = set()
    for state, action in episode:
        if (state, action) in seen:
            continue  # only the first visit in this episode counts
        seen.add((state, action))
        returns[state][action].append(G)
        Q[state][action] = sum(returns[state][action]) / len(returns[state][action])

# Renamed the loop variable so it no longer shadows the global `actions` dict.
for state, action_values in Q.items():
    for action, value in action_values.items():
        print(f"state: {state},Action: {action}:{value:.2f}")
- -----------------------------------------------------------------------------------
- Practical 11 – Demonstrate TD learning [TD(0)]
# --- TD(0) on a single hand-written episode ---
import numpy as np

# States and their value estimates.
states = ["A", "B", "C", "Goal"]
V = {s: 0 for s in states}
V["Goal"] = 1  # terminal state's value is fixed at 1

# Parameters
alpha = 0.5  # learning rate
gamma = 1    # no discounting

# One simulated episode: A -> B -> C -> Goal; reward only on the final step.
episode = [("A", "B"), ("B", "C"), ("C", "Goal")]
rewards = {("C", "Goal"): 1}

# TD(0): V(s) <- V(s) + alpha * (r + gamma * V(s') - V(s))
for s, s_next in episode:
    r = rewards.get((s, s_next), 0)
    V[s] = V[s] + alpha * (r + gamma * V[s_next] - V[s])

print("TD(0) Updated Values:", V)
# --- TD(0) over several episodes on the same deterministic chain ---
import numpy as np

states = ["A", "B", "C", "Goal"]
V = {s: 0 for s in states}
V["Goal"] = 1  # fixed terminal value

# Parameters
alpha = 0.1
gamma = 0.8
n_episodes = 5

# Deterministic chain A -> B -> C -> Goal; reward 1 on the final transition.
transitions = {"A": "B", "B": "C", "C": "Goal"}
rewards = {("C", "Goal"): 1}

for episodes in range(n_episodes):  # loop-variable name kept from the original
    state = "A"
    while state != "Goal":
        next_state = transitions[state]
        # TD(0) update using the just-observed transition.
        reward = rewards.get((state, next_state), 0)
        V[state] += alpha * (reward + gamma * V[next_state] - V[state])
        state = next_state
    print(V)

print("TD(0) Updated Values:", n_episodes, ":", V)
RAW Gist Data
Copied
