# =====================================================================
# NOTE(review): this file is a whitespace-mangled paste of several RL
# practicals (originally separate notebook cells).  It has been
# re-formatted into runnable Python.  Where the extraction LOST text
# (practical 2's training loop, practicals 3-5 entirely, practical 6's
# set-up), a standard reconstruction is provided and clearly marked —
# verify those spans against the original notebooks.
# =====================================================================

# ---------------------------------------------------------------------
# 1. Q-learning with the Bellman equation on a linear (1-D) gridworld.
# ---------------------------------------------------------------------
import numpy as np

n_states = 5                     # states 0..4
actions = [0, 1]                 # 0 = left, 1 = right
rewards = [-1, -1, -1, -1, 10]   # reward received on *entering* each state
goal_state = 4
alpha = 0.1                      # learning rate
gamma = 0.9                      # discount factor
epsilon = 0.1                    # exploration probability

# Q-table: one row per state, one column per action.
q_table = np.zeros((n_states, len(actions)))

n_episodes = 1000
for episode in range(n_episodes):
    state = 0                    # every episode starts at the left end
    while state != goal_state:
        # epsilon-greedy action selection
        if np.random.rand() < epsilon:
            action = np.random.choice(actions)
        else:
            action = np.argmax(q_table[state])
        # deterministic transition; moving left from state 0 stays at 0
        new_state = state + 1 if action == 1 else max(0, state - 1)
        reward = rewards[new_state]
        # Q-learning (off-policy) update: bootstrap on max over next actions.
        q_table[state, action] += alpha * (
            reward + gamma * np.max(q_table[new_state]) - q_table[state, action]
        )
        state = new_state

# Greedy rollout of the learned policy.
state = 0
path = [state]
while state != goal_state:
    action = np.argmax(q_table[state])
    state = state + 1 if action == 1 else max(0, state - 1)
    path.append(state)
print("Policy is", path)

# ---------------------------------------------------------------------
# 2. Q-table and optimal path using Q-learning on a 4x4 grid.
# NOTE(review): the original text is truncated inside get_next_state;
# everything from the bounds check to the end of this practical was
# lost in extraction and is reconstructed below — TODO confirm.
# ---------------------------------------------------------------------
import random

grid_size = 4
goal_size = (3, 3)
goal_state = goal_size
actions = ['up', 'down', 'left', 'right']
action_map = {'up': (-1, 0), 'down': (1, 0), 'left': (0, -1), 'right': (0, 1)}

# Parameters
alpha = 0.1      # learning rate
gamma = 0.9      # discount factor
epsilon = 0.1    # exploration rate
episodes = 1000

# Q-table indexed by (row, col, action).
q_table = np.zeros((grid_size, grid_size, len(actions)))


def get_next_state(state, action):
    """Apply `action` to `state`; a move off the grid leaves the state unchanged."""
    r, c = state
    dr, dc = action_map[action]
    new_r, new_c = r + dr, c + dc
    # Reconstructed bounds check (original cut off after `if 0<=new_r`).
    if 0 <= new_r < grid_size and 0 <= new_c < grid_size:
        return (new_r, new_c)
    return state


# --- Reconstructed training loop (original lost in extraction) ---
for _ in range(episodes):
    state = (0, 0)
    while state != goal_state:
        # epsilon-greedy over the action axis of the Q-table
        if random.random() < epsilon:
            a_idx = random.randrange(len(actions))
        else:
            a_idx = int(np.argmax(q_table[state[0], state[1]]))
        next_state = get_next_state(state, actions[a_idx])
        # Reward presumably +10 at the goal, -1 per step — TODO confirm.
        reward = 10 if next_state == goal_state else -1
        q_table[state[0], state[1], a_idx] += alpha * (
            reward
            + gamma * np.max(q_table[next_state[0], next_state[1]])
            - q_table[state[0], state[1], a_idx]
        )
        state = next_state

# ---------------------------------------------------------------------
# NOTE(review): practicals 3, 4 and 5 were lost entirely in extraction;
# nothing between practical 2 and the SARSA practical below survives.
# ---------------------------------------------------------------------

# ---------------------------------------------------------------------
# 6. SARSA (on-policy TD control) on a small grid with goal (0, 2).
# NOTE(review): the set-up (grid size, parameters, Q initialisation and
# choose_action) was lost in extraction; values consistent with the
# rest of the file are reconstructed here — TODO confirm.
# ---------------------------------------------------------------------
ROWS, COLS = 3, 3                       # reconstructed; goal (0, 2) fits a 3x3 grid
ACTIONS = ["UP", "DOWN", "LEFT", "RIGHT"]
alpha, gamma, epsilon = 0.1, 0.9, 0.1   # reconstructed parameters
episodes = 1000                         # reconstructed episode count

# Q as a dict-of-dicts: Q[(row, col)][action] -> value, as used below.
Q = {(r, c): {a: 0.0 for a in ACTIONS} for r in range(ROWS) for c in range(COLS)}


def choose_action(state):
    """Epsilon-greedy over Q[state] (reconstructed — TODO confirm)."""
    if random.random() < epsilon:
        return random.choice(ACTIONS)
    return max(Q[state], key=Q[state].get)


def step(state, action):
    """One deterministic move; entering (0, 2) yields +10 and ends the episode,
    every other step costs -1.  Moves off the grid leave the position unchanged."""
    r, c = state
    if action == "UP" and r > 0:
        r -= 1
    elif action == "DOWN" and r < ROWS - 1:
        r += 1
    elif action == "LEFT" and c > 0:
        c -= 1
    elif action == "RIGHT" and c < COLS - 1:
        c += 1
    next_state = (r, c)
    if next_state == (0, 2):
        return next_state, 10, True
    return next_state, -1, False


for ep in range(episodes):
    state = (0, 0)
    action = choose_action(state)
    done = False
    while not done:
        next_state, reward, done = step(state, action)
        if not done:
            next_action = choose_action(next_state)
            next_q = Q[next_state][next_action]
        else:
            next_action = None
            next_q = 0
        # SARSA update: bootstrap on the action actually chosen next (on-policy).
        Q[state][action] += alpha * (reward + gamma * next_q - Q[state][action])
        state = next_state
        action = next_action

print("Learned Q-values:\n")
for state in Q:
    print(state, Q[state])

policy = {}
for state in Q:
    policy[state] = max(Q[state], key=Q[state].get)

print("\nPolicy Table(BEst action per state):")
for state in sorted(policy):
    print(state, "->", policy[state])

# ---------------------------------------------------------------------
# 7. Demonstration of the Monte Carlo method on a grid problem.
# Random policy over {down, right} on a 3x3 grid; Q(s, a) is estimated
# as the average number of steps to reach the terminal cell (2, 2).
# ---------------------------------------------------------------------
from collections import defaultdict

num_simulations = 10000
grid_size = (3, 3)
Q = defaultdict(lambda: defaultdict(float))
returns = defaultdict(lambda: defaultdict(list))
actions = {"down": (1, 0), "right": (0, 1)}

for _ in range(num_simulations):
    # Pick a uniformly random non-terminal start cell.
    start_x = random.randint(0, grid_size[0] - 1)
    start_y = random.randint(0, grid_size[1] - 1)
    current_position = (start_x, start_y)
    while current_position == (2, 2):
        start_x = random.randint(0, grid_size[0] - 1)
        start_y = random.randint(0, grid_size[1] - 1)
        current_position = (start_x, start_y)

    episode = []
    path_length = 0
    while current_position != (2, 2):
        # Only in-bounds moves are candidates; down/right always reach (2, 2).
        valid_actions = []
        for action, (dx, dy) in actions.items():
            new_x, new_y = current_position[0] + dx, current_position[1] + dy
            if 0 <= new_x < grid_size[0] and 0 <= new_y < grid_size[1]:
                valid_actions.append(action)
        chosen_action = random.choice(valid_actions)
        dx, dy = actions[chosen_action]
        next_position = (current_position[0] + dx, current_position[1] + dy)
        episode.append((current_position, chosen_action))
        current_position = next_position
        path_length += 1

    # BUG FIX: the original appended G only when it was not already present
    # in the returns list (deduplicating returns biases the sample average)
    # and credited every pair with the *total* path length.  Standard MC:
    # the return of a pair is the number of steps remaining after it.
    # (A pair cannot repeat within an episode: down/right moves are monotone.)
    for i, (s, a) in enumerate(episode):
        g = path_length - i
        returns[s][a].append(g)
        Q[s][a] = sum(returns[s][a]) / len(returns[s][a])

for mc_state, mc_actions in Q.items():
    for mc_action, value in mc_actions.items():
        print(f"state: {mc_state},Action: {mc_action}:{value:.2f}")

# ---------------------------------------------------------------------
# Practical 11 - Demonstrate TD learning, TD(0).
# ---------------------------------------------------------------------
# --- Single episode ---
states = ["A", "B", "C", "Goal"]
V = {s: 0 for s in states}   # initialize state values
V["Goal"] = 1                # terminal state

# Parameters
alpha = 0.5
gamma = 1

# Simulated episode: A -> B -> C -> Goal
episode = [("A", "B"), ("B", "C"), ("C", "Goal")]
rewards = {("C", "Goal"): 1}

# TD(0) update: V(s) <- V(s) + alpha * (r + gamma * V(s') - V(s))
for (s, s_next) in episode:
    reward = rewards.get((s, s_next), 0)
    td_error = reward + gamma * V[s_next] - V[s]
    V[s] += alpha * td_error

print("TD(0) Updated Values:", V)

# --- Multiple episodes ---
states = ["A", "B", "C", "Goal"]
V = {s: 0 for s in states}
V["Goal"] = 1

# Parameters
alpha = 0.1
gamma = 0.8
n_episodes = 5

# Deterministic chain: A -> B -> C -> Goal, reward 1 on entering Goal.
transitions = {"A": "B", "B": "C", "C": "Goal"}
rewards = {("C", "Goal"): 1}

for _ep in range(n_episodes):
    state = "A"
    while state != "Goal":
        next_state = transitions[state]
        # TD(0) update per transition
        reward = rewards.get((state, next_state), 0)
        td_error = reward + gamma * V[next_state] - V[state]
        V[state] += alpha * td_error
        state = next_state
    print(V)

print("TD(0) Updated Values:", n_episodes, ":", V)