Not a member of gistpad yet?
Sign Up,
it unlocks many cool features!
- 1. Implementation of Q-learning with the Bellman equation in a linear grid environment
import numpy as np

# --- Environment: a 1-D corridor of 5 states with the goal at the right end ---
n_states = 5                      # states 0..4
actions = [0, 1]                  # 0 = step left, 1 = step right
rewards = [-1, -1, -1, -1, 10]    # -1 per step, +10 for reaching the goal
goal_state = 4

# --- Hyper-parameters ---
alpha = 0.1      # learning rate
gamma = 0.9      # discount factor
epsilon = 0.1    # exploration probability

# Q-table: one row per state, one column per action.
q_table = np.zeros((n_states, len(actions)))

n_episodes = 1000
for episode in range(n_episodes):
    state = 0  # every episode starts at the leftmost cell
    while state != goal_state:
        # Epsilon-greedy action selection.
        if np.random.rand() < epsilon:
            action = np.random.choice(actions)   # explore
        else:
            action = np.argmax(q_table[state])   # exploit

        # Deterministic transition: right adds 1, left subtracts 1 (floored at 0).
        next_state = state + 1 if action == 1 else max(0, state - 1)
        reward = rewards[next_state]

        # Q-learning (Bellman) update toward the one-step TD target.
        td_target = reward + gamma * np.max(q_table[next_state])
        q_table[state, action] += alpha * (td_target - q_table[state, action])

        state = next_state

# Greedy rollout from the start state to read off the learned policy.
state = 0
path = [state]
while state != goal_state:
    action = np.argmax(q_table[state])
    state = state + 1 if action == 1 else max(0, state - 1)
    path.append(state)
print("Policy is", path)
- -------------------------------------------------------------------------------
- 2. Creating a Q-table and finding the optimal path using Q-learning on a 4x4 grid
import numpy as np
import random

# --- 4x4 grid world with the goal in the bottom-right corner ---
grid_size = 4
goal_size = (3, 3)        # kept for compatibility; same cell as goal_state
goal_state = goal_size
actions = ['up', 'down', 'left', 'right']
action_map = {'up': (-1, 0), 'down': (1, 0), 'left': (0, -1), 'right': (0, 1)}

# --- Hyper-parameters ---
alpha = 0.1      # learning rate
gamma = 0.9      # discount factor
epsilon = 0.1    # exploration rate
episodes = 1000

# Q-table indexed by (row, col, action).
q_table = np.zeros((grid_size, grid_size, len(actions)))


def get_next_state(state, action):
    """Apply `action` to `state`; stay in place when the move leaves the grid."""
    row, col = state
    d_row, d_col = action_map[action]
    candidate = (row + d_row, col + d_col)
    if 0 <= candidate[0] < grid_size and 0 <= candidate[1] < grid_size:
        return candidate
    return state


def get_reward(state):
    """Return +10 for the goal cell and 0 everywhere else."""
    return 10 if state == goal_state else 0


# --- Q-learning training loop ---
for episode in range(episodes):
    state = (0, 0)
    while state != goal_state:
        # Epsilon-greedy action selection.
        if random.uniform(0, 1) < epsilon:
            action = random.choice(actions)                           # explore
        else:
            action = actions[np.argmax(q_table[state[0], state[1]])]  # exploit

        next_state = get_next_state(state, action)
        reward = get_reward(next_state)

        # Bellman update for the chosen (state, action) entry.
        action_index = actions.index(action)
        best_next_q = np.max(q_table[next_state[0], next_state[1]])
        current_q = q_table[state[0], state[1], action_index]
        q_table[state[0], state[1], action_index] = current_q + alpha * (
            reward + gamma * best_next_q - current_q
        )
        state = next_state

print("Trained Q-table:")
print(q_table)

# --- Greedy rollout from the top-left corner ---
state = (0, 0)
path = [state]
while state != goal_state:
    action = actions[np.argmax(q_table[state[0], state[1]])]
    state = get_next_state(state, action)
    path.append(state)
print("Optimal path:")
print(path)
- --------------------------------------------------------------------------
- 3. Solving a game problem (coin, enemy) with an MDP using value iteration (model-based approach)
import numpy as np

# --- MDP definition: tiny game with a coin (good) and an enemy (bad) ---
states = ['Start', 'Coin', 'Enemy']
actions = ['Go_Coin', 'Go_Enemy']

# Immediate reward for taking an action in a state ('Coin'/'Enemy' are terminal).
transition_rewards = {
    'Start': {'Go_Coin': 10, 'Go_Enemy': -5},
    'Coin': {},
    'Enemy': {}
}
# Deterministic transition probabilities P(next_state | state, action).
transition_probs = {
    'Start': {'Go_Coin': {'Coin': 1.0}, 'Go_Enemy': {'Enemy': 1.0}},
    'Coin': {},
    'Enemy': {}
}

gamma = 0.9    # discount factor
theta = 0.001  # convergence threshold for value iteration

value_function = {state: 0 for state in states}

# --- Value iteration: V(s) = max_a sum_s' P(s'|s,a) * (R(s,a) + gamma * V(s')) ---
# Fix vs. original: the per-iteration debug prints (reward, prob, partial sums,
# "s_v"/"n_v"/"v_f") were removed; they cluttered the output without adding value.
while True:
    delta = 0
    new_value_function = value_function.copy()
    for state in states:
        if state in ['Coin', 'Enemy']:
            continue  # terminal states keep their value (0)
        state_values = []
        for action in actions:
            value = 0
            for next_state, prob in transition_probs[state][action].items():
                reward = transition_rewards[state][action]
                value += prob * (reward + gamma * value_function[next_state])
            state_values.append(value)
        new_value_function[state] = max(state_values)
        delta = max(delta, abs(new_value_function[state] - value_function[state]))
    value_function = new_value_function
    if delta < theta:
        break

# --- Greedy policy extraction from the converged value function ---
policy = {}
for state in states:
    if state in ['Coin', 'Enemy']:
        policy[state] = 'T'  # terminal marker
        continue
    action_values = {}
    for action in actions:
        value = 0
        for next_state, prob in transition_probs[state][action].items():
            reward = transition_rewards[state][action]
            value += prob * (reward + gamma * value_function[next_state])
        action_values[action] = value
    policy[state] = max(action_values, key=action_values.get)

print("Optimal value function:", value_function)
print("Optimal policy:", policy)
- -----------------------------------------------------------------------------------------
- 4. Computing the optimal value table for a grid problem using MDP value iteration
import numpy as np
import random  # present in the original script; not used below

# --- 4x4 grid world with the goal in the bottom-right corner ---
grid_size = 4
goal_size = (3, 3)        # kept for compatibility; same cell as goal_state
goal_state = goal_size
actions = ['up', 'down', 'left', 'right']
action_map = {'up': (-1, 0), 'down': (1, 0), 'left': (0, -1), 'right': (0, 1)}

# --- Parameters ---
theta = 1e-4  # convergence threshold for value iteration
gamma = 0.9   # discount factor

# Reward grid: +10 on the goal cell, 0 elsewhere.
rewards = np.zeros((grid_size, grid_size))
rewards[goal_state] = 10


def get_next_state(state, action):
    """Apply `action` to `state`; stay in place when the move leaves the grid."""
    r, c = state
    dr, dc = action_map[action]
    nr, nc = r + dr, c + dc
    in_bounds = 0 <= nr < grid_size and 0 <= nc < grid_size
    return (nr, nc) if in_bounds else state


value_table = np.zeros((grid_size, grid_size))

# --- Value iteration: sweep every state until the largest change < theta ---
while True:
    delta = 0
    new_value_table = np.copy(value_table)
    for r in range(grid_size):
        for c in range(grid_size):
            state = (r, c)
            if state == goal_state:
                continue  # goal is terminal; its value stays 0
            # Bellman backup: best one-step lookahead over all actions.
            state_values = []
            for action in actions:
                nxt = get_next_state(state, action)
                state_values.append(rewards[nxt] + gamma * value_table[nxt])
            new_value_table[state] = max(state_values)
            delta = max(delta, abs(new_value_table[state] - value_table[state]))
    value_table = new_value_table
    if delta < theta:
        break

print("optimal value table:")
print(value_table)
- --------------------------------------------------------------------------
- 5. Using Q-learning together with MDP value updates on a grid problem
import numpy as np
import random

# --- Environment setup: 4x4 grid, goal in the bottom-right corner ---
grid_size = 4
goal_state = (3, 3)
actions = ['up', 'down', 'left', 'right']
action_map = {'up': (-1, 0), 'down': (1, 0), 'left': (0, -1), 'right': (0, 1)}  # row/col deltas per action
print(action_map)

# --- Hyper-parameters ---
alpha = 0.1      # learning rate
gamma = 0.9      # discount factor for future rewards
epsilon = 0.1    # exploration probability
episodes = 1000  # number of training episodes
theta = 1e-4     # convergence threshold (carried over from the value-iteration variant)

# --- Reward grid and tables ---
rewards = np.zeros((grid_size, grid_size))
rewards[goal_state] = 10
q_table = np.zeros((grid_size, grid_size, len(actions)))  # Q(s, a): 4 x 4 x 4
value_table = np.zeros((grid_size, grid_size))            # V(s): 4 x 4


def get_next_state(state, action):
    """Apply `action` to `state`; stay in place when the move leaves the grid."""
    row, col = state
    d_row, d_col = action_map[action]
    nxt = (row + d_row, col + d_col)
    if 0 <= nxt[0] < grid_size and 0 <= nxt[1] < grid_size:
        return nxt
    return state


# --- Q-learning, refreshing V(s) with a Bellman backup after every step ---
for episode in range(episodes):
    state = (0, 0)  # start state
    while state != goal_state:
        # Epsilon-greedy action selection.
        if random.uniform(0, 1) < epsilon:
            action = random.choice(actions)                           # explore
        else:
            action = actions[np.argmax(q_table[state[0], state[1]])]  # exploit

        next_state = get_next_state(state, action)
        reward = rewards[next_state]

        # Q-learning update (Bellman equation on Q-values).
        action_index = actions.index(action)
        best_next_q = np.max(q_table[next_state[0], next_state[1]])
        q_table[state[0], state[1], action_index] += alpha * (
            reward + gamma * best_next_q
            - q_table[state[0], state[1], action_index]
        )

        # Refresh V(state) from the current Q-table via one-step lookahead.
        state_values = []
        for a in actions:
            next_s = get_next_state(state, a)
            state_values.append(
                rewards[next_s] + gamma * np.max(q_table[next_s[0], next_s[1]])
            )
        value_table[state] = max(state_values)

        state = next_state

# --- Derive a greedy policy (one letter per cell) from the Q-table ---
policy = np.zeros((grid_size, grid_size), dtype=str)
for r in range(grid_size):
    for c in range(grid_size):
        state = (r, c)
        if state == goal_state:
            policy[state] = 'G'  # goal marker
            continue
        action_index = np.argmax(q_table[state[0], state[1]])
        policy[state] = actions[action_index][0].upper()

print('Q-learning Q-Table:')
print(q_table)
print('\n Value Table (Updated with Bellman Influence): ')
print(value_table)
print('\n Optimal Policy from Q-learning: ')
for row in policy:
    print(' '.join(row))
- ---------------------------------------------------------------------------------------------
- 6. Demonstrate solving a 3x3 grid problem using the SARSA approach
import random

# --- 3x3 grid; the episode ends at cell (0, 2), which pays +10 ---
ROWS, COLS = 3, 3
ACTIONS = ["UP", "DOWN", "LEFT", "RIGHT"]

# Q-table as a nested dict: Q[state][action] -> value.
Q = {}
for r in range(ROWS):
    for c in range(COLS):
        Q[(r, c)] = {a: 0.0 for a in ACTIONS}
print(Q)

alpha = 0.1    # learning rate
gamma = 0.9    # discount factor
epsilon = 0.2  # exploration rate
episodes = 100


def choose_action(state):
    """Epsilon-greedy: random action with prob epsilon, else the best-known one."""
    if random.random() < epsilon:
        return random.choice(ACTIONS)
    return max(Q[state], key=Q[state].get)


def step(state, action):
    """Move within the grid and return (next_state, reward, done)."""
    r, c = state
    if action == "UP" and r > 0:
        r -= 1
    elif action == "DOWN" and r < ROWS - 1:
        r += 1
    elif action == "LEFT" and c > 0:
        c -= 1
    elif action == "RIGHT" and c < COLS - 1:
        c += 1
    next_state = (r, c)
    if next_state == (0, 2):
        return next_state, 10, True  # reached the terminal cell
    return next_state, -1, False     # -1 per step encourages short paths


# --- SARSA: on-policy update with the action actually taken next ---
for ep in range(episodes):
    state = (0, 0)
    action = choose_action(state)
    done = False
    while not done:
        next_state, reward, done = step(state, action)
        if not done:
            next_action = choose_action(next_state)
            next_q = Q[next_state][next_action]
        else:
            next_action = None
            next_q = 0  # no future value beyond the terminal state
        Q[state][action] += alpha * (reward + gamma * next_q - Q[state][action])
        state, action = next_state, next_action

print("Learned Q-values:\n")
for state in Q:
    print(state, Q[state])

# Greedy policy: the best-known action in each state.
policy = {}
for state in Q:
    policy[state] = max(Q[state], key=Q[state].get)
print("\nPolicy Table(BEst action per state):")
for state in sorted(policy):
    print(state, "->", policy[state])
- ---------------------------------------------------------------------------
- 7. Demonstration of the Monte Carlo method on a grid problem
import random
from collections import defaultdict

# --- Monte Carlo evaluation on a 3x3 grid with goal (2, 2) ---
# Only "down" and "right" moves exist, so every random episode terminates.
num_simulations = 10000
grid_size = (3, 3)

# Q[state][action]: average sampled return for that state-action pair.
Q = defaultdict(lambda: defaultdict(float))
# returns[state][action]: list of sampled returns for that pair.
returns = defaultdict(lambda: defaultdict(list))
actions = {"down": (1, 0), "right": (0, 1)}

for _ in range(num_simulations):
    # Draw a random non-goal start cell.
    start_x = random.randint(0, grid_size[0] - 1)
    start_y = random.randint(0, grid_size[1] - 1)
    current_position = (start_x, start_y)
    while current_position == (2, 2):
        start_x = random.randint(0, grid_size[0] - 1)
        start_y = random.randint(0, grid_size[1] - 1)
        current_position = (start_x, start_y)

    # Roll out a uniformly random episode until the goal is reached.
    episode = []
    path_length = 0
    while current_position != (2, 2):
        valid_actions = []
        for action, (dx, dy) in actions.items():
            new_x, new_y = current_position[0] + dx, current_position[1] + dy
            if 0 <= new_x < grid_size[0] and 0 <= new_y < grid_size[1]:
                valid_actions.append(action)
        chosen_action = random.choice(valid_actions)
        dx, dy = actions[chosen_action]
        next_position = current_position[0] + dx, current_position[1] + dy
        episode.append((current_position, chosen_action))
        current_position = next_position
        path_length += 1

    # First-visit Monte Carlo update.
    # Fix vs. original: the old code tested `if G not in returns[state][action]`,
    # which deduplicates equal return VALUES (biasing the averages) instead of
    # recording the return once per (state, action) pair visited in the episode.
    # NOTE(review): G is the total episode length for every pair, as in the
    # original; textbook MC would use the reward-to-go from each visit instead.
    G = path_length
    seen = set()
    for state, action in episode:
        if (state, action) in seen:
            continue  # only the first visit in this episode counts
        seen.add((state, action))
        returns[state][action].append(G)
        Q[state][action] = sum(returns[state][action]) / len(returns[state][action])

# Renamed the loop variable so it no longer shadows the global `actions` dict.
for state, action_values in Q.items():
    for action, value in action_values.items():
        print(f"state: {state},Action: {action}:{value:.2f}")
- -----------------------------------------------------------------------------------
- Practical 11 – Demonstrate TD learning [TD(0)]
# --- TD(0) on a single hand-written episode ---
import numpy as np

# States and their value estimates.
states = ["A", "B", "C", "Goal"]
V = {s: 0 for s in states}
V["Goal"] = 1  # terminal state's value is fixed at 1

# Parameters
alpha = 0.5  # learning rate
gamma = 1    # no discounting

# One simulated episode: A -> B -> C -> Goal; reward only on the final step.
episode = [("A", "B"), ("B", "C"), ("C", "Goal")]
rewards = {("C", "Goal"): 1}

# TD(0): V(s) <- V(s) + alpha * (r + gamma * V(s') - V(s))
for s, s_next in episode:
    r = rewards.get((s, s_next), 0)
    V[s] = V[s] + alpha * (r + gamma * V[s_next] - V[s])

print("TD(0) Updated Values:", V)
# --- TD(0) over several episodes on the same deterministic chain ---
import numpy as np

states = ["A", "B", "C", "Goal"]
V = {s: 0 for s in states}
V["Goal"] = 1  # fixed terminal value

# Parameters
alpha = 0.1
gamma = 0.8
n_episodes = 5

# Deterministic chain A -> B -> C -> Goal; reward 1 on the final transition.
transitions = {"A": "B", "B": "C", "C": "Goal"}
rewards = {("C", "Goal"): 1}

for episodes in range(n_episodes):  # loop-variable name kept from the original
    state = "A"
    while state != "Goal":
        next_state = transitions[state]
        # TD(0) update using the just-observed transition.
        reward = rewards.get((state, next_state), 0)
        V[state] += alpha * (reward + gamma * V[next_state] - V[state])
        state = next_state
    print(V)

print("TD(0) Updated Values:", n_episodes, ":", V)
RAW Gist Data
Copied
