1. Implementation of Q-Learning with the Bellman Equation in a linear grid environment
  2.  
  3. import numpy as np
  4. #define the environment(gridworld)
  5. n_states = 5 #Number of states (0,1,2,3,4)
  6. actions = [0,1] # Actions: 0=left , 1=right
  7. rewards = [-1,-1,-1,-1,10]
  8. goal_state =4 #Goal state
  9. alpha = 0.1 #Learning rate
  10. gamma = 0.9 #Discount Factor
  11. epsilon = 0.1 #Exploration probability
  12.  
  13. #Initialize Q-table (n_states x actions)
  14. q_table = np.zeros((n_states, len(actions)))
  15. q_table
  16.  
  17. n_episodes = 1000
  18. for episode in range(n_episodes):
  19. state = 0 #Start state
  20. while state != goal_state:
  21. #choose action (epsilon-greedy)
  22. if np.random.rand() < epsilon:
  23. action = np.random.choice(actions)
  24. else:
  25. action = np.argmax(q_table[state])
  26. #Take action and observe new state and reward
  27. new_state = state + 1 if action == 1 else max(0, state-1)
  28. reward = rewards[new_state]
  29. #Update Q-value 3.06 using the Q-Learning formula
  30. q_table[state, action] += alpha * (reward + gamma*np.max(q_table[new_state]) - q_table[state, action])
  31. state = new_state
  32.  
  33. q_table
  34.  
  35. state = 0
  36. path = [state]
  37. while state != goal_state:
  38. action = np.argmax(q_table[state])
  39. state = state+1 if action==1 else max(0,state-1)
  40. path.append(state)
  41.  
  42. print("Policy is",path)
  43.  
  44. -------------------------------------------------------------------------------
2. Creating a Q-table and finding the optimal path using Q-learning on a 4x4 grid
  46.  
  47. import numpy as np
  48. import random
  49.  
  50. grid_size = 4
  51. goal_size = (3,3)
  52. goal_state = goal_size
  53. actions = ['up','down','left','right']
  54. action_map = {'up':(-1,0),'down':(1,0),'left':(0,-1),'right':(0,1)}
  55.  
  56. #Parameters
  57. alpha = 0.1 #Learning rate
  58. gamma = 0.9 #Discount factor
  59. epsilon = 0.1 #Exploration rate
  60. episodes = 1000
  61. #Initialize Q-table
  62. q_table = np.zeros((grid_size,grid_size,len(actions)))
  63. q_table
  64.  
  65. #Helper function
  66. def get_next_state(state,action):
  67. r,c=state
  68. dr,dc=action_map[action]
  69. new_r,new_c=r+dr,c+dc
  70. if 0<=new_r<grid_size and 0<=new_c<grid_size:
  71. return (new_r,new_c)
  72. return state # if out of bounds,stay in the same state
  73.  
  74. def get_reward(state):
  75. return 10 if state==goal_state else 0
  76.  
  77. #Q-Learning Algo
  78. for episode in range(episodes):
  79. state=(0,0)
  80. while state != goal_state:
  81. if random.uniform(0,1)<epsilon:
  82. action=random.choice(actions) # explore
  83. else:
  84. action=actions[np.argmax(q_table[state[0],state[1]])] # exploit
  85.  
  86. #take action
  87. next_state=get_next_state(state,action)
  88. reward=get_reward(next_state)
  89. #update Q-value
  90. action_index=actions.index(action)
  91. best_next_q=np.max(q_table[next_state[0],next_state[1]])
  92. q_table[state[0],state[1],action_index] += alpha *(
  93. reward+gamma *best_next_q- q_table[state[0],state[1],action_index] # Bellman's Equation
  94. )
  95.  
  96. state=next_state
  97.  
  98. #print Q-table
  99. print("Trained Q-table:")
  100. print(q_table)
  101.  
  102. # Test the agent
  103. state = (0,0)
  104. path = [state]
  105. while state != goal_state:
  106. action = actions[np.argmax(q_table[state[0],state[1]])]
  107. state = get_next_state(state,action)
  108. path.append(state)
  109.  
  110. print("Optimal path:")
  111. print(path)
  112.  
  113. --------------------------------------------------------------------------
3. Solving a game problem (coin, enemy) using an MDP and a model-based Q-Learning approach
  115.  
  116. import numpy as np
  117. states = ['Start','Coin','Enemy']
  118. actions = ['Go_Coin','Go_Enemy']
  119. transition_rewards = {
  120. 'Start':{'Go_Coin':10,'Go_Enemy':-5},
  121. 'Coin':{},
  122. 'Enemy':{}
  123. }
  124. transition_probs = {
  125. 'Start':{'Go_Coin':{'Coin':1.0},'Go_Enemy':{'Enemy':1.0}},
  126. 'Coin':{},
  127. 'Enemy':{}
  128. }
  129.  
  130. states,actions
  131. transition_rewards
  132. transition_probs
  133. gamma = 0.9
  134. theta = 0.001
  135. value_function = {state:0 for state in states}
  136. value_function
  137.  
  138. while True:
  139. delta=0
  140. new_value_function = value_function.copy()
  141. for state in states:
  142. if state in['Coin','Enemy']:
  143. continue
  144. state_values = []
  145. for action in actions:
  146. value=0
  147. for next_state,prob in transition_probs[state][action].items():
  148. reward = transition_rewards[state][action]
  149. print(reward)
  150. print(prob)
  151. print(value_function[next_state])
  152. value += prob*(reward + gamma * value_function[next_state])
  153. print(value)
  154.  
  155. state_values.append(value)
  156. print("s_v",state_values)
  157. new_value_function[state]=max(state_values)
  158. print("n_v",new_value_function)
  159. delta=max(delta,abs(new_value_function[state]-value_function[state]))
  160.  
  161. value_function=new_value_function
  162. print("v_f",value_function)
  163. if delta<theta:
  164. break
  165.  
  166. value_function
  167.  
  168. policy = {}
  169. for state in states:
  170. if state in ['Coin','Enemy']:
  171. policy[state]='T'
  172. continue
  173. action_values={}
  174. for action in actions:
  175. value=0
  176. for next_state,prob in transition_probs[state][action].items():
  177. reward = transition_rewards[state][action]
  178. value += prob * (reward + gamma * value_function[next_state])
  179. action_values[action]= value
  180. policy[state]=max(action_values,key=action_values.get)
  181. print("Optimal value function:", value_function)
  182. print("Optimal policy:",policy)
  183.  
  184. -----------------------------------------------------------------------------------------
  185.  
4. Creating a policy table for the grid problem using an MDP
  187.  
  188. import numpy as np
  189. import random
  190.  
  191. grid_size = 4
  192. goal_size = (3,3)
  193. goal_state = goal_size
  194. actions = ['up','down','left','right']
  195. action_map = {'up':(-1,0),'down':(1,0),'left':(0,-1),'right':(0,1)}
  196.  
  197. #Parameters
  198. theta = 1e-4 #Learning rate
  199. gamma = 0.9 #Discount factor
  200. rewards = np.zeros((grid_size,grid_size))
  201. rewards[goal_state]=10
  202.  
  203. #Helper function
  204. def get_next_state(state,action):
  205. r,c=state
  206. dr,dc=action_map[action]
  207. new_r,new_c=r+dr,c+dc
  208. if 0<=new_r<grid_size and 0<=new_c<grid_size:
  209. return (new_r,new_c)
  210. return state # if out of bounds,stay in the same state
  211.  
  212. value_table = np.zeros((grid_size,grid_size))
  213. value_table
  214.  
  215. while True:
  216. delta=0
  217. new_value_table=np.copy(value_table)
  218. for r in range(grid_size):
  219. for c in range(grid_size):
  220. state=(r,c)
  221. if state==goal_state:
  222. continue # skip goal state
  223.  
  224. # calc the value of the state using the bellmans eqn
  225. state_values=[]
  226. for action in actions:
  227. next_state=get_next_state(state,action)
  228. reward=rewards[next_state]
  229. state_values.append(reward+gamma*value_table[next_state])
  230.  
  231. # Update the value table
  232. new_value_table[state]=max(state_values)
  233. delta=max(delta,abs(new_value_table[state]-value_table[state]))
  234.  
  235. value_table=new_value_table
  236. if delta<theta:
  237. break
  238. print("optimal value table:")
  239. print(value_table)
  240.  
  241. --------------------------------------------------------------------------
5. Using Q-learning and an MDP value table on the grid problem
  243.  
  244. import numpy as np
  245. import random
  246.  
  247. # Environment setup
  248. grid_size = 4
  249. goal_state = (3,3)
  250. actions = ['up', 'down', 'left', 'right']
  251. action_map = {'up': (-1, 0), 'down': (1, 0), 'left': (0, -1), 'right': (0, 1)} # Positional changes due to action
  252. print(action_map)
  253. # Parameters
  254. alpha = 0.1 # learning factor
  255. gamma = 0.9 # discount factor (for future rewards)
  256. epsilon = 0.1 # exploration factor
  257. episodes = 1000 # no of training episodes/iterations
  258. theta = 1e-4 # Convergence threshold for value iteration
  259.  
  260. # Reward and table initialization
  261. rewards = np.zeros((grid_size, grid_size))
  262. rewards[goal_state] = 10
  263. q_table = np.zeros((grid_size, grid_size, len(actions))) # (3D) 16 X 16
  264. value_table = np.zeros((grid_size, grid_size)) # 4 X 4
  265.  
  266. # Helper functions
  267. def get_next_state(state, action):
  268. row, col = state
  269. d_row, d_col = action_map[action]
  270. new_row = row + d_row
  271. new_col = col + d_col
  272. if 0<=new_row<grid_size and 0<=new_col<grid_size:
  273. return (new_row, new_col)
  274. else:
  275. return state # if out of bounds, stay in the same state
  276.  
  277. # Q-learning with Value Iteration Influence
  278. for episode in range(episodes):
  279. state = (0,0) # start state
  280. while state != goal_state:
  281. # choose action (epsilon-greedy)
  282. if random.uniform(0,1) < epsilon:
  283. action = random.choice(actions) # Explore
  284. else:
  285. action = actions[np.argmax(q_table[state[0], state[1]])] # Exploit
  286.  
  287. # Take action
  288. next_state = get_next_state(state, action)
  289. reward = rewards[next_state]
  290.  
  291. # Q-learning update ( Bellman equation influences here)
  292. action_index = actions.index(action)
  293. best_next_q = np.max(q_table[next_state[0], next_state[1]])
  294. q_table[state[0], state[1], action_index] += alpha * (reward + gamma *
  295. best_next_q - q_table[state[0], state[1], action_index]) # Formula for Q-value
  296.  
  297. # Update Value Table using Bellman Equation
  298. state_values = []
  299. for a in actions:
  300. next_s = get_next_state(state, a)
  301. state_values.append(rewards[next_s] + gamma * np.max(q_table[next_s[0], next_s[1]])) # Bellman Equation using Q-value
  302.  
  303. value_table[state] = max(state_values)
  304.  
  305. # Move to next state
  306. state = next_state
  307.  
  308. # Derive Policy from Q-learning
  309. policy = np.zeros((grid_size, grid_size), dtype=str)
  310. for r in range(grid_size):
  311. for c in range(grid_size):
  312. state = (r, c)
  313. if state == goal_state:
  314. policy[state] = 'G' # Goal
  315. continue
  316.  
  317. action_index = np.argmax(q_table[state[0], state[1]])
  318. policy[state] = actions[action_index][0].upper()
  319.  
  320. print('Q-learning Q-Table:')
  321. print(q_table)
  322.  
  323. print('\n Value Table (Updated with Bellman Influence): ')
  324. print(value_table)
  325.  
  326. print('\n Optimal Policy from Q-learning: ')
  327. for row in policy:
  328. print(' '.join(row))
  329.  
  330. ---------------------------------------------------------------------------------------------
6. Demonstrate solving the 3x3 grid problem using the SARSA approach
  332.  
  333. import random
  334. ROWS, COLS = 3,3
  335. ACTIONS = ["UP","DOWN","LEFT","RIGHT"]
  336.  
  337. Q = {}
  338. for r in range(ROWS):
  339. for c in range(COLS):
  340. Q[(r,c)] = {a: 0.0 for a in ACTIONS}
  341. print(Q)
  342.  
  343. alpha = 0.1 # learning rate
  344. gamma = 0.9 # discount facor
  345. epsilon = 0.2 # exploration rate
  346. episodes = 100
  347.  
  348. def choose_action(state):
  349. if random.random() < epsilon:
  350. return random.choice(ACTIONS)
  351. return max(Q[state], key=Q[state].get)
  352.  
  353. def step(state,action):
  354. r, c =state
  355.  
  356. if action == "UP" and r> 0:
  357. r -= 1
  358. elif action == "DOWN" and r< ROWS - 1:
  359. r += 1
  360. elif action == "LEFT" and c > 0:
  361. c -= 1
  362. elif action == "RIGHT" and c < COLS - 1:
  363. c += 1
  364.  
  365. next_state = (r,c)
  366.  
  367. if next_state == (0,2):
  368. return next_state,10, True
  369. else:
  370. return next_state, -1, False
  371.  
  372. for ep in range(episodes):
  373. state = (0,0)
  374. action = choose_action(state)
  375. done = False
  376.  
  377. while not done:
  378. next_state, reward , done = step(state, action)
  379.  
  380. if not done:
  381. next_action = choose_action(next_state)
  382. next_q = Q[next_state][next_action]
  383. else:
  384. next_action = None
  385. next_q = 0
  386. Q[state][action] += alpha * (reward + gamma * next_q - Q[state][action])
  387. state = next_state
  388. action = next_action
  389.  
  390. print("Learned Q-values:\n")
  391. for state in Q:
  392. print(state, Q[state])
  393.  
  394. policy = {}
  395. for state in Q:
  396. policy[state] = max(Q[state],key=Q[state].get)
  397. print("\nPolicy Table(BEst action per state):")
  398. for state in sorted(policy):
  399. print(state,"->", policy[state])
  400.  
  401. ---------------------------------------------------------------------------
7. Demonstration of the Monte Carlo method on the grid problem
  403.  
  404. import random
  405. from collections import defaultdict
  406.  
  407. num_simulations = 10000
  408. grid_size = (3,3)
  409. Q = defaultdict(lambda:defaultdict(float))
  410.  
  411. returns = defaultdict(lambda:defaultdict(list))
  412. actions = {"down":(1,0),"right":(0,1)}
  413. actions
  414.  
  415. for _ in range(num_simulations):
  416. start_x = random.randint(0, grid_size[0] - 1)
  417. start_y = random.randint(0, grid_size[1] - 1)
  418. current_position = (start_x, start_y)
  419. while current_position == (2, 2):
  420. start_x = random.randint(0, grid_size[0] - 1)
  421. start_y = random.randint(0, grid_size[1] - 1)
  422. current_position = (start_x, start_y)
  423. episode = []
  424. path_length = 0
  425. while current_position != (2, 2):
  426. valid_actions = []
  427. for action, (dx, dy) in actions.items():
  428. new_x,new_y = current_position[0] + dx,current_position[1] + dy
  429. if 0 <= new_x < grid_size[0] and 0 <= new_y < grid_size[1]:
  430. valid_actions.append(action)
  431. chosen_action = random.choice(valid_actions)
  432. dx, dy = actions[chosen_action]
  433.  
  434. next_position = current_position[0] + dx,current_position[1] + dy
  435. episode.append((current_position, chosen_action))
  436. current_position = next_position
  437. path_length += 1
  438. G = path_length
  439. for state, action in episode:
  440. if G not in returns[state][action]:
  441. returns[state][action].append(G)
  442.  
  443. Q[state][action] = sum(returns[state][action]) / len(returns[state][action])
  444.  
  445. for state,actions in Q.items():
  446. for action,value in actions.items():
  447. print(f"state: {state},Action: {action}:{value:.2f}")
  448.  
  449. -----------------------------------------------------------------------------------
Practical 11 – Demonstrate TD Learning [TD(0)]
  451.  
  452. # Single Epi
  453.  
  454. import numpy as np
  455.  
  456. # Define states and values
  457. states = ["A", "B", "C", "Goal"]
  458. V = {s: 0 for s in states} # Initialize Values
  459. V["Goal"] = 1 # Terminal state
  460.  
  461. # Parameters
  462. alpha = 0.5
  463. gamma = 1
  464.  
  465. # Simulated Episode: A -> B -> C -> Goal
  466. episode = [("A", "B"), ("B", "C"), ("C", "Goal")]
  467. rewards = {("C", "Goal"): 1}
  468.  
  469. # Run TD(0) Updates
  470. for (s, s_next) in episode:
  471. reward = rewards.get((s, s_next), 0)
  472. td_error = reward + gamma * V[s_next] - V[s]
  473. V[s] += alpha * td_error
  474.  
  475. # Print final state values
  476. print("TD(0) Updated Values:", V)
  477.  
  478. # Multi Epi
  479. import numpy as np
  480.  
  481. # Define states and values
  482. states = ["A", "B", "C", "Goal"]
  483. V = {s: 0 for s in states}
  484. V["Goal"] = 1
  485.  
  486. # Parameters
  487. alpha = 0.1
  488. gamma = 0.8
  489. n_episodes = 5
  490.  
  491. # Simulated episode: A -> B -> C -> Goal
  492. transitions = {"A": "B", "B": "C", "C": "Goal"}
  493. rewards = {("C", "Goal"): 1}
  494.  
  495. for episodes in range(n_episodes):
  496. state = "A"
  497.  
  498. while state != "Goal":
  499. next_state = transitions[state]
  500.  
  501. # Run TD(0) Updates
  502. reward = rewards.get((state, next_state), 0)
  503. td_error = reward + gamma * V[next_state] - V[state]
  504. V[state] += alpha * td_error
  505.  
  506. state = next_state
  507.  
  508. print(V)
  509.  
  510. # Print final state values
  511. print("TD(0) Updated Values:", n_episodes, ":", V)