New paste
Recent
API
Trending
Blog
Guest
Sign Up or Login
Login
Sign Up
New Paste
Syntax Highlighting
"""Reinforcement-learning practicals on small toy environments.

Eight self-contained demos: Q-learning (1-D corridor and 4x4 grid), MDP
value iteration (coin/enemy game and 4x4 grid), a combined Q-learning +
value-table variant, SARSA on a 3x3 grid, Monte Carlo path-length
estimation, and TD(0) prediction (single and multi episode).

Fixes vs. the original paste:
* restored valid formatting (the paste had all newlines stripped);
* Monte Carlo: returns are now appended on EVERY visit — the original
  `if G not in returns[state][action]` de-duplicated returns and skewed
  the average;
* removed name shadowing (`actions`, `episodes`) and per-iteration
  debug prints inside the value-iteration loop;
* each practical is a function, so state no longer leaks between demos.
"""

import random
from collections import defaultdict

import numpy as np


def practical_1_linear_q_learning(n_episodes=1000, alpha=0.1, gamma=0.9,
                                  epsilon=0.1):
    """Q-learning on a 5-state corridor; returns (q_table, greedy path).

    States 0..4, actions 0=left / 1=right, reward -1 per step and +10 at
    the goal state 4.
    """
    n_states = 5
    actions = [0, 1]                       # 0 = left, 1 = right
    rewards = [-1, -1, -1, -1, 10]         # reward received on ENTERING a state
    goal_state = 4
    q_table = np.zeros((n_states, len(actions)))

    for _ in range(n_episodes):
        state = 0
        while state != goal_state:
            # Epsilon-greedy action selection.
            if np.random.rand() < epsilon:
                action = int(np.random.choice(actions))
            else:
                action = int(np.argmax(q_table[state]))
            new_state = state + 1 if action == 1 else max(0, state - 1)
            reward = rewards[new_state]
            # Q-learning update (Bellman equation).
            q_table[state, action] += alpha * (
                reward + gamma * np.max(q_table[new_state]) - q_table[state, action]
            )
            state = new_state

    # Greedy rollout of the learned policy.
    state, path = 0, [0]
    while state != goal_state:
        action = int(np.argmax(q_table[state]))
        state = state + 1 if action == 1 else max(0, state - 1)
        path.append(state)
    print("Policy is", path)
    return q_table, path


# Shared 4x4-grid helpers (used by practicals 2, 4 and 5).
GRID_SIZE = 4
GOAL_STATE = (3, 3)
GRID_ACTIONS = ['up', 'down', 'left', 'right']
ACTION_MAP = {'up': (-1, 0), 'down': (1, 0), 'left': (0, -1), 'right': (0, 1)}


def _grid_step(state, action):
    """Apply `action` to `state`; out-of-bounds moves leave state unchanged."""
    dr, dc = ACTION_MAP[action]
    new_r, new_c = state[0] + dr, state[1] + dc
    if 0 <= new_r < GRID_SIZE and 0 <= new_c < GRID_SIZE:
        return (new_r, new_c)
    return state


def practical_2_grid_q_learning(episodes=1000, alpha=0.1, gamma=0.9,
                                epsilon=0.1):
    """Q-learning on a 4x4 grid (+10 at (3,3)); returns (q_table, path)."""
    q_table = np.zeros((GRID_SIZE, GRID_SIZE, len(GRID_ACTIONS)))

    for _ in range(episodes):
        state = (0, 0)
        while state != GOAL_STATE:
            if random.uniform(0, 1) < epsilon:
                action = random.choice(GRID_ACTIONS)                       # explore
            else:
                action = GRID_ACTIONS[int(np.argmax(q_table[state[0], state[1]]))]  # exploit
            next_state = _grid_step(state, action)
            reward = 10 if next_state == GOAL_STATE else 0
            action_index = GRID_ACTIONS.index(action)
            best_next_q = np.max(q_table[next_state[0], next_state[1]])
            # Bellman equation update.
            q_table[state[0], state[1], action_index] += alpha * (
                reward + gamma * best_next_q
                - q_table[state[0], state[1], action_index]
            )
            state = next_state

    print("Trained Q-table:")
    print(q_table)

    # Greedy rollout from the start state.
    state, path = (0, 0), [(0, 0)]
    while state != GOAL_STATE:
        action = GRID_ACTIONS[int(np.argmax(q_table[state[0], state[1]]))]
        state = _grid_step(state, action)
        path.append(state)
    print("Optimal path:")
    print(path)
    return q_table, path


def practical_3_mdp_value_iteration(gamma=0.9, theta=0.001):
    """Model-based value iteration for the Start/Coin/Enemy game.

    Returns (value_function, policy); terminal states are marked 'T'.
    """
    states = ['Start', 'Coin', 'Enemy']
    actions = ['Go_Coin', 'Go_Enemy']
    transition_rewards = {
        'Start': {'Go_Coin': 10, 'Go_Enemy': -5},
        'Coin': {},
        'Enemy': {},
    }
    transition_probs = {
        'Start': {'Go_Coin': {'Coin': 1.0}, 'Go_Enemy': {'Enemy': 1.0}},
        'Coin': {},
        'Enemy': {},
    }

    value_function = {state: 0 for state in states}
    while True:
        delta = 0
        new_value_function = value_function.copy()
        for state in states:
            if state in ('Coin', 'Enemy'):
                continue                     # terminal states keep value 0
            state_values = []
            for action in actions:
                value = sum(
                    prob * (transition_rewards[state][action]
                            + gamma * value_function[next_state])
                    for next_state, prob in transition_probs[state][action].items()
                )
                state_values.append(value)
            new_value_function[state] = max(state_values)
            delta = max(delta,
                        abs(new_value_function[state] - value_function[state]))
        value_function = new_value_function
        if delta < theta:
            break

    # Greedy policy extraction from the converged values.
    policy = {}
    for state in states:
        if state in ('Coin', 'Enemy'):
            policy[state] = 'T'
            continue
        action_values = {}
        for action in actions:
            action_values[action] = sum(
                prob * (transition_rewards[state][action]
                        + gamma * value_function[next_state])
                for next_state, prob in transition_probs[state][action].items()
            )
        policy[state] = max(action_values, key=action_values.get)

    print("Optimal value function:", value_function)
    print("Optimal policy:", policy)
    return value_function, policy


def practical_4_grid_value_iteration(gamma=0.9, theta=1e-4):
    """Value iteration on the 4x4 grid; returns the converged value table."""
    rewards = np.zeros((GRID_SIZE, GRID_SIZE))
    rewards[GOAL_STATE] = 10
    value_table = np.zeros((GRID_SIZE, GRID_SIZE))

    while True:
        delta = 0
        new_value_table = np.copy(value_table)
        for r in range(GRID_SIZE):
            for c in range(GRID_SIZE):
                state = (r, c)
                if state == GOAL_STATE:
                    continue                 # skip terminal goal state
                # Bellman optimality backup over the four moves.
                state_values = [
                    rewards[_grid_step(state, action)]
                    + gamma * value_table[_grid_step(state, action)]
                    for action in GRID_ACTIONS
                ]
                new_value_table[state] = max(state_values)
                delta = max(delta,
                            abs(new_value_table[state] - value_table[state]))
        value_table = new_value_table
        if delta < theta:
            break

    print("optimal value table:")
    print(value_table)
    return value_table


def practical_5_q_learning_with_values(episodes=1000, alpha=0.1, gamma=0.9,
                                       epsilon=0.1):
    """Q-learning plus a value table refreshed from Q during training.

    Returns (q_table, value_table, policy) where policy is a grid of
    single-letter action initials with 'G' at the goal.
    """
    rewards = np.zeros((GRID_SIZE, GRID_SIZE))
    rewards[GOAL_STATE] = 10
    q_table = np.zeros((GRID_SIZE, GRID_SIZE, len(GRID_ACTIONS)))
    value_table = np.zeros((GRID_SIZE, GRID_SIZE))

    for _ in range(episodes):
        state = (0, 0)
        while state != GOAL_STATE:
            if random.uniform(0, 1) < epsilon:
                action = random.choice(GRID_ACTIONS)                       # explore
            else:
                action = GRID_ACTIONS[int(np.argmax(q_table[state[0], state[1]]))]  # exploit
            next_state = _grid_step(state, action)
            reward = rewards[next_state]
            # Q-learning update.
            action_index = GRID_ACTIONS.index(action)
            best_next_q = np.max(q_table[next_state[0], next_state[1]])
            q_table[state[0], state[1], action_index] += alpha * (
                reward + gamma * best_next_q
                - q_table[state[0], state[1], action_index]
            )
            # Bellman backup of the state value from current Q estimates.
            value_table[state] = max(
                rewards[_grid_step(state, a)]
                + gamma * np.max(q_table[_grid_step(state, a)[0],
                                         _grid_step(state, a)[1]])
                for a in GRID_ACTIONS
            )
            state = next_state

    # Derive a greedy policy grid from Q.
    policy = np.zeros((GRID_SIZE, GRID_SIZE), dtype=str)
    for r in range(GRID_SIZE):
        for c in range(GRID_SIZE):
            state = (r, c)
            if state == GOAL_STATE:
                policy[state] = 'G'          # goal marker
                continue
            action_index = int(np.argmax(q_table[state[0], state[1]]))
            policy[state] = GRID_ACTIONS[action_index][0].upper()

    print('Q-learning Q-Table:')
    print(q_table)
    print('\n Value Table (Updated with Bellman Influence): ')
    print(value_table)
    print('\n Optimal Policy from Q-learning: ')
    for row in policy:
        print(' '.join(row))
    return q_table, value_table, policy


def practical_6_sarsa(episodes=100, alpha=0.1, gamma=0.9, epsilon=0.2):
    """SARSA on a 3x3 grid with goal (0,2); returns (Q, policy)."""
    rows, cols = 3, 3
    sarsa_actions = ["UP", "DOWN", "LEFT", "RIGHT"]
    Q = {(r, c): {a: 0.0 for a in sarsa_actions}
         for r in range(rows) for c in range(cols)}

    def choose_action(state):
        # Epsilon-greedy over the on-policy Q values.
        if random.random() < epsilon:
            return random.choice(sarsa_actions)
        return max(Q[state], key=Q[state].get)

    def step(state, action):
        # Returns (next_state, reward, done); walls clamp movement.
        r, c = state
        if action == "UP" and r > 0:
            r -= 1
        elif action == "DOWN" and r < rows - 1:
            r += 1
        elif action == "LEFT" and c > 0:
            c -= 1
        elif action == "RIGHT" and c < cols - 1:
            c += 1
        next_state = (r, c)
        if next_state == (0, 2):
            return next_state, 10, True
        return next_state, -1, False

    for _ in range(episodes):
        state = (0, 0)
        action = choose_action(state)
        done = False
        while not done:
            next_state, reward, done = step(state, action)
            if not done:
                next_action = choose_action(next_state)
                next_q = Q[next_state][next_action]
            else:
                next_action = None
                next_q = 0                   # terminal state has zero value
            # SARSA update uses the ACTUALLY chosen next action.
            Q[state][action] += alpha * (reward + gamma * next_q - Q[state][action])
            state, action = next_state, next_action

    print("Learned Q-values:\n")
    for state in Q:
        print(state, Q[state])
    policy = {state: max(Q[state], key=Q[state].get) for state in Q}
    print("\nPolicy Table(BEst action per state):")
    for state in sorted(policy):
        print(state, "->", policy[state])
    return Q, policy


def practical_7_monte_carlo(num_simulations=10000):
    """Every-visit Monte Carlo estimate of path length to (2,2).

    Only 'down'/'right' moves are allowed, so every episode terminates.
    Returns Q[state][action] = average total path length over episodes
    that visited (state, action).
    """
    grid_size = (3, 3)
    goal = (2, 2)
    mc_actions = {"down": (1, 0), "right": (0, 1)}
    Q = defaultdict(lambda: defaultdict(float))
    returns = defaultdict(lambda: defaultdict(list))

    for _ in range(num_simulations):
        # Uniform random non-goal start cell.
        current_position = goal
        while current_position == goal:
            current_position = (random.randint(0, grid_size[0] - 1),
                                random.randint(0, grid_size[1] - 1))

        episode = []
        path_length = 0
        while current_position != goal:
            valid_actions = [
                a for a, (dx, dy) in mc_actions.items()
                if 0 <= current_position[0] + dx < grid_size[0]
                and 0 <= current_position[1] + dy < grid_size[1]
            ]
            chosen_action = random.choice(valid_actions)
            dx, dy = mc_actions[chosen_action]
            episode.append((current_position, chosen_action))
            current_position = (current_position[0] + dx,
                                current_position[1] + dy)
            path_length += 1

        # Every-visit MC: record the return for EVERY (state, action) seen.
        # (The original guarded with `if G not in returns[...]`, which
        # de-duplicated equal returns and biased the averages.)
        G = path_length
        for state, action in episode:
            returns[state][action].append(G)
            Q[state][action] = (sum(returns[state][action])
                                / len(returns[state][action]))

    for state, action_values in Q.items():
        for action, value in action_values.items():
            print(f"state: {state},Action: {action}:{value:.2f}")
    return Q


def practical_8_td0_single(alpha=0.5, gamma=1):
    """TD(0) prediction over one fixed episode A -> B -> C -> Goal."""
    states = ["A", "B", "C", "Goal"]
    V = {s: 0 for s in states}
    V["Goal"] = 1                            # terminal state value
    episode = [("A", "B"), ("B", "C"), ("C", "Goal")]
    rewards = {("C", "Goal"): 1}

    for s, s_next in episode:
        reward = rewards.get((s, s_next), 0)
        td_error = reward + gamma * V[s_next] - V[s]
        V[s] += alpha * td_error

    print("TD(0) Updated Values:", V)
    return V


def practical_8_td0_multi(alpha=0.1, gamma=0.8, n_episodes=5):
    """TD(0) prediction repeated over n_episodes of the same chain."""
    states = ["A", "B", "C", "Goal"]
    V = {s: 0 for s in states}
    V["Goal"] = 1
    transitions = {"A": "B", "B": "C", "C": "Goal"}
    rewards = {("C", "Goal"): 1}

    for _ in range(n_episodes):
        state = "A"
        while state != "Goal":
            next_state = transitions[state]
            reward = rewards.get((state, next_state), 0)
            td_error = reward + gamma * V[next_state] - V[state]
            V[state] += alpha * td_error
            state = next_state
        print(V)

    print("TD(0) Updated Values:", n_episodes, ":", V)
    return V


def main():
    """Run every practical in order (same sequence as the original paste)."""
    practical_1_linear_q_learning()
    practical_2_grid_q_learning()
    practical_3_mdp_value_iteration()
    practical_4_grid_value_iteration()
    practical_5_q_learning_with_values()
    practical_6_sarsa()
    practical_7_monte_carlo()
    practical_8_td0_single()
    practical_8_td0_multi()


if __name__ == "__main__":
    main()
Optional Gist Settings
Gist Name/Title:
Category:
None
Cryptocurrency
Cybersecurity
Fixit
Food
Gaming
Haiku
Help
History
Housing
Jokes
Legal
Money
Movies
Music
Pets
Photo
Science
Software
Source Code
Spirit
Sports
Travel
TV
Writing
Syntax Highlighting:
None
Bash
C
C++
C#
CSS
HTML
Java
JavaScript
Lua
Objective C
Perl
PHP
Python
Ruby
JSON
Swift
Markdown
ActionScript
Ada
Apache Log
AppleScript
ASM (NASM)
ASP
Bash
C
C for Macs
CAD DCL
CAD Lisp
C++
C#
ColdFusion
CSS
D
Delphi
Diff
Batch
Eiffel
Fortran
FreeBasic
Game Maker
HTML
INI file
Java
JavaScript
Lisp
Lua
MPASM
MySQL
NullSoft Installer
Objective C
OCaml
Openoffice BASIC
Oracle 8
Pascal
Perl
PHP
Python
QBasic
Robots
Ruby
Scheme
Smarty
SQL
VisualBasic
VB.NET
VisualFoxPro
XML
AutoIt
Blitz Basic
BNF
Erlang
Genero
Groovy
Haskell
Inno Script
Latex
Linden Scripting
MatLab
M68000 Assembler
mIRC
Rails
PL/SQL
Smalltalk
TCL
Z80 Assembler
ABAP
ActionScript 3
APT Sources
Avisynth
Basic4GL
BibTeX
BrainFuck
BOO
CFDG
C Intermediate Language
CMake
COBOL
DCS
DIV
DOT
Email
FO Language
GetText
OpenGL Shading
Ruby Gnuplot
HQ9 Plus
IDL
INTERCAL
IO
Java 5
KiXtart
Clone C
Clone C++
Loco Basic
LOL Code
Lotus Formulas
Lotus Script
LScript
Make
Modula 3
MXML
Oberon 2
OCaml Brief
Oracle 11
Per
PHP Brief
Pic 16
Pixel Bender
POV-Ray
PowerShell
Progress
Prolog
Properties
ProvideX
REBOL
REG
SAS
Scala
Scilab
SdlBasic
Tera Term
thinBasic
T-SQL
TypoScript
VeriLog
VHDL
VIM
Visual Pro Log
WhiteSpace
WHOIS
Winbatch
Xorg Config
XPP
Pawn
4CS
6502 ACME Cross Assembler
6502 Kick Assembler
6502 TASM/64TASS
Motorola 68000 HiSoft Dev
ALGOL 68
autoconf
Autohotkey
Awk
Cuesheet
ChaiScript
Clojure
C++ (with Qt extensions)
E
ECMAScript
Formula One
F#
GAMBAS
GDB
Genie
Go
GwBasic
HicEst
Icon
J
jQuery
Liberty BASIC
Logtalk
MagikSF
MapBasic
MIX Assembler
Modula 2
newLISP
Objeck Programming Language
Oz
Delphi Prism (Oxygene)
Oz
PCRE
Perl 6
OpenBSD PACKET FILTER
Pike
PostgreSQL
PowerBuilder
PureBasic
q/kdb+
RPM Spec
R
SystemVerilog
Vala
Unicon
Vala
XBasic
ZXBasic
UnrealScript
HTML 5
ProFTPd
BASCOM AVR
C: Loadrunner
CoffeeScript
EPC
Falcon
LLVM
PyCon
YAML
FreeSWITCH
ARM
Asymptote
DCL
DCPU-16
Haxe
LDIF
Nagios
Octave
ParaSail
PARI/GP
Python for S60
Rexx
SPARK
SPARQL
StoneScript
UPC
Urbi
Vedit
AIMMS
Chapel
Dart
Easytrieve
ISPF Panel Definition
JCL
Nginx
Nim
PostScript
QML
Racket
RBScript
Rust
SCL
StandardML
VBScript
C (WinAPI)
C++ (WinAPI)
NetRexx
JSON
Swift
SuperCollider
Julia
Blitz3D
BlitzMax
SQF
Puppet
Filemaker
Euphoria
PL/I
Open Object Rexx
Markdown
Kotlin
Ceylon
Arduino
YARA
TypeScript
Mercury
MetaPost
MK-61/52
Phix
Roff Manpage
SSH Config
TeXgraph
Xojo
KSP (Kontakt Script)
GDScript
Godot GLSL
None
Tags:
Gist Exposure:
Public
Unlisted
Private
Gist Expiration:
Never
Burn after read
10 Minutes
1 Hour
1 Day
1 Week
2 Weeks
1 Month
6 Months
1 Year
Password
Enabled
Disabled
Folder:
(members only)
Burn after read
Create New Gist
You are currently not logged in; this means you cannot edit or delete anything you paste.
Sign Up
or
Login
Public Gists
Untitled
None | 23 minutes ago | 3 Views
******** TRANSFER BUG PAYPAL TRF CASHAPP TRF SKRILL TRF
None | 6 hours ago | 15 Views
cater
None | 8 hours ago | 13 Views
Cp mega
None | 8 hours ago | 15 Views
Untitled
None | 23 hours ago | 23 Views
Untitled
None | 1 day ago | 27 Views
sleep
None | 1 day ago | 32 Views
Not a member of GistPad yet?
Sign Up
, it unlocks many cool features!
We use cookies for various purposes including analytics. By continuing to use GistPad, you agree to our use of cookies as described in the
Privacy Policy
.
OK, I Understand