
Q-Learning完成迷宫任务
项目地址:Q-Learning完成迷宫任务 - CAWCAW的意思是鸦叫声,Guaik的Logo是一只长相奇特的乌鸦。https://caw.guaik.io/d/23-q-learning
# Q-Learning maze-solving demo (reconstructed from a collapsed single-line paste).
#
# A 6x6 grid maze (states S0..S35, row-major from the top-left). The agent
# starts at S0 and must reach the goal S35. An epsilon-greedy policy over a
# tabular action-value function Q is trained with the Q-learning update rule.
#
# NOTE(review): this was originally a Jupyter notebook; the "%matplotlib inline"
# magic is not valid plain-Python syntax, so it is kept as a comment.
# %matplotlib inline

import numpy as np
import matplotlib.pyplot as plt
from matplotlib import animation
from IPython.display import HTML
import matplotlib.cm as cm

# ---------------------------------------------------------------------------
# Draw the initial maze
# ---------------------------------------------------------------------------
fig = plt.figure(figsize=(6, 6))
ax = plt.gca()

# Red walls of the maze. Each call draws one wall segment as [x0, x1], [y0, y1].
plt.plot([1, 1], [4, 3], color='red', linewidth=2)
plt.plot([1, 2], [3, 3], color='red', linewidth=2)
plt.plot([2, 2], [3, 2], color='red', linewidth=2)
plt.plot([1, 2], [2, 2], color='red', linewidth=2)
plt.plot([1, 1], [2, 1], color='red', linewidth=2)
plt.plot([1, 2], [5, 5], color='red', linewidth=2)
plt.plot([2, 2], [5, 4], color='red', linewidth=2)
plt.plot([2, 3], [4, 4], color='red', linewidth=2)
plt.plot([3, 3], [4, 3], color='red', linewidth=2)
plt.plot([3, 4], [3, 3], color='red', linewidth=2)
plt.plot([2, 2], [1, 0], color='red', linewidth=2)
plt.plot([2, 3], [1, 1], color='red', linewidth=2)
plt.plot([3, 3], [1, 2], color='red', linewidth=2)
plt.plot([3, 4], [2, 2], color='red', linewidth=2)
plt.plot([4, 4], [2, 1], color='red', linewidth=2)
plt.plot([3, 3], [6, 5], color='red', linewidth=2)
plt.plot([4, 4], [6, 4], color='red', linewidth=2)
plt.plot([5, 5], [5, 4], color='red', linewidth=2)
plt.plot([5, 6], [5, 5], color='red', linewidth=2)
plt.plot([5, 6], [3, 3], color='red', linewidth=2)
plt.plot([5, 6], [2, 2], color='red', linewidth=2)
plt.plot([5, 6], [1, 1], color='red', linewidth=2)

# Label every cell S0..S35, row-major from the top-left cell.
s = 0
for i in range(0, 6):
    for j in range(0, 6):
        plt.text(0.5 + j, 5.5 - i, 'S' + str(s), size=14, ha='center')
        s += 1
plt.text(0.5, 5.3, 'START', ha='center')
plt.text(5.5, 0.3, 'GOAL', ha='center')

# Plot range and hidden axes decorations.
ax.set_xlim(0, 6)
ax.set_ylim(0, 6)
# Fix: modern matplotlib requires booleans here, not the deprecated string 'off'.
plt.tick_params(axis='both', which='both', bottom=False, top=False,
                labelbottom=False, right=False, left=False, labelleft=False)

# Current position S0, drawn as a green circle.
line, = ax.plot([0.5], [5.5], marker='o', color='g', markersize=40)

# Initial policy parameters theta_0, used to derive the initial random policy.
# Rows are states S0..S34; columns are the moves [up, right, down, left].
# np.nan marks a wall (the move is impossible). S35 is the goal and needs no row.
theta_0 = np.array([
    [np.nan, 1, 1, np.nan],       # S0
    [np.nan, 1, np.nan, 1],       # S1
    [np.nan, np.nan, 1, 1],       # S2
    [np.nan, np.nan, 1, np.nan],  # S3
    [np.nan, 1, 1, np.nan],       # S4
    [np.nan, np.nan, np.nan, 1],  # S5
    [1, 1, 1, np.nan],            # S6
    [np.nan, np.nan, 1, 1],       # S7
    [1, 1, np.nan, np.nan],       # S8
    [1, np.nan, 1, 1],            # S9
    [1, np.nan, 1, np.nan],       # S10
    [np.nan, np.nan, 1, np.nan],  # S11
    [1, np.nan, 1, np.nan],       # S12
    [1, 1, np.nan, np.nan],       # S13
    [np.nan, np.nan, 1, 1],       # S14
    [1, 1, np.nan, np.nan],       # S15
    [1, 1, 1, 1],                 # S16
    [1, np.nan, np.nan, 1],       # S17
    [1, 1, 1, np.nan],            # S18
    [np.nan, np.nan, np.nan, 1],  # S19
    [1, 1, 1, np.nan],            # S20
    [np.nan, 1, np.nan, 1],       # S21
    [1, 1, 1, 1],                 # S22
    [np.nan, np.nan, np.nan, 1],  # S23
    [1, np.nan, 1, np.nan],       # S24
    [np.nan, 1, 1, np.nan],       # S25
    [1, np.nan, np.nan, 1],       # S26
    [np.nan, np.nan, 1, np.nan],  # S27
    [1, 1, 1, np.nan],            # S28
    [np.nan, np.nan, np.nan, 1],  # S29
    [1, 1, np.nan, np.nan],       # S30
    [1, np.nan, np.nan, 1],       # S31
    [np.nan, 1, np.nan, np.nan],  # S32
    [1, 1, np.nan, 1],            # S33
    [1, 1, np.nan, 1],            # S34
    # [1, np.nan, np.nan, np.nan],  # S35 (goal; no actions needed)
])

# Initial action-value function Q. Multiplying by theta_0 makes the entries
# pointing into walls nan, so nan-aware reductions skip them.
[a, b] = theta_0.shape
Q = np.random.rand(a, b) * theta_0 * 0.1


def simple_convert_into_pi_from_theta(theta):
    """Convert parameter theta into a stochastic policy.

    Each row is normalised by its nan-sum so valid moves get equal
    probability; nan (wall) entries become probability 0.
    """
    [m, n] = theta.shape
    pi = np.zeros((m, n))
    for i in range(0, m):
        pi[i, :] = theta[i, :] / np.nansum(theta[i, :])
    pi = np.nan_to_num(pi)  # wall directions: nan -> 0 probability
    return pi


# Initial random (uniform over valid moves) policy pi_0.
pi_0 = simple_convert_into_pi_from_theta(theta_0)


def get_action(s, Q, epsilon, pi_0):
    """Pick an action index for state s with an epsilon-greedy rule.

    With probability epsilon, sample a random valid move from pi_0;
    otherwise take the action with the largest (non-nan) Q value.
    Returns 0=up, 1=right, 2=down, 3=left.
    """
    direction = ['up', 'right', 'down', 'left']
    if np.random.rand() < epsilon:
        # Explore: random valid action.
        next_direction = np.random.choice(direction, p=pi_0[s, :])
    else:
        # Exploit: greedy action w.r.t. Q (nanargmax skips wall entries).
        next_direction = direction[np.nanargmax(Q[s, :])]

    # Map the direction name back to its action index.
    if next_direction == 'up':
        action = 0
    elif next_direction == 'right':
        action = 1
    elif next_direction == 'down':
        action = 2
    elif next_direction == 'left':
        action = 3
    return action


def get_s_next(s, a, Q, epsilon, pi_0):
    """Return the state reached by taking action a in state s.

    The grid is 6 columns wide, so up/down move by +/-6 and right/left by +/-1.
    Q, epsilon and pi_0 are unused but kept for signature compatibility with
    get_action (callers pass the same argument list).
    """
    direction = ['up', 'right', 'down', 'left']
    next_direction = direction[a]
    if next_direction == 'up':
        s_next = s - 6
    elif next_direction == 'right':
        s_next = s + 1
    elif next_direction == 'down':
        s_next = s + 6
    elif next_direction == 'left':
        s_next = s - 1
    return s_next


def Q_learning(s, a, r, s_next, Q, eta, gamma):
    """Update Q in place with the Q-learning rule and return it.

    Q(s,a) += eta * (r + gamma * max_a' Q(s',a') - Q(s,a)),
    with the bootstrap term dropped when s_next is the terminal goal S35.
    """
    if s_next == 35:  # goal reached: no future value to bootstrap from
        Q[s, a] = Q[s, a] + eta * (r - Q[s, a])
    else:
        Q[s, a] = Q[s, a] + eta * (r + gamma * np.nanmax(Q[s_next, :]) - Q[s, a])
    return Q


def goal_maze_ret_s_a_Q(Q, epsilon, eta, gamma, pi):
    """Run one episode from S0 to the goal with Q-learning.

    Returns [s_a_history, Q] where s_a_history is the list of
    [state, action] pairs visited (the final entry is [35, nan]).
    """
    s = 0  # start state
    a = a_next = get_action(s, Q, epsilon, pi)  # initial action
    s_a_history = [[0, np.nan]]  # agent's movement record

    while True:
        a = a_next  # commit the action chosen for the current state
        s_a_history[-1][1] = a  # record it against the current state

        s_next = get_s_next(s, a, Q, epsilon, pi)
        # Append the next state; its action is unknown (nan) until chosen.
        s_a_history.append([s_next, np.nan])

        if s_next == 35:
            r = 1  # reward only at the goal
            a_next = np.nan
        else:
            r = 0
            a_next = get_action(s_next, Q, epsilon, pi)

        Q = Q_learning(s, a, r, s_next, Q, eta, gamma)

        if s_next == 35:
            break
        s = s_next

    return [s_a_history, Q]


# ---------------------------------------------------------------------------
# Training loop
# ---------------------------------------------------------------------------
eta = 0.1      # learning rate
gamma = 0.9    # discount factor
epsilon = 0.5  # initial exploration rate for epsilon-greedy

v = np.nanmax(Q, axis=1)  # per-state value estimate (max over actions)
is_continue = True
episode = 1
V = []                           # per-episode state values, for visualisation
V.append(np.nanmax(Q, axis=1))

while is_continue:
    print('回合数:' + str(episode))

    # Decay exploration so the policy becomes increasingly greedy.
    epsilon = epsilon / 2

    # Fix: the original comment said "Sarsa" here, but this runs Q-learning.
    # Run one Q-learning episode; get the trajectory and the updated Q.
    [s_a_history, Q] = goal_maze_ret_s_a_Q(Q, epsilon, eta, gamma, pi_0)

    new_v = np.nanmax(Q, axis=1)
    print(np.sum(np.abs(new_v - v)))  # L1 change in state values this episode
    v = new_v
    V.append(v)

    print('求解迷宫问题所需步数 ' + str(len(s_a_history) - 1))

    # Stop after 100 episodes.
    episode = episode + 1
    if episode > 100:
        break

print(Q)


# ---------------------------------------------------------------------------
# Animate the final trajectory through the maze
# ---------------------------------------------------------------------------
def init():
    """Reset the agent marker before the animation starts."""
    line.set_data([], [])
    return (line,)


def animate(i):
    """Draw frame i: place the agent marker on the i-th visited state."""
    state = s_a_history[i][0]
    x = (state % 6) + 0.5        # column -> x coordinate of the cell centre
    y = 5.5 - state // 6         # row -> y coordinate (row 0 is at the top)
    # Fix: set_data requires sequences in matplotlib >= 3.7, not scalars.
    line.set_data([x], [y])
    return (line,)


anim = animation.FuncAnimation(fig, animate, init_func=init,
                               frames=len(s_a_history), interval=100,
                               repeat=False)
HTML(anim.to_jshtml())

# ---------------------------------------------------------------------------
# (Disabled) animation of the learned state values over episodes
# ---------------------------------------------------------------------------
# def init():
#     line.set_data([], [])
#     return (line,)
#
# def animate(i):
#     for n in range(0, 6):
#         for j in range(0, 6):
#             if n == 5 and j == 5:
#                 line, = ax.plot([0.5 + j], [5.5 - n], marker="s",
#                                 color=cm.jet(1.0), markersize=40)
#             else:
#                 line, = ax.plot([0.5 + j], [5.5 - n], marker="s",
#                                 color=cm.jet(V[i][n * 6 + j]), markersize=40)
#     return (line,)
#
# anim = animation.FuncAnimation(fig, animate, init_func=init,
#                                frames=len(V), interval=100, repeat=False)
# HTML(anim.to_jshtml())
欢迎分享,转载请注明来源:内存溢出
微信扫一扫
支付宝扫一扫
评论列表(0条)