别再死记硬背DQN公式了!用Python+PyTorch手搓一个玩《贪吃蛇》的智能体,从零理解经验回放与目标网络
# 用Python+PyTorch手搓DQN玩《贪吃蛇》：从零理解经验回放与目标网络

在强化学习领域，Deep Q-Network（DQN）堪称里程碑式的突破。但很多初学者面对公式推导时，容易陷入“一看就会，一写就废”的困境。本文将带您用PyTorch从零实现一个能玩《贪吃蛇》的DQN智能体，通过直观的游戏场景理解经验回放和目标网络两大核心技术。

## 1. 环境搭建与游戏逻辑

首先，我们需要构建《贪吃蛇》的游戏环境。与OpenAI Gym不同，这里我们选择手动实现游戏逻辑，这能帮助更深入理解环境与智能体的交互机制。

```python
import pygame
import numpy as np
from collections import deque
import random

# 四个移动方向的常量
RIGHT, DOWN, LEFT, UP = range(4)


class SnakeGame:
    def __init__(self, width=400, height=400, block_size=20):
        self.width = width
        self.height = height
        self.block_size = block_size
        self.reset()

    def reset(self):
        self.snake = deque([[self.width//2, self.height//2]])
        self.direction = RIGHT
        self.food = self._place_food()
        self.score = 0
        self.frame_iteration = 0
        return self._get_state()

    def _place_food(self):
        while True:
            food = [
                random.randrange(0, self.width, self.block_size),
                random.randrange(0, self.height, self.block_size)
            ]
            if food not in self.snake:
                return food

    def _get_state(self):
        head = self.snake[0]

        # 相对位置特征
        point_l = [head[0] - self.block_size, head[1]]
        point_r = [head[0] + self.block_size, head[1]]
        point_u = [head[0], head[1] - self.block_size]
        point_d = [head[0], head[1] + self.block_size]

        # 危险信号
        danger_straight = (self.direction == RIGHT and self._is_collision(point_r)) or \
                          (self.direction == LEFT and self._is_collision(point_l)) or \
                          (self.direction == UP and self._is_collision(point_u)) or \
                          (self.direction == DOWN and self._is_collision(point_d))

        danger_right = (self.direction == RIGHT and self._is_collision(point_d)) or \
                       (self.direction == LEFT and self._is_collision(point_u)) or \
                       (self.direction == UP and self._is_collision(point_r)) or \
                       (self.direction == DOWN and self._is_collision(point_l))

        danger_left = (self.direction == RIGHT and self._is_collision(point_u)) or \
                      (self.direction == LEFT and self._is_collision(point_d)) or \
                      (self.direction == UP and self._is_collision(point_l)) or \
                      (self.direction == DOWN and self._is_collision(point_r))

        # 食物位置
        food_dir_x = 1 if self.food[0] > head[0] else (-1 if self.food[0] < head[0] else 0)
        food_dir_y = 1 if self.food[1] > head[1] else (-1 if self.food[1] < head[1] else 0)

        return np.array([
            danger_straight,
            danger_right,
            danger_left,
            self.direction == LEFT,
            self.direction == RIGHT,
            self.direction == UP,
            self.direction == DOWN,
            food_dir_x,
            food_dir_y
        ], dtype=int)

    def _is_collision(self, point):
        return (point[0] < 0 or point[0] >= self.width or
                point[1] < 0 or point[1] >= self.height or
                point in list(self.snake)[:-1])

    def step(self, action):
        self.frame_iteration += 1

        # 1. 移动蛇
        if action == 0:  # 直行
            new_dir = self.direction
        elif action == 1:  # 右转
            new_dir = {RIGHT: DOWN, DOWN: LEFT, LEFT: UP, UP: RIGHT}[self.direction]
        else:  # 左转
            new_dir = {RIGHT: UP, UP: LEFT, LEFT: DOWN, DOWN: RIGHT}[self.direction]

        self.direction = new_dir
        head = self.snake[0].copy()

        if self.direction == RIGHT:
            head[0] += self.block_size
        elif self.direction == LEFT:
            head[0] -= self.block_size
        elif self.direction == UP:
            head[1] -= self.block_size
        elif self.direction == DOWN:
            head[1] += self.block_size

        # 2. 检查游戏结束条件
        reward = 0
        game_over = False
        if self._is_collision(head) or self.frame_iteration > 100*len(self.snake):
            game_over = True
            reward = -10
            return self._get_state(), reward, game_over, self.score

        # 3. 放置新食物或移动蛇
        self.snake.appendleft(head)
        if head == self.food:
            self.score += 1
            reward = 10
            self.food = self._place_food()
        else:
            self.snake.pop()

        return self._get_state(), reward, game_over, self.score
```

这个实现有几个关键设计点：

- 状态表示：包含7个布尔特征（3个危险信号、4个当前方向）和2个整数特征（食物相对位置）
- 动作空间：3个动作（直行、右转、左转）
- 奖励设计：吃到食物+10，碰撞-10，其他情况0
- 终止条件：撞墙/撞自己，或超过100倍蛇长的步数

## 2.
DQN模型构建

接下来，我们实现DQN的核心组件。与原始Q-learning不同，DQN使用神经网络来近似Q函数，这使其能够处理高维状态空间。

```python
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import random
from collections import deque


class DQN(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, hidden_size)
        self.fc3 = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x


class DQNAgent:
    def __init__(self, state_size, action_size):
        self.state_size = state_size
        self.action_size = action_size
        self.memory = deque(maxlen=10000)
        self.gamma = 0.95  # 折扣因子
        self.epsilon = 1.0  # 探索率
        self.epsilon_min = 0.01
        self.epsilon_decay = 0.995
        self.learning_rate = 0.001
        self.batch_size = 64
        self.update_every = 5  # 更新目标网络的频率

        # 主网络和目标网络
        self.model = DQN(state_size, 64, action_size)
        self.target_model = DQN(state_size, 64, action_size)
        self.target_model.load_state_dict(self.model.state_dict())

        self.optimizer = optim.Adam(self.model.parameters(), lr=self.learning_rate)
        self.t_step = 0

    def remember(self, state, action, reward, next_state, done):
        self.memory.append((state, action, reward, next_state, done))

    def act(self, state):
        if random.random() < self.epsilon:
            return random.randrange(self.action_size)

        state = torch.FloatTensor(state).unsqueeze(0)
        with torch.no_grad():
            action_values = self.model(state)
        return torch.argmax(action_values).item()

    def learn(self):
        if len(self.memory) < self.batch_size:
            return

        # 从经验池中随机采样
        minibatch = random.sample(self.memory, self.batch_size)
        states = torch.FloatTensor([t[0] for t in minibatch])
        actions = torch.LongTensor([t[1] for t in minibatch])
        rewards = torch.FloatTensor([t[2] for t in minibatch])
        next_states = torch.FloatTensor([t[3] for t in minibatch])
        dones = torch.FloatTensor([t[4] for t in minibatch])

        # 计算当前Q值和目标Q值
        current_q = self.model(states).gather(1, actions.unsqueeze(1))
        next_q = self.target_model(next_states).max(1)[0].detach()
        target_q = rewards + (1 - dones) * self.gamma * next_q

        # 计算损失并更新
        loss = F.mse_loss(current_q.squeeze(), target_q)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # 更新探索率
        self.epsilon = max(self.epsilon_min, self.epsilon * self.epsilon_decay)

        # 定期更新目标网络
        self.t_step = (self.t_step + 1) % self.update_every
        if self.t_step == 0:
            self.target_model.load_state_dict(self.model.state_dict())
```

这个实现包含DQN的两个关键技术：

**经验回放（Experience Replay）**

- 将智能体的经验（状态、动作、奖励等）存储在回放缓冲区
- 训练时随机采样小批量经验
- 打破数据相关性，提高数据效率，使学习更稳定

**目标网络（Target Network）**

- 使用单独的目标网络计算目标Q值
- 定期（而非每次）更新目标网络参数
- 减少目标值波动，稳定训练过程

## 3. 训练流程与可视化

现在，我们将上述组件整合，实现完整的训练流程。为了直观理解DQN的学习过程，我们还会添加可视化功能。

```python
import matplotlib.pyplot as plt
from IPython import display


def train_dqn(episodes=1000):
    env = SnakeGame()
    agent = DQNAgent(state_size=len(env._get_state()), action_size=3)
    scores = []
    mean_scores = []
    total_steps = 0

    for e in range(episodes):
        state = env.reset()
        score = 0

        while True:
            # 选择并执行动作
            action = agent.act(state)
            next_state, reward, done, current_score = env.step(action)

            # 存储经验并学习
            agent.remember(state, action, reward, next_state, done)
            agent.learn()

            state = next_state
            score = current_score
            total_steps += 1

            if done:
                break

        scores.append(score)
        mean_score = np.mean(scores[-100:])
        mean_scores.append(mean_score)

        # 每100轮输出训练进度
        if e % 100 == 0:
            print(f"Episode: {e}, Score: {score}, Avg Score: {mean_score:.2f}, Epsilon: {agent.epsilon:.2f}")
            plot_training(scores, mean_scores)

    return scores, mean_scores


def plot_training(scores, mean_scores):
    display.clear_output(wait=True)
    plt.figure(figsize=(10, 5))
    plt.plot(scores, alpha=0.5, label='Score')
    plt.plot(mean_scores, label='Avg Score')
    plt.title('DQN Training Progress')
    plt.xlabel('Episode')
    plt.ylabel('Score')
    plt.legend()
    plt.grid(True)
    plt.show()


# 开始训练
scores, mean_scores = train_dqn(episodes=500)
```

训练过程中有几个关键观察点：

- 探索-利用平衡：初始阶段epsilon较高（探索为主），随着训练进行逐渐降低（更多利用学到的策略）
- 分数波动：初期分数波动大，随着学习稳定，平均分数逐渐上升
- 学习曲线：100轮平均分数是判断学习效果的关键指标

## 4.
高级技巧与优化

基础DQN实现后，我们可以通过以下技巧进一步提升性能：

### 4.1 双重DQN（Double DQN）

原始DQN存在Q值高估问题。双重DQN通过解耦动作选择和目标值计算来缓解这个问题：

```python
# 修改learn方法中的目标Q值计算
next_actions = self.model(next_states).max(1)[1].unsqueeze(1)
next_q = self.target_model(next_states).gather(1, next_actions).squeeze()
target_q = rewards + (1 - dones) * self.gamma * next_q
```

### 4.2 优先级经验回放

不是均匀采样，而是根据TD误差的绝对值赋予经验不同的采样概率：

```python
# 需要修改记忆存储和采样逻辑
class PrioritizedReplayBuffer:
    def __init__(self, capacity, alpha=0.6):
        self.capacity = capacity
        self.alpha = alpha
        self.buffer = []
        self.priorities = np.zeros(capacity)
        self.pos = 0

    def add(self, experience, error):
        max_prio = self.priorities.max() if self.buffer else 1.0

        if len(self.buffer) < self.capacity:
            self.buffer.append(experience)
        else:
            self.buffer[self.pos] = experience

        self.priorities[self.pos] = (error + 1e-5) ** self.alpha
        self.pos = (self.pos + 1) % self.capacity

    def sample(self, batch_size, beta=0.4):
        if len(self.buffer) == self.capacity:
            prios = self.priorities
        else:
            prios = self.priorities[:self.pos]

        probs = prios / prios.sum()
        indices = np.random.choice(len(self.buffer), batch_size, p=probs)
        samples = [self.buffer[idx] for idx in indices]

        # 计算重要性采样权重
        total = len(self.buffer)
        weights = (total * probs[indices]) ** (-beta)
        weights /= weights.max()

        return samples, indices, np.array(weights, dtype=np.float32)
```

### 4.3 卷积神经网络处理图像输入

如果使用原始像素作为输入，可以用CNN替代全连接网络：

```python
class DQN_CNN(nn.Module):
    def __init__(self, h, w, outputs):
        super().__init__()
        self.conv1 = nn.Conv2d(3, 16, kernel_size=5, stride=2)
        self.bn1 = nn.BatchNorm2d(16)
        self.conv2 = nn.Conv2d(16, 32, kernel_size=5, stride=2)
        self.bn2 = nn.BatchNorm2d(32)
        self.conv3 = nn.Conv2d(32, 32, kernel_size=5, stride=2)
        self.bn3 = nn.BatchNorm2d(32)

        # 计算线性层输入尺寸
        def conv2d_size_out(size, kernel_size=5, stride=2):
            return (size - (kernel_size - 1) - 1) // stride + 1

        convw = conv2d_size_out(conv2d_size_out(conv2d_size_out(w)))
        convh = conv2d_size_out(conv2d_size_out(conv2d_size_out(h)))
        linear_input_size = convw * convh * 32
        self.head = nn.Linear(linear_input_size, outputs)

    def forward(self, x):
        x = F.relu(self.bn1(self.conv1(x)))
        x = F.relu(self.bn2(self.conv2(x)))
        x = F.relu(self.bn3(self.conv3(x)))
        return self.head(x.view(x.size(0), -1))
```

### 4.4 超参数调优指南

DQN对超参数敏感，以下是调优建议：

| 超参数 | 推荐范围 | 影响说明 |
| --- | --- | --- |
| 学习率 | 1e-4 ~ 1e-3 | 太大导致不稳定，太小学习慢 |
| 折扣因子γ | 0.9 ~ 0.99 | 未来奖励的重要性 |
| 回放缓冲区大小 | 1e4 ~ 1e6 | 影响经验多样性 |
| 批次大小 | 32 ~ 256 | 太小噪声大，太大计算开销大 |
| 目标网络更新频率 | 100 ~ 10000步 | 更新太频繁导致不稳定 |
| ε初始值 | 1.0 | 初始探索率 |
| ε衰减率 | 0.99 ~ 0.999 | 控制探索衰减速度 |
| ε最小值 | 0.01 ~ 0.1 | 保持最小探索 |

在实际项目中，我发现以下几个技巧特别实用：

- 使用学习率预热：前1000步使用较小学习率，再逐步增大
- 梯度裁剪：防止梯度爆炸，例如 `torch.nn.utils.clip_grad_norm_(model.parameters(), 10)`
- 帧堆叠：将连续4帧作为输入，提供时序信息
- 奖励塑形：设计中间奖励引导学习（如朝向食物的奖励）