Created
May 20, 2026 02:37
-
-
Save LessUp/877cb35de78a8991d07ec2d1ecbfe339 to your computer and use it in GitHub Desktop.
cnn_train.py
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import cv2 | |
| import torch | |
| from torch import nn, optim | |
| from torch.utils.data import DataLoader | |
| from torchvision import transforms, datasets | |
# ---------------------------------------------------------------------------
# Hyper-parameters
# ---------------------------------------------------------------------------
batch_size = 64        # mini-batch size used by both DataLoaders
learning_rate = 0.001  # learning rate handed to the optimizer
epochs = 50            # maximum number of training epochs

# ---------------------------------------------------------------------------
# Input pipelines
# ---------------------------------------------------------------------------
# Training images are augmented with a random rotation of up to +/-10 degrees
# and a random shift of up to 10% along each axis, then converted to tensors
# and normalised with the MNIST mean (0.1307) and standard deviation (0.3081).
transform = transforms.Compose(
    [
        transforms.RandomRotation(degrees=10),
        transforms.RandomAffine(degrees=0, translate=(0.1, 0.1)),
        transforms.ToTensor(),
        transforms.Normalize((0.1307,), (0.3081,)),
    ]
)

# Test images are not augmented: tensor conversion and normalisation only.
test_transform = transforms.Compose(
    [
        transforms.ToTensor(),
        transforms.Normalize((0.1307,), (0.3081,)),
    ]
)

# download=True fetches the MNIST files into ./data when they are not already
# present, so no manual download step is required.
train_dataset = datasets.MNIST(
    root='./data',
    train=True,
    transform=transform,
    download=True,
)
test_dataset = datasets.MNIST(
    root='./data',
    train=False,
    transform=test_transform,
    download=True,
)

# Only the training batches are shuffled; the test order stays fixed.
train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=False)

criterion = nn.CrossEntropyLoss()

# ---------------------------------------------------------------------------
# Illustration only: print the two layer types the model is built from.
# The layers created here are throw-away objects and are not used for training.
# ---------------------------------------------------------------------------
# 2-D convolution: 1 input channel (grayscale), 32 output channels, 3x3 kernel.
# padding=1 keeps the spatial size unchanged, so a 28x28 input yields 32
# feature maps of 28x28.
print('模型结构:', nn.Conv2d(in_channels=1, out_channels=32, kernel_size=3, padding=1))

# 2-D max pooling: 2x2 window moved with stride 2. Each window is reduced to
# its maximum, which halves the spatial size (e.g. 28x28 -> 14x14).
print("池化层:", nn.MaxPool2d(kernel_size=2, stride=2))
| # 自定义图片预处理函数 | |
def preprocess_custom_image(image_path):
    """Convert a handwritten-digit image file into an MNIST-style model input.

    Pipeline: read (BGR) -> grayscale -> resize to 28x28 -> inverted binary
    threshold -> scale to [0, 1] -> normalise with mean 0.1307 / std 0.3081
    -> add channel and batch dimensions.

    The inverted threshold turns dark pixels (<= 127) into 1.0 and light
    pixels into 0.0, i.e. dark ink on a light background becomes bright
    strokes on a dark background, which is the polarity of MNIST digits.
    An image that is already bright-on-dark would therefore be inverted the
    wrong way by this function.

    Args:
        image_path: Path of the image file to load.

    Returns:
        A tuple ``(tensor_img, normalized)`` where ``tensor_img`` is a float32
        tensor of shape (1, 1, 28, 28) ready to be fed to the model and
        ``normalized`` is the 28x28 NumPy array holding the same normalised
        values (handy for visualisation).

    Raises:
        FileNotFoundError: If ``image_path`` does not point to an existing file.
        ValueError: If the file exists but OpenCV cannot decode it as an image.
    """
    import os  # local import: only needed for the existence check below

    # cv2.imread does not raise on failure, it returns None; check up front so
    # the caller gets a clear error instead of an opaque OpenCV assertion.
    if not os.path.isfile(image_path):
        raise FileNotFoundError(f"Image file not found: {image_path}")

    # 1. Read the image (OpenCV loads it as BGR) and convert it to grayscale.
    img = cv2.imread(image_path)
    if img is None:
        raise ValueError(f"Could not decode image file: {image_path}")
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

    # 2. Resize to the 28x28 input size of the network. This is a plain
    #    resize: a non-square source image is stretched, not padded.
    gray = cv2.resize(gray, (28, 28), interpolation=cv2.INTER_AREA)

    # 3. Binarise and invert in one step: pixels above 127 become 0, all
    #    others become 255. This also removes intermediate gray levels.
    _, thresh = cv2.threshold(gray, 127, 255, cv2.THRESH_BINARY_INV)

    # 4. Scale 0..255 to 0..1, then apply the same mean/std normalisation
    #    that the training transform uses.
    thresh = thresh / 255.0
    mean = 0.1307
    std = 0.3081
    thresh = (thresh - mean) / std

    # 5. Build the model input: 28x28 -> 1x28x28 (channel) -> 1x1x28x28 (batch).
    tensor_img = torch.tensor(thresh, dtype=torch.float32)
    tensor_img = tensor_img.unsqueeze(0).unsqueeze(0)

    return tensor_img, thresh
# Model definition: the CNN classifier that is trained further below.
class MNISTCNNModel(nn.Module):
    """Small convolutional classifier for 1x28x28 grayscale digit images.

    Shape flow for an input batch of shape (N, 1, 28, 28):

        conv1 (3x3, padding=1) + ReLU  -> (N, 32, 28, 28)
        max-pool 2x2, stride 2         -> (N, 32, 14, 14)
        conv2 (3x3, padding=1) + ReLU  -> (N, 64, 14, 14)
        max-pool 2x2, stride 2         -> (N, 64, 7, 7)
        flatten                        -> (N, 3136)
        fc1 + ReLU + dropout           -> (N, 128)
        fc2                            -> (N, 10)   raw class scores (logits)

    28 is the side length of the input image; the two poolings halve it twice
    (28 -> 14 -> 7), which is where the 64 * 7 * 7 input size of ``fc1`` comes
    from. The hidden width of 128 is a design choice: it reduces the 3136
    features step by step towards the 10 classes while keeping ``fc1`` at
    roughly 400k weights (3136 * 128).
    """

    def __init__(self):
        super().__init__()
        # Convolutions: extract image features.
        self.conv1 = nn.Conv2d(1, 32, kernel_size=3, padding=1)   # 1 -> 32 channels
        self.conv2 = nn.Conv2d(32, 64, kernel_size=3, padding=1)  # 32 -> 64 channels
        # Pooling: halves height and width; shared by both stages.
        self.pool = nn.MaxPool2d(2, 2)
        # Fully connected classifier head.
        self.fc1 = nn.Linear(64 * 7 * 7, 128)  # 28 / 2 / 2 = 7
        self.fc2 = nn.Linear(128, 10)          # one output per digit class
        self.relu = nn.ReLU()
        # Regularisation: zeroes 20% of the fc1 activations at random while
        # the module is in training mode; has no effect in eval mode.
        self.dropout = nn.Dropout(0.2)

    def forward(self, x):
        """Compute class scores for a batch of images.

        Args:
            x: Float tensor of shape (N, 1, 28, 28).

        Returns:
            Tensor of shape (N, 10) with unnormalised class scores.

        Raises:
            RuntimeError: If the spatial size of ``x`` is not 28x28, because
                the flattened features then no longer match ``fc1``.
        """
        # Convolution + pooling: 1x28x28 -> 64x7x7
        x = self.pool(self.relu(self.conv1(x)))
        x = self.pool(self.relu(self.conv2(x)))
        # Flatten everything except the batch dimension. Unlike
        # view(-1, 64 * 7 * 7), this never moves surplus spatial elements into
        # the batch dimension, so a wrongly sized input fails loudly in fc1.
        x = x.flatten(start_dim=1)
        # Fully connected layers with dropout in between.
        x = self.dropout(self.relu(self.fc1(x)))
        x = self.fc2(x)
        return x
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Create a freshly initialised network and move it to the chosen device.
# To continue from previously saved weights instead, load them here with
#   model.load_state_dict(torch.load("final_mnist_model.pth"))
model = MNISTCNNModel()
model = model.to(device)

# Adam updates the model parameters using the configured learning rate.
optimizer = optim.Adam(params=model.parameters(), lr=learning_rate)
def evaluate_model(model, test_loader, device):
    """Return the classification accuracy of ``model`` on ``test_loader``.

    The model is switched to eval mode and is left in eval mode afterwards;
    callers that keep training must call ``model.train()`` again.

    Args:
        model: Module mapping a batch of inputs to per-class scores of shape
            (batch, num_classes).
        test_loader: Iterable yielding ``(data, target)`` tensor pairs.
        device: Device the batches are moved to before the forward pass.

    Returns:
        Accuracy in percent (0-100) as a float. Returns 0.0 when the loader
        yields no samples instead of dividing by zero.
    """
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():  # no gradients needed for evaluation
        for data, target in test_loader:
            data, target = data.to(device), target.to(device)
            # Predicted class = index of the highest score in each row.
            predicted = model(data).argmax(dim=1)
            total += target.size(0)
            correct += (predicted == target).sum().item()
    if total == 0:
        return 0.0
    return 100 * correct / total
# ---------------------------------------------------------------------------
# Training
# ---------------------------------------------------------------------------
# Best test accuracy seen so far; starts at 0 because the model is untrained.
best_accuracy = 0
print(f'Last Accuracy: {best_accuracy:.2f}%')

for epoch in range(epochs):
    # evaluate_model() leaves the model in eval mode, so switch back to
    # training mode (dropout active) at the start of every epoch.
    model.train()
    running_loss = 0.0
    for batch_idx, (data, target) in enumerate(train_loader):
        data, target = data.to(device), target.to(device)
        optimizer.zero_grad()
        output = model(data)
        loss = criterion(output, target)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()

    # Mean loss over all batches of this epoch.
    print(f'Epoch [{epoch + 1}/{epochs}], Loss: {running_loss / len(train_loader):.4f}')

    # Evaluate on the test set every 5 epochs.
    if (epoch + 1) % 5 == 0:
        accuracy = evaluate_model(model, test_loader, device)
        print(f'Epoch [{epoch + 1}/{epochs}] - Test Accuracy: {accuracy:.2f}%')

        # Keep a checkpoint of the best model seen so far.
        if accuracy > best_accuracy:
            best_accuracy = accuracy
            torch.save(model.state_dict(), 'best_mnist_model.pth')

        # Stop early once the test accuracy reaches 99%.
        if accuracy >= 99:
            print(f'Accuracy reached {accuracy:.2f}%, stopping training...')
            break

print(f'Training completed. Best accuracy: {best_accuracy:.2f}%')

# Save the weights of the final epoch (which may differ from the best
# checkpoint written above).
torch.save(model.state_dict(), 'final_mnist_model.pth')

# ---------------------------------------------------------------------------
# Try the trained model on a custom image
# ---------------------------------------------------------------------------
# Replace with the path of your own handwritten-digit image
# (for example "my_digit_7.png").
image_path = "8.png"

# Convert the image into a 1x1x28x28 model input.
tensor_img, processed_img = preprocess_custom_image(image_path)

# Make sure dropout is disabled for inference. Without this call the model is
# only in eval mode when the last executed epoch happened to be an evaluation
# epoch (every 5th), which stops being true for other values of `epochs`.
model.eval()
with torch.no_grad():
    tensor_img = tensor_img.to(device)
    output = model(tensor_img)
    _, predicted = torch.max(output, 1)

# Print the predicted digit.
print(f"自定义图片预测结果:{predicted.item()}")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment