Python SVM手写数字识别

2023年6月16日上午10:07 • 人工智能 • 阅读 92

Python 基于sklearn – svm实现MNIST手写数字识别

一、数据集：MNIST

数据地址：http://yann.lecun.com/exdb/mnist/

训练数据：MNIST中的60000张图像，0-9的手写数字

测试数据：MNIST中的10000张图像，0-9的手写数字

注意：训练和测试代码直接使用了ubyte格式数据，即只对原数据进行了解压，没有先转换为png/jpg，但也附上png数据转换代码。

数据格式转换：从ubyte转换到png格式，存储路径格式：mnist_train/<label>/<label>_<序号>.png（测试集对应 mnist_test/<label>/<label>_<序号>.png），代码如下：

提示：PIL 已停止维护，不支持新版本的 Python，需要安装其替代库 Pillow（pip install Pillow），导入方式仍为 from PIL import Image。

import numpy as np
import struct

from PIL import Image
import os

def read_idx_images(path):
    """Load an IDX3 (ubyte) image file as a uint8 array.

    The 16-byte big-endian header stores the magic number, the image count
    and the row/column size.  The payload length is derived from that header
    (file size minus the 16 header bytes) instead of being hard-coded for
    each file.

    Args:
        path: Path to an uncompressed ``*-images.idx3-ubyte`` file.

    Returns:
        Array of shape ``(numImages, 1, numRows, numColumns)``, dtype uint8.

    Raises:
        ValueError: If the file holds fewer pixel bytes than the header
            announces.
    """
    with open(path, 'rb') as f:
        data_buf = f.read()

    header_fmt = '>IIII'
    _magic, num_images, num_rows, num_columns = struct.unpack_from(
        header_fmt, data_buf, 0)
    pixels = np.frombuffer(
        data_buf,
        dtype=np.uint8,
        count=num_images * num_rows * num_columns,
        offset=struct.calcsize(header_fmt))
    return pixels.reshape(num_images, 1, num_rows, num_columns)


def read_idx_labels(path):
    """Load an IDX1 (ubyte) label file as an int64 array.

    The 8-byte big-endian header stores the magic number and the label
    count; the payload is one unsigned byte per label.

    Args:
        path: Path to an uncompressed ``*-labels.idx1-ubyte`` file.

    Returns:
        1-D int64 array with one entry per label.

    Raises:
        ValueError: If the file holds fewer labels than the header announces.
    """
    with open(path, 'rb') as f:
        label_buf = f.read()

    header_fmt = '>II'
    _magic, num_labels = struct.unpack_from(header_fmt, label_buf, 0)
    labels = np.frombuffer(
        label_buf,
        dtype=np.uint8,
        count=num_labels,
        offset=struct.calcsize(header_fmt))
    # int64 so the labels can be used directly as list indices below.
    return labels.astype(np.int64)


def export_pngs(data_file, label_file, datas_root):
    """Write every image of an IDX data set as ``<root>/<label>/<label>_<k>.png``.

    ``k`` is a 1-based counter kept separately for each digit.

    Args:
        data_file: Path to the IDX3 image file.
        label_file: Path to the matching IDX1 label file.
        datas_root: Output directory; it and its ``0`` .. ``9`` sub-directories
            are created when missing.

    Raises:
        ValueError: If the image and label files disagree on the sample count.
    """
    datas = read_idx_images(data_file)
    labels = read_idx_labels(label_file)
    if len(datas) != len(labels):
        raise ValueError(
            'image/label count mismatch: {} images, {} labels'.format(
                len(datas), len(labels)))

    # One sub-directory per digit; makedirs also creates datas_root itself.
    for digit in range(10):
        os.makedirs(os.path.join(datas_root, str(digit)), exist_ok=True)

    count = [1] * 10  # next file index for each digit
    for image, label in zip(datas, labels):
        label = int(label)
        file_name = os.path.join(
            datas_root, str(label),
            str(label) + '_' + str(count[label]) + '.png')
        count[label] += 1
        # image has shape (1, rows, cols); save the single grayscale plane.
        Image.fromarray(image[0]).save(file_name)


if __name__ == '__main__':
    export_pngs('train-images.idx3-ubyte', 'train-labels.idx1-ubyte',
                'mnist_train')
    export_pngs('t10k-images.idx3-ubyte', 't10k-labels.idx1-ubyte',
                'mnist_test')

转换后的数据如下图

二、训练模型

import numpy as np
import struct
import pickle
from sklearn import svm
###用于做数据预处理
from sklearn import preprocessing

##读取数据集
def load_mnist_train(labels_path, images_path):
    """Read an IDX label file and an IDX image file into NumPy arrays.

    The image matrix shape is taken from the image file header (count, rows,
    columns) rather than a hard-coded 784, so images of any size load
    correctly.

    Args:
        labels_path: Path to the uncompressed ``idx1-ubyte`` label file.
        images_path: Path to the uncompressed ``idx3-ubyte`` image file.

    Returns:
        Tuple ``(images, labels)``: ``images`` is a uint8 array of shape
        ``(num_images, rows * cols)`` with one flattened image per row, and
        ``labels`` is a 1-D uint8 array.

    Raises:
        ValueError: If the pixel data does not match the header, or the
            number of labels differs from the number of images.
    """
    with open(labels_path, 'rb') as lbpath:
        # 8-byte big-endian header: magic number, label count.
        _magic, _n = struct.unpack('>II', lbpath.read(8))
        labels = np.fromfile(lbpath, dtype=np.uint8)
    with open(images_path, 'rb') as imgpath:
        # 16-byte big-endian header: magic number, image count, rows, cols.
        _magic, num, rows, cols = struct.unpack('>IIII', imgpath.read(16))
        images = np.fromfile(imgpath, dtype=np.uint8).reshape(num, rows * cols)
    if len(labels) != num:
        raise ValueError(
            'image/label count mismatch: {} images, {} labels'.format(
                num, len(labels)))
    return images, labels

if __name__ == '__main__':
    # Load the training set.
    labels_path = "train-labels.idx1-ubyte"
    images_path = "train-images.idx3-ubyte"
    train_images, train_labels = load_mnist_train(labels_path, images_path)

    # Standardize the features. The fitted scaler is kept (not discarded) so
    # the exact training mean/variance can be persisted next to the model.
    scaler = preprocessing.StandardScaler()
    X = scaler.fit_transform(train_images)
    X_train = X[0:60000]
    y_train = train_labels[0:60000]

    # Define and train the model.
    model_svc = svm.SVC()
    model_svc.fit(X_train, y_train)

    # Save the model; `with` closes the file even if pickling fails.
    with open("model.pickle", "wb") as model_file:
        pickle.dump(model_svc, model_file)

    # Save the fitted scaler as well, so later predictions can apply the
    # same standardization that the model was trained with.
    with open("scaler.pickle", "wb") as scaler_file:
        pickle.dump(scaler, scaler_file)

三、测试模型

import numpy as np
import struct
import pickle
###用于做数据预处理
from sklearn import preprocessing

def test(images_path, labels_path, modelPath, scalerPath=None):
    """Evaluate a pickled classifier on an IDX-format test set.

    Prints the true and predicted label of every evaluated sample followed
    by the overall accuracy.

    Args:
        images_path: Path to the uncompressed ``idx3-ubyte`` image file.
        labels_path: Path to the uncompressed ``idx1-ubyte`` label file.
        modelPath: Pickle file containing the fitted model.
        scalerPath: Optional pickle file containing a fitted scaler (an
            object with ``transform``).  When it is ``None`` or the file
            does not exist, a new ``StandardScaler`` is fitted on the test
            images, which is the previous behaviour.

    Returns:
        Array with the predicted label of each evaluated sample.

    Raises:
        ValueError: If the pixel data does not match the image header, or
            the number of labels differs from the number of images.
    """
    # Read the test labels and images; the image shape comes from the header.
    with open(labels_path, 'rb') as lbpath:
        _magic, _n = struct.unpack('>II', lbpath.read(8))
        test_labels = np.fromfile(lbpath, dtype=np.uint8)
    with open(images_path, 'rb') as imgpath:
        _magic, n_images, rows, cols = struct.unpack('>IIII', imgpath.read(16))
        test_images = np.fromfile(imgpath, dtype=np.uint8).reshape(
            n_images, rows * cols)
    if len(test_labels) != n_images:
        raise ValueError(
            'image/label count mismatch: {} images, {} labels'.format(
                n_images, len(test_labels)))

    # Load the model.
    # SECURITY: pickle.load executes arbitrary code from the file; only load
    # model/scaler files that come from a trusted source.
    with open(modelPath, "rb") as model_file:
        model_svc = pickle.load(model_file)

    # Prefer the scaler fitted at training time: standardizing the test set
    # with its own statistics feeds the model differently scaled features.
    scaler = None
    if scalerPath is not None:
        try:
            with open(scalerPath, "rb") as scaler_file:
                scaler = pickle.load(scaler_file)
        except FileNotFoundError:
            print("Warning: scaler file not found, fitting a new scaler "
                  "on the test images instead:", scalerPath)
    if scaler is None:
        x = preprocessing.StandardScaler().fit_transform(test_images)
    else:
        x = scaler.transform(test_images)

    # Predict and score.
    x_test = x[0:10000]
    y_test = test_labels[0:10000]
    num = model_svc.predict(x_test)
    # Iterate over the actual samples so sets smaller than 10000 do not fail.
    for real, predicted in zip(y_test, num):
        print("Real:", real, "Predict:", predicted)
    # Accuracy from the predictions already computed, avoiding a second
    # full prediction pass.
    print("Accuracy：", float(np.mean(num == y_test)))
    return num


if __name__ == '__main__':
    images_path = "t10k-images.idx3-ubyte"
    labels_path = "t10k-labels.idx1-ubyte"
    modelPath = "model.pickle"
    # Uses the scaler saved by training when the file exists; otherwise
    # test() falls back to fitting a scaler on the test images.
    num = test(images_path, labels_path, modelPath, scalerPath="scaler.pickle")

四、参考资料

图片格式转换： MNIST数据集格式ubyte转png_haoji007的博客-CSDN博客_ubyte

模型训练及测试：图像处理基本库的学习笔记2–SVM，MATLAB，Tensorflow下分别对mnist数据集进行训练，并且进行预测 – 灰信网（软件开发博客聚合）

sklearn-svm模型参数设置：机器学习笔记(3)-sklearn支持向量机SVM – 简书

模型保存和调用：基于sklearn的SVM模型保存与调用_hellosonny的博客-CSDN博客_svm保存模型

单个图片测试：基于svm机器学习的手写数字识别_Brinshy的博客-CSDN博客_基于svm的手写数字识别

Original: https://blog.csdn.net/weixin_43349279/article/details/124507662
Author: 跑路小饼
Title: Python SVM手写数字识别

原创文章受到原创版权保护。转载请注明出处：https://www.johngo689.com/623227/

转载文章受原作者版权保护。转载请注明原作者出处！

人工智能

【自取】最近整理的，有需要可以领取学习：

Linux核心资料大放送~

全栈面试题汇总（持续更新&可下载）

一个提高学习100%效率的工具！

【超详细】深度学习面试题目！

LeetCode Python刷题答案下载！

LeetCode Java版刷题答案下载！

LeetCode C++ 版本，抓紧保存！

LeetCode GO语言刷题答案下载！

每个种类都整整齐齐？图片分类 STL-10 数据集使用指南

图片分类（Image Classification）是机器学习中重要且基础的任务之一，可广泛应用在各类 AI 功能中。这次给大家介绍的 STL10数据集，是图片分类任务早期常用的…

人工智能 2023年7月1日
0075
NLP 工具

抵扣说明： 1.余额是钱包充值的虚拟货币，按照1:1的比例进行支付金额的抵扣。2.余额无法直接购买下载，可以购买VIP、C币套餐、付费专栏及课程。 Original: https:…

人工智能 2023年5月28日
0071
机器学习（七）线性回归

线性回归原理 * 回归原理第一类回归加权线性回归岭回归和逐步线形回归原理大概就是如图所示，画线的方法有很多种，我们期待这条线具有非常好的泛化，显然绿色的线就有一点过拟合…

人工智能 2023年6月17日
0061
深度学习中卷积&池化&全连接层及其参数量和计算量

面试20问 1、请你自我介绍一下你自己回答提示：一般人回答这个问题过于平常，只说姓名、年龄、爱好、工作经验，这些在简历上都有，其实，企业最希望知道的是求职者能否胜任工作，包括：最…

人工智能 2023年7月28日
0047
边缘计算：客户端 + 人工智能

有人说人工智能会是继互联网之后的下一次工业革命，不可否认，大到汽车、小到手表，AI技术已经广泛应用在我们周围，随便一个APP都试图跟AI发生点关系以证明自己的与时俱进。 AI的普及…

人工智能 2023年5月26日
0072
【语音增强】基于matlab多维谱自适应小波语音信号去噪【含Matlab源码 1972期】

⛄一、自适应小波语音信号去噪 1 引言在传输过程中，语音信号容易受到环境噪声等语音的干扰，降低了语音通信的质量，影响了语音处理系统的工作。因此，语音净化技术在现代语音通信和数字音频…

人工智能 2023年5月25日
0085
【经典永不过时】数据分析网红级别的项目案例分享【超详细】

大家早上好，本人姓吴，如果觉得文章写得还行的话也可以叫我吴老师。欢迎大家跟我一起走进数据分析的世界，一起学习！本周给大家分享的数据分析案例是泰坦尼克号幸存者预测的项目，没记错的话…

人工智能 2023年6月19日
0071
【读书笔记】《利用Python进行数据分析》第2版_第八章数据规整：连接、联合与重塑

使用PeriodIndex将数据处理后形成Idata 多时间序列的长格式，或具有两个或更多个键的数据（键date和item）使用DataFrame的 pivot方法将数据处理为按…

人工智能 2023年7月18日
0075
Python 基于OpenCV+face_recognition实现人脸捕捉与人脸识别（照片对比）

1.安装包依赖与上篇通过摄像头动态识别人脸一样，先下载好opencv-python、face-recognition，这里因为使用的是照片对比的方式，特意使用tkinter画了一…

人工智能 2023年6月25日
0075
Hopfield神经网络解决TSP问题（Java）

目录一、网络原理二、算法步骤三、代码实现（Java）四、所用jar包一、网络原理 Hopfield神经网络（HNN）是一种全互联反馈神经网络，它的每一个神经元都和其他神…

人工智能 2023年7月14日
0068
模式识别与图像处理课程实验一：图像处理实验(颜色算子实验、Susan、Harris角点检测实验、 sobel边缘算子检测实验)

模式识别与图像处理课程实验一：图像处理实验–>> 颜色算子实验、Susan、Harris角点检测实验、 sobel边缘算子检测实验一、实验内容二、颜…

人工智能 2023年7月25日
0061
录音文件的比特率的计算方法

在Android Q中创建录音文件_需要使用MediaRecorder类。首先需要在应用程序的AndroidManifest.xml _文件_中声明WRITE_EXTERNAL…

人工智能 2023年5月25日
0073
粒子群算法Python代码实现

1.引言粒子群优化算法起源于对鸟群觅食活动的分析。鸟群在觅食的时候通常会毫无征兆的聚拢，分散，以及改变飞行的轨迹，但是在不同个体之间会十分默契的保持距离。所以粒子群优化算法模拟鸟…

人工智能 2023年7月27日
0051
Swin-Ttransformer Object Detection 环境配置及训练

Swin-Ttransformer Object Detection 环境配置及训练环境配置 demo COCO训练源码地址：https://github.com/SwinTr…

人工智能 2023年7月12日
0073
没有显卡怎么使用anaconda配置tensorflow深度学习环境

没有显卡怎么使用anaconda配置tensorflow深度学习环境 🔱环境内容 🔱一、Anaconda环境配置 📍①、Anaconda的下载（可跳过） 📍②、Anaconda的安…

人工智能 2023年5月23日
00124
机器学习练习题

1.在NumPy中创建一个元素均为0的数组可以使用（）函数。 [A]A.zeros( ) B.arange( ) C.linspace( ) D.logspace( )2.通常（…

人工智能 2023年6月16日
0063

2024 年 5 月
一	二	三	四	五	六	日
		1	2	3	4	5
6	7	8	9	10	11	12
13	14	15	16	17	18	19
20	21	22	23	24	25	26
27	28	29	30	31

Python SVM手写数字识别

大家都在看