# 算数验证码识别（运算符为中文：加、减、乘）

At work I ran into a situation where I needed to crack arithmetic verification codes (CAPTCHAs) automatically.

Step 1: Download a batch of CAPTCHA images and save them to a local folder.

def getImg(count=10, save_dir='img_test', timeout=10):
    """Download CAPTCHA images and save them as numbered JPEG files.

    Requests the CheckCodeImageServlet endpoint ``count`` times and writes
    each response to ``<save_dir>/<index>.jpg`` (index starting at 0),
    printing the number of images saved so far after each one.

    Args:
        count: Number of images to download.
        save_dir: Folder that receives the images; created if it is missing.
        timeout: Seconds to wait for each HTTP response before giving up.

    Raises:
        requests.RequestException: If a request times out or the server
            answers with an HTTP error status.
    """
    import os

    os.makedirs(save_dir, exist_ok=True)
    for index in range(count):
        # NOTE(review): verify=False disables TLS certificate checking. It is
        # kept because the original relied on it; confirm whether the server's
        # certificate can be validated instead.
        response = requests.get(
            'https://cmegsb.cma.org.cn/national_project/CheckCodeImageServlet',
            verify=False,
            timeout=timeout,  # without a timeout a stalled server hangs the loop
        )
        # Fail loudly on an error page instead of handing it to Image.open.
        response.raise_for_status()
        image = Image.open(BytesIO(response.content))
        image.save(os.path.join(save_dir, '%s.jpg' % index))
        print(index + 1)

def _parse_operands(text):
    """Return the two integer operands of an OCR result, or None if invalid.

    A result is accepted only when it contains one of the Chinese operators
    (乘, 加, 减) and, ignoring spaces, ends with '=?'. The text left after
    removing '=' and '?' is split on the operator, and the first two pieces
    must both convert with ``int``.
    """
    text = str(text)
    if not text.replace(' ', '').endswith('=?'):
        return None
    expression = text.replace('=', '').replace('?', '')
    # Operators are tried in this order: 乘 first, then 加, then 减.
    for operator in ('乘', '加', '减'):
        if operator in text:
            parts = expression.split(operator)
            try:
                return int(parts[0]), int(parts[1])
            except ValueError:
                return None
    return None


def begin():
    """Run OCR over the image folder and move the readable CAPTCHAs aside.

    For each binarization threshold in turn, every file in ``D:\\img_test`` is
    converted to grayscale, thresholded, written to ``new_img.png`` and passed
    to the muggle_ocr SDK. When the recognized text looks like
    ``<int><operator><int>=?``, the source image is moved to ``./img_save``
    under a name built from the recognized text (``=`` replaced by ``a``,
    ``?`` by ``b``) plus an MD5-based unique suffix. Anything else counts as
    an error. A summary line is printed after each threshold pass.

    Files moved during one pass are no longer in the folder for later passes,
    so each threshold only retries the images that earlier ones failed on.
    """
    # Folder holding the downloaded CAPTCHA images.
    path = r"D:\img_test"
    save_dir = './img_save'
    import muggle_ocr
    sdk = muggle_ocr.SDK(model_type=muggle_ocr.ModelType.OCR)
    # Without the target folder every rename would fail and be counted as an error.
    os.makedirs(save_dir, exist_ok=True)
    # Candidate thresholds; per the original author's notes these were found by
    # trial and error, with 115, 150 and 127 being the most commonly useful.
    threshold_values = [115, 150, 127, 90, 120, 130, 135, 145, 155, 160, 180]
    for threshold in threshold_values:
        # List the folder once so the total and the iteration agree.
        filenames = os.listdir(path)
        total = len(filenames)
        errorCount = 0
        for index, filename in enumerate(filenames):
            right_str = filename.split("_")[0]
            print('%s / %s' % (index + 1, total), filename + "->" + right_str)
            src_path = os.path.join(path, filename)
            yzm = cv2.imread(src_path)  # read the image
            if yzm is None:
                # cv2.imread returns None for files it cannot decode; count
                # the file as a failure instead of crashing in cvtColor.
                errorCount += 1
                continue
            yzm = cv2.cvtColor(yzm, cv2.COLOR_BGR2GRAY)  # convert to grayscale
            # Binarize: pixels above `threshold` become 255 (the maximum
            # value), all others become 0.
            _, yzm = cv2.threshold(yzm, threshold, 255, cv2.THRESH_BINARY)
            # Denoise with a morphological close.
            yzm = cv2.morphologyEx(yzm, cv2.MORPH_CLOSE, np.ones(shape=(1, 1)))
            # Save the processed image and read it back as bytes for the SDK.
            cv2.imwrite('new_img.png', yzm)
            with open('new_img.png', 'rb') as processed:
                img = processed.read()
            text = sdk.predict(image_bytes=img)

            operands = _parse_operands(text)
            if operands is None:
                errorCount += 1
                continue
            a1, a2 = operands
            print('%s / %s' % (index + 1, total), threshold, ' 正常code: ', text)
            print(a1, a2, type(a1), type(a2))
            _id = hashlib.md5(str(datetime.datetime.now()).encode('utf-8')).hexdigest()
            # '=' and '?' are replaced by 'a' and 'b' in the file name.
            new_path = './img_save/%s_%s.png' % (text.replace('?', 'b').replace('=', 'a'), _id)
            try:
                # A successful rename already removes the source file, so no
                # separate os.remove is needed afterwards.
                os.rename(src_path, new_path)
            except OSError as e:
                print('failed to move %s: %s' % (src_path, e))
                errorCount += 1
        print('运行结束：threshold: %s  共 %s 个, 成功：%s ， 失败：%s' % (threshold, total, total - errorCount, errorCount))

I therefore use this recognition pass to pick out the CAPTCHAs that were read as valid expressions, and then use those labelled images as a training set.

After training with this, the recognition rate is more than 95%.

Original: https://blog.csdn.net/huagangwang/article/details/125658421
Author: lewis@110
Title: 算数验证码识别（运算符为中文：加、减、乘）

