Last active
December 5, 2023 03:08
-
-
Save evi1m0/fbbdb1ba7c66cc4e1bb2 to your computer and use it in GitHub Desktop.
12306 新版验证码识别脚本 (已失效)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# coding=utf8 | |
# author=evi1m0 | |
# website=linux.im | |
''' | |
12306 Captcha Picture: | |
author: Evi1m0@20150316 | |
1. Download Captcha | |
2. Pic Convert Text | |
3. Return result | |
''' | |
import re | |
import time | |
import json | |
import urllib | |
import urllib2 | |
import requests | |
from PIL import Image | |
def downloadImg():
    """Download one 12306 login captcha and store it under ./12306_pic/.

    The picture is written to ``./12306_pic/<unix timestamp>.jpg``. The
    target directory is created first when it does not exist yet, so a
    fresh checkout no longer fails with ``IOError: No such file or
    directory``.

    Returns:
        int: the Unix timestamp that was used as the file's base name.

    Raises:
        requests.RequestException: if the single retry fails as well.
    """
    import os  # local import: only this function needs it

    pic_dir = "./12306_pic"
    pic_file = int(time.time())
    pic_url = "https://kyfw.12306.cn/otn/passcodeNew/getPassCodeNew?module=login&rand=sjrand"
    print('[+] Download Picture: {}'.format(pic_url))
    # NOTE(review): verify=False disables TLS certificate verification.
    try:
        resp = requests.get(pic_url, verify=False, timeout=5)
    except requests.RequestException:
        # Retry once; only network-level failures are retried, so Ctrl-C
        # is no longer swallowed by the handler.
        resp = requests.get(pic_url, verify=False, timeout=3)
    if not os.path.isdir(pic_dir):
        os.makedirs(pic_dir)
    with open("%s/%s.jpg" % (pic_dir, pic_file), 'wb') as fp:
        fp.write(resp.content)
    return pic_file
def imgCut():
    """Download a captcha and save a cropped strip of it next to the original.

    The full picture comes from ``downloadImg()``; the strip is the fixed
    box (120, 0, 290, 25) of that picture, saved as ``<timestamp>_text.jpg``.

    Returns:
        tuple: (path of the full captcha picture, path of the cropped strip).
    """
    stamp = downloadImg()
    full_image_path = "./12306_pic/%s.jpg" % stamp
    text_strip_path = './12306_pic/%s_text.jpg' % stamp
    # (left, upper, right, lower) of the region cut out of the captcha.
    text_box = (120, 0, 290, 25)
    Image.open(full_image_path).crop(text_box).save(text_strip_path)
    print('[*] Picture Text Picture: {}'.format(text_strip_path))
    return full_image_path, text_strip_path
def ocrApi(filename):
    """Upload a picture to the docs88 conversion service and fetch its text.

    Args:
        filename: path of the picture to upload.

    Returns:
        The body of the result document. It is returned even when the
        result request did not answer with status 200; in that case
        ``[-] False`` is printed first.
    """
    # Convert the text picture to text.
    upload_pic_url = "http://cn.docs88.com/pdftowordupload2.php"
    headers_fake = {
        'Accept': '*/*',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6',
        'Connection': 'keep-alive',
        'Host': 'cn.docs88.com',
        'Origin': 'http://cn.docs88.com',
        'User-Agent': 'Mozilla/5.0 (KHTML, like Gecko) Chrome/41.0.2272.89',
        'X-Requested-With': 'ShockwaveFlash/17.0.0.134',
    }
    filename_tmp = filename.split('/')[-1]
    para = {
        'Filename': filename_tmp,
        'sourcename': filename_tmp,
        'sourcelanguage': 'cn',
        'desttype': 'txt',
        'Upload': 'Submit Query',
    }
    # Keep the upload handle inside a context manager so it is always closed.
    with open(filename, 'rb') as pic_fp:
        upload_pic = requests.post(upload_pic_url, data=para,
                                   files={"Filedata": pic_fp},
                                   headers=headers_fake)
    # Pause before requesting the result document.
    time.sleep(2)
    # The first three bytes of the upload response are skipped
    # (presumably a UTF-8 BOM -- TODO confirm against the service).
    text_result_url = 'http://cn.docs88.com/' + upload_pic.content[3:]
    text_result = requests.get(text_result_url)
    if text_result.status_code == 200:
        print('[*] Text: {}'.format(text_result.content))
    else:
        print('[-] False')
    return text_result.content
''' | |
baidu stu | |
author: andelf | |
''' | |
def baidu_stu_html_extract(html):
    """Extract the keyword list embedded in a Baidu image-search result page.

    The page is searched for the first ``keywords:'...'`` fragment. Its
    payload is a JSON array with escaped quotes (``\\x22``) and doubled
    backslashes; both escapes are undone before the JSON is parsed.

    Args:
        html: text of the result page.

    Returns:
        The ``keyword`` values joined with ``|``, or ``'[UNKOWN]'`` when the
        fragment is missing, empty, not valid JSON, or holds no usable
        keyword entries.
    """
    match = re.search(r"keywords:'(.*?)'", html)
    if match is None:
        return '[UNKOWN]'
    json_str = match.group(1).replace('\\x22', '"').replace('\\\\', '\\')
    try:
        keywords = [item['keyword'] for item in json.loads(json_str)]
    except (ValueError, KeyError, TypeError):
        # Empty or malformed payloads are treated like "no result"
        # instead of aborting the whole lookup run.
        return '[UNKOWN]'
    return '|'.join(keywords) if keywords else '[UNKOWN]'
def baidu_stu_lookup(im):
    """Run a Baidu image search for ``im`` and return the extracted keywords.

    The image is saved to ``./query_temp_img.png``, posted to
    stu.baidu.com, and the URL returned by that upload is then used for a
    second request whose HTML is handed to ``baidu_stu_html_extract``.

    Relies on a module-level ``UA`` string for the User-Agent header; this
    file only defines it inside its ``__main__`` section.

    Args:
        im: an image object providing ``save(path)``.

    Returns:
        Whatever ``baidu_stu_html_extract`` returns for the result page.
    """
    url = ("http://stu.baidu.com/n/image?fr=html5&needRawImageUrl=true&id="
           "WU_FILE_0&name=233.png&type=image%2Fpng&lastModifiedDate=Mon+Mar"
           "+16+2015+20%3A49%3A11+GMT%2B0800+(CST)&size=")
    im.save("./query_temp_img.png")
    # Read the PNG back through a context manager so the handle is closed.
    with open("./query_temp_img.png", 'rb') as png_fp:
        raw = png_fp.read()
    # The upload URL ends with the payload size in bytes.
    url = url + str(len(raw))
    req = urllib2.Request(url, raw, {'Content-Type': 'image/png', 'User-Agent': UA})
    resp = urllib2.urlopen(req)
    resp_url = resp.read()  # the response body is a bare URL
    url = "http://stu.baidu.com/n/searchpc?queryImageUrl=" + urllib.quote(resp_url)
    req = urllib2.Request(url, headers={'User-Agent': UA})
    resp = urllib2.urlopen(req)
    html = resp.read()
    return baidu_stu_html_extract(html)
def get_sub_img(pic_text_path, x, y):
    """Crop one 67x67 cell out of the picture at ``pic_text_path``.

    Cells are laid out on a grid with a 72-pixel pitch that starts 5 pixels
    from the left edge and 41 pixels from the top edge.

    Args:
        pic_text_path: path of the picture to open.
        x: column index, asserted to be within 0..3.
        y: row index, asserted to be within 0..2.

    Returns:
        The cropped region as returned by ``Image.crop``.
    """
    captcha = Image.open(pic_text_path)
    assert 0 <= x <= 3
    assert 0 <= y <= 2
    cell, gap = 67, 5
    pitch = cell + gap
    x0 = gap + pitch * x
    y0 = 41 + pitch * y
    return captcha.crop((x0, y0, x0 + cell, y0 + cell))
if __name__ == '__main__':
    # Script entry point: download a captcha, OCR its text strip, look up
    # each of the 4x2 sub-pictures on Baidu and list the cells whose
    # keywords share a character with the OCR text.

    # Module-level global read by baidu_stu_lookup().
    UA = "Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.89 Safari/537.36"
    pic_path, pic_text_path = imgCut()
    captcha_text = ocrApi(pic_text_path)
    dict_list = {}
    count = 0
    for y in range(2):
        for x in range(4):
            count += 1
            im2 = get_sub_img(pic_path, x, y)
            result = baidu_stu_lookup(im2)
            dict_list[count] = result
            print(u'%s %s' % ((y, x), result))
    # The original test compared the string itself with 2, which is always
    # true in Python 2; compare its length instead so an empty OCR result
    # reaches the failure branch. The length is in bytes -- presumably
    # "more than two bytes" means at least one UTF-8 encoded CJK character.
    stripped_text = captcha_text.strip()
    if len(stripped_text) > 2:
        print('\n[*] Maybe the result of the:')
        # Decode once and iterate the stripped text, so the characters
        # checked are exactly the ones that were counted.
        captcha_chars = unicode(stripped_text, 'utf8')
        # Walk the cells in index order: each cell is reported at most
        # once and the output order is deterministic.
        for v in sorted(dict_list):
            if any(text in dict_list[v] for text in captcha_chars):
                print('%s --- %s' % (v, dict_list[v]))
    else:
        print('[-] False')
试了下这里给出的 ocr api,成功率很低,现在 12306 文字上会有一些干扰,这个 ocr 应付不了,大概用商用的会好很多
好的开头等于成功的一半
IOError: [Errno 2] No such file or directory: './12306_pic/1461559784.jpg'
mac,先解决了几小时解决库问题,发现运行失败。无法创建文件或者读文件吗?我看你用的也是mac呀
---解决,python新手,我以为跟其他语言一样,创建文件的时候,没有那个路径会创建那个路径,结果自己新建了一个12306文件夹就好了。文字识别率太低,不到1%感觉
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
12306 新版验证码识别脚本:好使不?