Created
March 20, 2017 07:59
-
-
Save macabeus/ef56c3658a04e6af479d6f7af23b515b to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Download the captcha images: https://www.dropbox.com/s/6dbo7k4vuqlqiqr/captchar.zip
# Blog post about this code (in Brazilian Portuguese): http://localhost:8000/output/estudo-de-caso-quebrando-um-captcha.html
import copy | |
import numpy as np | |
import cv2 as cv | |
import pyslibtesseract | |
# OCR settings used by every recognition call in this file: single-character
# page segmentation, with the whitelist limited to the 26 uppercase letters.
tesseract_config = pyslibtesseract.TesseractConfig(
    psm=pyslibtesseract.PageSegMode.PSM_SINGLE_CHAR,
)
tesseract_config.add_variable(
    'tessedit_char_whitelist',
    'QWERTYUIOPASDFGHJKLZXCVBNM',
)
class Steps:
    """Ordered record of intermediate images that can be shown as one montage.

    Every entry of ``self.steps`` is a tuple ``(x, index, image)``: ``x`` is
    the montage column chosen by the caller, ``index`` is the insertion order
    and ``image`` is a copy of the image taken when ``append`` was called.
    """

    def __init__(self):
        self.steps = []

    def append(self, current, x):
        """Store a snapshot (copy) of ``current`` in montage column ``x``."""
        # The insertion index sits between the column and the image so that
        # sorted()/max() on these tuples are always decided before the
        # comparison reaches the image, which would raise an error.
        self.steps.append((x, len(self), copy.copy(current)))

    def _compose(self, height, width, padding=5):
        """Build the montage canvas (without the caption) using NumPy only.

        Images sharing a column are stacked top to bottom in insertion order;
        columns are ``padding`` pixels apart. An extra strip of ``height``
        rows is left empty at the bottom for the caption.

        Args:
            height: height in pixels of every stored image.
            width: width in pixels of every stored image.
            padding: horizontal gap in pixels between two columns.

        Returns:
            A ``uint8`` array of shape
            ``(height * tallest_column + height, (width + padding) * max_column + width, 3)``.
        """
        from collections import Counter

        columns = [step[0] for step in self.steps]
        # With no recorded steps the canvas is just the caption strip.
        tallest_column = Counter(columns).most_common(1)[0][1] if columns else 0
        last_column = max(columns) if columns else 0

        # Images are placed at ``column * (width + padding)``, so the canvas
        # must reserve one padded slot per column index up to the largest one,
        # even when some column indices in between are unused.
        canvas = np.zeros((height * tallest_column + height,
                           (width + padding) * last_column + width,
                           3),
                          np.uint8)

        rows_used = Counter()
        for column, _, image in sorted(self.steps):
            top = rows_used[column] * height
            left = column * (width + padding)
            canvas[top:top + height, left:left + width] = image
            rows_used[column] += 1
        return canvas

    def show(self, height, width, text='?', title='foo'):
        """Display all recorded steps in one window with ``text`` as caption.

        Args:
            height: height in pixels of every stored image.
            width: width in pixels of every stored image.
            text: caption drawn in the strip below the images.
            title: name of the OpenCV window.

        The call returns after ``cv.waitKey()`` returns.
        """
        canvas = self._compose(height, width)
        # The caption baseline sits 20 pixels above the bottom of the canvas.
        cv.putText(canvas, text, (10, canvas.shape[0] - 20), cv.FONT_HERSHEY_SIMPLEX, 2, (255, 255, 255))
        cv.imshow(title, canvas)
        cv.waitKey()

    def __getitem__(self, item):
        return self.steps[item]

    def __len__(self):
        return len(self.steps)
def read_captcha(file_name):
    """Clean up a captcha image, OCR its letters and display every step.

    The image is cleaned (top strip cleared, morphological closing,
    binarisation, border cleared, small contours erased). Each large contour
    is then cropped, written to ``letter<N>.png`` (relative path) and passed
    to ``rotate`` for recognition. Finally the intermediate images and the
    recognised text are shown with ``Steps.show``.

    Args:
        file_name: path of the captcha image to read.

    Returns:
        The recognised characters ordered by horizontal position, with ``'?'``
        standing in for any letter that could not be recognised.

    Raises:
        IOError: if OpenCV cannot load ``file_name``.
    """
    print('reading...')
    img_start = cv.imread(file_name)
    if img_start is None:
        # cv.imread signals an unreadable file by returning None.
        raise IOError('could not read image: ' + str(file_name))
    height, width = img_start.shape[:2]

    # Clear the top of the image.
    img_start[0:15, :] = 255

    steps = Steps()
    steps.append(img_start, 0)
    img = copy.copy(img_start)

    ###
    # Morphological closing, to remove the noise.
    img = cv.morphologyEx(img, cv.MORPH_CLOSE, np.ones((3, 3), np.uint8))
    steps.append(img, 0)

    ###
    # Convert to grayscale (and back to three channels), then binarise:
    # pixels with a value of 230 or less become black, the rest white.
    img = cv.cvtColor(img, cv.COLOR_BGR2GRAY)
    img = cv.cvtColor(img, cv.COLOR_GRAY2BGR)
    dark = (img <= 230).any(axis=2)
    img[dark] = 0
    img[~dark] = 255
    steps.append(img, 0)

    ###
    # Detach letters that touch the border: whiten a 3-pixel frame on the
    # bottom, left and right sides.
    img[-3:, :] = 255
    img[:, 0:3] = 255
    img[:, -3:] = 255

    ###
    # Erase small specks, which can confuse tesseract. Large parts are left
    # alone because they may be a piece of a letter that tesseract needs;
    # small parts usually hurt more than they help.
    # cv.findContours returns (image, contours, hierarchy) on OpenCV 3 but
    # (contours, hierarchy) on OpenCV 2 and 4; the contours are always the
    # second-to-last element.
    contours = cv.findContours(cv.cvtColor(img, cv.COLOR_BGR2GRAY), cv.RETR_TREE, cv.CHAIN_APPROX_SIMPLE)[-2]
    for contour in contours:
        _, radius = cv.minEnclosingCircle(contour)
        if radius <= 6:
            cv.drawContours(img, [contour], -1, (255, 255, 255), -1)
    steps.append(img, 0)

    ###
    # Isolate each letter.
    letters = []
    img_circulada = copy.copy(img)
    for contour in contours:
        (x, y), radius = cv.minEnclosingCircle(contour)
        # NOTE(review): as written this condition reduces to ``radius > 30``;
        # a bounded range (e.g. ``radius < 30``) may have been intended.
        # Kept unchanged - confirm against sample captchas.
        if radius > 16 and radius > 30:
            print(radius)
            center = (int(x), int(y))
            radius_int = int(radius + 15)
            # Slice bounds must be integers; clamp the top-left corner so the
            # crop never starts outside the image.
            x_min = max(center[0] - radius_int, 0)
            y_min = max(center[1] - radius_int, 0)
            letters.append((x, img[y_min:center[1] + radius_int, x_min:center[0] + radius_int]))
            cv.circle(img_circulada, center, radius_int + 1, (0, 255, 0), 1)
    steps.append(img_circulada, 0)

    # Read the isolated letters.
    letters_out = []
    for index, (letter_x, current_letter) in enumerate(letters):
        # Save the crop so it can be handed to tesseract through ``rotate``.
        letter_file = 'letter' + str(index) + '.png'
        cv.imwrite(letter_file, current_letter)
        # First element of the result of ``rotate`` is the recognised character.
        new_char = rotate(letter_file)[0]
        if len(new_char) and new_char[0] != ' ':
            shown = new_char[0]
        else:
            # Unrecognised letters are displayed as '?'.
            shown = '?'
        letters_out.append((letter_x, shown))

    # Contours are not found in reading order; sort by horizontal position.
    letters_out = sorted(letters_out)
    recognised = ''.join(char for _, char in letters_out)
    steps.show(height, width, text=recognised, title=file_name)
    return recognised
def _read_rotated(img, file_name, step):
    """Rotate ``img`` by ``10 * step`` degrees, clean its border and OCR it.

    The rotated image is flood-filled with white from every border pixel,
    written next to ``file_name`` with ``str(step)`` prefixed to the base
    name, and then read with tesseract using the module-level
    ``tesseract_config``.

    Returns:
        The first entry returned by
        ``LibTesseract.read_and_get_confidence_char`` (used by the caller as
        ``entry[0]`` = character, ``entry[1]`` = confidence), or ``None`` when
        tesseract returns nothing.
    """
    import os

    rows, cols = img.shape[:2]
    M = cv.getRotationMatrix2D((cols / 2, rows / 2), 10 * step, 1)
    dst = cv.warpAffine(img, M, (cols, rows))

    letter_height, letter_width = dst.shape[:2]
    # floodFill requires a mask two pixels larger than the image in each
    # dimension; the same mask is shared by all fills below.
    mask = np.zeros((letter_height + 2, letter_width + 2), np.uint8)
    # Whiten everything connected to the border - presumably to remove the
    # dark corners introduced by the rotation (TODO confirm).
    for h in range(letter_height):
        cv.floodFill(dst, mask, (letter_width - 1, h), (255, 255, 255), upDiff=(200, 200, 200))
        cv.floodFill(dst, mask, (0, h), (255, 255, 255), upDiff=(200, 200, 200))
    for w in range(letter_width):
        cv.floodFill(dst, mask, (w, 0), (255, 255, 255), upDiff=(200, 200, 200))
        cv.floodFill(dst, mask, (w, letter_height - 1), (255, 255, 255), upDiff=(200, 200, 200))

    # Prefix the base name only, so a ``file_name`` with a directory part
    # still yields a valid path; for bare file names this is exactly
    # ``str(step) + file_name``.
    rotated_file = os.path.join(os.path.dirname(file_name),
                                str(step) + os.path.basename(file_name))
    cv.imwrite(rotated_file, dst)

    results = pyslibtesseract.LibTesseract.read_and_get_confidence_char(tesseract_config, rotated_file)
    if len(results) == 0:
        return None
    return results[0]


def rotate(file_name):
    """Recognise the single letter in ``file_name``, trying several rotations.

    A first pass tries -10, 0 and +10 degrees. If the best confidence is below
    60, or the last reading of that pass was more than 3 points away from the
    best confidence at that moment, a second pass covers -50 to +50 degrees in
    10-degree steps.

    Args:
        file_name: path of the image containing one letter.

    Returns:
        A two-element list ``[char, confidence]`` with the most confident
        non-space reading, or ``[' ', 0]`` if nothing was recognised.

    Raises:
        IOError: if OpenCV cannot load ``file_name``.
    """
    img = cv.imread(file_name)
    if img is None:
        # cv.imread signals an unreadable file by returning None.
        raise IOError('could not read image: ' + str(file_name))

    most_confidence = [' ', 0]
    again = False

    first_pass = range(-1, 2)
    for step in first_pass:
        new_char = _read_rotated(img, file_name, step)
        if new_char is None:
            continue
        # Only the last reading of this pass decides ``again``: it is set when
        # that reading differs by more than 3 from the best confidence so far.
        again = not (most_confidence[1] - 3 <= new_char[1] <= most_confidence[1] + 3)
        if new_char[0] != ' ' and most_confidence[1] < new_char[1]:
            most_confidence[0] = new_char[0]
            most_confidence[1] = new_char[1]

    if most_confidence[1] < 60 or again:
        for step in range(-5, 6):
            if step in first_pass:
                # Already read in the first pass; reading the same rotation
                # again could not beat the current best (assuming the OCR is
                # deterministic), so skip the repeated work.
                continue
            new_char = _read_rotated(img, file_name, step)
            if new_char is None:
                continue
            if new_char[0] != ' ' and most_confidence[1] < new_char[1]:
                most_confidence[0] = new_char[0]
                most_confidence[1] = new_char[1]

    return most_confidence
# Run the recogniser over the sample files captcha0.jpg ... captcha30.jpg.
for captcha_number in range(31):
    captcha_file = 'captcha' + str(captcha_number) + '.jpg'
    read_captcha(captcha_file)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment