itsrifat · August 29, 2015 14:12
diff --git a/captcha_main.py b/captcha_main.py
 # Example driver: train a per-character captcha classifier.
 # LogisticRegression is only an example; any scikit-learn style classifier
 # exposing fit(X, y) can be substituted.
 from sklearn.linear_model import LogisticRegression

 # Read the raw training data as a list of (image, label) pairs.
 # NOTE(review): `directory`, `readData` and `prepareTrainingData` are not
 # defined or imported in this snippet -- presumably they come from
 # read_data.py / prep_data.py and the caller sets `directory`; confirm.
 data = readData(directory)
 # Turn the raw pairs into a (samples, labels) tuple with one entry per
 # captcha character.
 trainingdata =  prepareTrainingData(data)

 # Create the classifier and fit it: index 0 holds the flattened character
 # images, index 1 the matching label characters.
 clf = LogisticRegression()
 clf.fit(trainingdata[0], trainingdata[1])

 # The fitted classifier `clf` can now be used to predict the characters of
 # a new captcha.
diff --git a/prep_data.py b/prep_data.py
 def prepareTrainingData(data):
    '''
    Cuts every captcha image into five character samples.

    data is an iterable of items whose element 0 is a 2-D numpy array
    (the image) and whose element 1 is the label, a sequence with at least
    five characters. For each image, five fixed windows (rows 7..20, eight
    columns wide, the first starting at column 6, one column apart) are
    extracted, every pixel brighter than the threshold is set to 255, and
    the window is flattened.

    Returns a tuple (train_x, train_y): train_x is a list of flattened
    character arrays, train_y the list of matching label characters.
    The input images are left unmodified.

    Raises IndexError if a label has fewer than five characters.
    '''
    # pixels brighter than this are background and get turned to white
    threshold = 100
    # fixed layout of the characters inside a captcha image
    startLocX = 7
    endLocX = 21
    firstLocY = 6
    increment = 8
    charCount = 5

    train_x = []
    train_y = []
    for d in data:
        img = d[0]
        output = list(d[1])
        #cut the image into 5 pieces !
        for i in range(charCount):
            # consecutive characters are separated by a one column gap
            startLocY = firstLocY + i * (increment + 1)
            # slicing returns a view of img; copy it so the thresholding
            # below does not overwrite the caller's image
            char = img[startLocX:endLocX, startLocY:startLocY + increment].copy()
            #remove the background by thresholding
            char[char > threshold] = 255
            train_x.append(char.ravel())
            train_y.append(output[i])
    return (train_x, train_y)
diff --git a/read_data.py b/read_data.py
 import os
 import numpy as np

 def readImageFile(imgLoc):
    '''
    Reads one image file from imgLoc and returns it as a numpy uint8 array
    with one row per non-blank line of the file.

    The first line of the file is skipped (presumably a header -- confirm
    against the data set). Every following line holds whitespace separated
    pixels; each pixel is a comma separated list of integer channel values
    and is reduced to the integer mean of those values.

    Blank lines, repeated spaces and trailing whitespace are ignored.
    Raises ValueError if a channel value is not an integer.
    '''
    pixels = []
    with open(imgLoc) as f:
        # skip the first line; the default keeps an empty file from leaking
        # StopIteration to the caller
        next(f, None)
        for line in f:
            # split() without an argument drops the newline and never
            # yields empty tokens, unlike split(" ")
            tokens = line.split()
            if not tokens:
                continue
            row = []
            for p in tokens:
                vals = [int(val) for val in p.split(",")]
                # integer mean; matches the truncation the uint8 cast applies
                row.append(sum(vals) // len(vals))
            pixels.append(row)
    return np.array(pixels, dtype=np.uint8)

 def readData(directory):
    '''
    Reads all captcha files and their labels from directory.

    Every ".txt" file found under <directory>/input (sub directories
    included) is read with readImageFile. Its label is the first line of
    the file <directory>/output/output<id>.txt, where <id> is the input
    file name with its first five characters and its extension removed,
    e.g. (assuming the inputs carry an "input" prefix):
    -directory
        -input
            -input00.txt
            -input01.txt
            -...
        -output
            -output00.txt
            -output01.txt
            -...

    directory may be given with or without a trailing path separator.
    Files inside one directory are visited in sorted name order.

    Returns a list of (image array, label string) tuples.
    '''
    # join the paths instead of concatenating strings so that a missing
    # trailing separator on directory does not produce "<directory>input/"
    inputDir = os.path.join(directory, "input")
    outputDir = os.path.join(directory, "output")
    images = []
    for root, dirs, files in os.walk(inputDir, topdown=False):
        # os.walk lists files in arbitrary order; sort for reproducibility
        for name in sorted(fi for fi in files if fi.endswith(".txt")):
            # drop the five character prefix and the extension to get the id
            fileId = name[5:].split(".")[0]
            outputfileName = "output" + fileId + ".txt"
            imgdata = readImageFile(os.path.join(root, name))
            with open(os.path.join(outputDir, outputfileName)) as outf:
                output = next(outf).strip()
            images.append((imgdata, output))
    return images
	# Example driver: train a per-character captcha classifier.
	# LogisticRegression is only an example; any scikit-learn style classifier
	# exposing fit(X, y) can be substituted.
	from sklearn.linear_model import LogisticRegression

	# Read the raw training data as a list of (image, label) pairs.
	# NOTE(review): `directory` is not defined in this snippet -- presumably
	# the caller sets it before this code runs; confirm.
	data = readData(directory)
	# Turn the raw pairs into a (samples, labels) tuple with one entry per
	# captcha character.
	trainingdata = prepareTrainingData(data)

	# Create the classifier and fit it: index 0 holds the flattened character
	# images, index 1 the matching label characters.
	clf = LogisticRegression()
	clf.fit(trainingdata[0], trainingdata[1])

	# The fitted classifier `clf` can now be used to predict the characters of
	# a new captcha.
	def prepareTrainingData(data):
	    '''
	    Cuts every captcha image into five character samples.

	    data is an iterable of items whose element 0 is a 2-D numpy array
	    (the image) and whose element 1 is the label, a sequence with at
	    least five characters. For each image, five fixed windows (rows
	    7..20, eight columns wide, the first starting at column 6, one column
	    apart) are extracted, every pixel brighter than the threshold is set
	    to 255, and the window is flattened.

	    Returns a tuple (train_x, train_y): train_x is a list of flattened
	    character arrays, train_y the list of matching label characters.
	    The input images are left unmodified.

	    Raises IndexError if a label has fewer than five characters.
	    '''
	    # pixels brighter than this are background and get turned to white
	    threshold = 100
	    # fixed layout of the characters inside a captcha image
	    startLocX = 7
	    endLocX = 21
	    firstLocY = 6
	    increment = 8
	    charCount = 5

	    train_x = []
	    train_y = []
	    for d in data:
	        img = d[0]
	        output = list(d[1])
	        #cut the image into 5 pieces !
	        for i in range(charCount):
	            # consecutive characters are separated by a one column gap
	            startLocY = firstLocY + i * (increment + 1)
	            # slicing returns a view of img; copy it so the thresholding
	            # below does not overwrite the caller's image
	            char = img[startLocX:endLocX, startLocY:startLocY + increment].copy()
	            #remove the background by thresholding
	            char[char > threshold] = 255
	            train_x.append(char.ravel())
	            train_y.append(output[i])
	    return (train_x, train_y)
	import os
	import numpy as np

	def readImageFile(imgLoc):
	    '''
	    Reads one image file from imgLoc and returns it as a numpy uint8 array
	    with one row per non-blank line of the file.

	    The first line of the file is skipped (presumably a header -- confirm
	    against the data set). Every following line holds whitespace separated
	    pixels; each pixel is a comma separated list of integer channel values
	    and is reduced to the integer mean of those values.

	    Blank lines, repeated spaces and trailing whitespace are ignored.
	    Raises ValueError if a channel value is not an integer.
	    '''
	    pixels = []
	    with open(imgLoc) as f:
	        # skip the first line; the default keeps an empty file from
	        # leaking StopIteration to the caller
	        next(f, None)
	        for line in f:
	            # split() without an argument drops the newline and never
	            # yields empty tokens, unlike split(" ")
	            tokens = line.split()
	            if not tokens:
	                continue
	            row = []
	            for p in tokens:
	                vals = [int(val) for val in p.split(",")]
	                # integer mean; matches the truncation the uint8 cast applies
	                row.append(sum(vals) // len(vals))
	            pixels.append(row)
	    return np.array(pixels, dtype=np.uint8)

	def readData(directory):
	    '''
	    Reads all captcha files and their labels from directory.

	    Every ".txt" file found under <directory>/input (sub directories
	    included) is read with readImageFile. Its label is the first line of
	    the file <directory>/output/output<id>.txt, where <id> is the input
	    file name with its first five characters and its extension removed,
	    e.g. (assuming the inputs carry an "input" prefix):
	    -directory
	        -input
	            -input00.txt
	            -input01.txt
	            -...
	        -output
	            -output00.txt
	            -output01.txt
	            -...

	    directory may be given with or without a trailing path separator.
	    Files inside one directory are visited in sorted name order.

	    Returns a list of (image array, label string) tuples.
	    '''
	    # join the paths instead of concatenating strings so that a missing
	    # trailing separator on directory does not produce "<directory>input/"
	    inputDir = os.path.join(directory, "input")
	    outputDir = os.path.join(directory, "output")
	    images = []
	    for root, dirs, files in os.walk(inputDir, topdown=False):
	        # os.walk lists files in arbitrary order; sort for reproducibility
	        for name in sorted(fi for fi in files if fi.endswith(".txt")):
	            # drop the five character prefix and the extension to get the id
	            fileId = name[5:].split(".")[0]
	            outputfileName = "output" + fileId + ".txt"
	            imgdata = readImageFile(os.path.join(root, name))
	            with open(os.path.join(outputDir, outputfileName)) as outf:
	                output = next(outf).strip()
	            images.append((imgdata, output))
	    return images