danjamker · July 2, 2014 16:14
diff --git a/LineClean b/LineClean
        tmp = WhitespaceTokenizer().tokenize(line)

        t = []

        #concatenate single letters together, e.g. "N A M E" would become "NAME"
        con = ""
        for word in tmp:
            if len(word) == 1:
                con += word
            else:
                if len(con) > 0:
                    t.append(con)
                    con = ""

                if len(word) > 1:
                    t.append(word)

        #if a word contains a hyphen and the word without the hyphen is in the dictionary, it is replaced by the unhyphenated form.
        for word in tmp:
            if "-" in word:
                if self.wordsfile.__contains__(word.replace("-","")):
                    word = word.replace("-","")

        #If the word ends with a hyphen, check whether joining it to the next word without the hyphen gives a real word; if not, the two are joined with the hyphen kept.
        tmp2 = []
        ttt = False
        for prev, item, next in self.neighborhood(tmp):
            if ttt == False:
                if item[-1] == "-":
                    if self.wordsfile.__contains__(item[0:-1]+next):
                        tmp2 += item[0:-1]+next
                        ttt = True
                    else:
                        tmp2 += item+next
                        ttt = True
                else:
                    tmp2.append(word)
            else:
                ttt = False

        tmp3 = []
        for t in tmp2:
            for prev, item, next in self.neighborhood(re.findall(r"[\w']+", t)):
                if self.wordsfile.__contains__(item+next):
                    tmp3.append(item+next)
                elif self.wordsfile.__contains__(item):
                    tmp3.append(item)

                print item

        #joins all words back into a string and removes all punctuation from the string.
        exclude = set(string.punctuation)
        return ''.join(ch for ch in "".join(tmp3) if ch not in exclude)
	tmp = WhitespaceTokenizer().tokenize(line)

	t = []

	#concatenate single letters together, e.g. "N A M E" would become "NAME"
	con = ""
	for word in tmp:
	if len(word) == 1:
	con += word
	else:
	if len(con) > 0:
	t.append(con)
	con = ""

	if len(word) > 1:
	t.append(word)

	#if a word contains a hyphen and the word without the hyphen is in the dictionary, it is replaced by the unhyphenated form.
	for word in tmp:
	if "-" in word:
	if self.wordsfile.__contains__(word.replace("-","")):
	word = word.replace("-","")

	#If the word ends with a hyphen, check whether joining it to the next word without the hyphen gives a real word; if not, the two are joined with the hyphen kept.
	tmp2 = []
	ttt = False
	for prev, item, next in self.neighborhood(tmp):
	if ttt == False:
	if item[-1] == "-":
	if self.wordsfile.__contains__(item[0:-1]+next):
	tmp2 += item[0:-1]+next
	ttt = True
	else:
	tmp2 += item+next
	ttt = True
	else:
	tmp2.append(word)
	else:
	ttt = False

	tmp3 = []
	for t in tmp2:
	for prev, item, next in self.neighborhood(re.findall(r"[\w']+", t)):
	if self.wordsfile.__contains__(item+next):
	tmp3.append(item+next)
	elif self.wordsfile.__contains__(item):
	tmp3.append(item)

	print item

	#joins all words back into a string and removes all punctuation from the string.
	exclude = set(string.punctuation)
	return ''.join(ch for ch in "".join(tmp3) if ch not in exclude)