Skip to content

Instantly share code, notes, and snippets.

@danjamker
Created July 2, 2014 16:14
Show Gist options
  • Save danjamker/ffb0e57470bdf64b8f3e to your computer and use it in GitHub Desktop.
Save danjamker/ffb0e57470bdf64b8f3e to your computer and use it in GitHub Desktop.
Line Clean Function
# Clean a single line of text: split it on whitespace, try to repair
# letter-spaced and hyphen-split words using the word list in self.wordsfile,
# and return the surviving text with punctuation removed.
# NOTE(review): the enclosing `def` and all indentation are missing from this
# snippet; the nesting described in the comments below is inferred from the
# if/else pairing and should be confirmed against the original source.
tmp = WhitespaceTokenizer().tokenize(line)
t = []
# Concatenate runs of single-letter tokens, e.g. "N A M E" becomes "NAME".
# `con` accumulates the current run; it is flushed into `t` when a longer
# token is met.
# NOTE(review): a run of single letters at the very end of the line is never
# flushed (nothing appends `con` after the loop), and `t` itself is not read
# again -- the later steps iterate `tmp`, and `t` is rebound as a loop
# variable further down. This step therefore appears to have no effect on the
# returned value; confirm whether later steps were meant to use `t`.
con = ""
for word in tmp:
if len(word) == 1:
con += word
else:
if len(con) > 0:
t.append(con)
con = ""
if len(word) > 1:
t.append(word)
# If a word contains a hyphen and the word without the hyphen is in the
# dictionary, replace it with the de-hyphenated form.
# NOTE(review): assigning to `word` only rebinds the loop variable; `tmp` is
# not modified, so the replacement is discarded.
for word in tmp:
if "-" in word:
if self.wordsfile.__contains__(word.replace("-","")):
word = word.replace("-","")
# If a word ends with a hyphen, check whether joining it with the following
# token gives a real word; if so use the joined word, otherwise keep the
# hyphen between the two parts.
# `ttt` is set after a token has been combined with its successor;
# presumably the intent is to skip that successor on the next iteration.
# NOTE(review): self.neighborhood is defined elsewhere; it is assumed to yield
# (previous, current, next) triples -- if `next` is None for the last element,
# `item+next` would raise TypeError. `next` also shadows the builtin.
tmp2 = []
ttt = False
for prev, item, next in self.neighborhood(tmp):
if ttt == False:
if item[-1] == "-":
if self.wordsfile.__contains__(item[0:-1]+next):
# NOTE(review): `list += str` extends the list with the string's individual
# characters, not with the string as one element -- append() was probably
# intended here and in the else branch below.
tmp2 += item[0:-1]+next
ttt = True
else:
tmp2 += item+next
ttt = True
else:
# NOTE(review): `word` here is the leftover variable from the loop above (the
# last token of `tmp`), not the current `item` -- looks like a bug.
tmp2.append(word)
else:
ttt = False
# Split every entry of tmp2 into runs of word characters/apostrophes and keep
# only pieces found in the dictionary, preferring the piece joined with its
# successor when that combination is a known word.
# NOTE(review): when `item+next` is kept, `next` is not skipped, so it may be
# appended again on the following iteration if it is itself in the dictionary.
tmp3 = []
for t in tmp2:
for prev, item, next in self.neighborhood(re.findall(r"[\w']+", t)):
if self.wordsfile.__contains__(item+next):
tmp3.append(item+next)
elif self.wordsfile.__contains__(item):
tmp3.append(item)
# Python 2 print statement; looks like leftover debug output.
print item
# Join all words back into one string and remove all punctuation from it.
# NOTE(review): the words are joined with an empty separator, so the returned
# string contains no spaces between them -- confirm this is intended.
exclude = set(string.punctuation)
return ''.join(ch for ch in "".join(tmp3) if ch not in exclude)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment