Created
July 2, 2014 16:14
-
-
Save danjamker/ffb0e57470bdf64b8f3e to your computer and use it in GitHub Desktop.
Line Clean Function
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Body of a line-cleaning routine: splits `line` into tokens, runs several clean-up
# passes over them, and returns the surviving text with punctuation removed.
# NOTE(review): the enclosing `def` is not part of this snippet and the original
# indentation has been lost, so any nesting described below is inferred -- confirm
# against the real source. The trailing `| |` on each line looks like extraction
# residue, not Python.
# Names used here but defined elsewhere: `line`, `self.wordsfile`,
# `self.neighborhood`, `WhitespaceTokenizer`, `re`, `string`.
tmp = WhitespaceTokenizer().tokenize(line) | |
# NOTE(review): `t` is filled by the loop below but never read afterwards; the later
# passes iterate `tmp`, and `t` is rebound as a loop variable further down.
t = [] | |
# Concatenate runs of single-letter tokens, e.g. "N A M E" becomes "NAME".
con = "" | |
for word in tmp: | |
if len(word) == 1: | |
con += word | |
else: | |
# Flush any accumulated single letters before handling a longer token.
if len(con) > 0: | |
t.append(con) | |
con = "" | |
if len(word) > 1: | |
t.append(word) | |
# NOTE(review): single letters still held in `con` when the loop ends are never
# appended to `t`.
# If a word contains a hyphen and the word without the hyphen is in
# self.wordsfile, the hyphen-free form is meant to replace it.
# NOTE(review): the assignment only rebinds the loop variable `word`; `tmp` itself
# is not modified, so this pass does not affect the later steps.
for word in tmp: | |
if "-" in word: | |
if self.wordsfile.__contains__(word.replace("-","")): | |
word = word.replace("-","") | |
# If a token ends with a hyphen, check whether it joined with the following token
# (hyphen removed) is in self.wordsfile; if so use the joined form, otherwise join
# the two tokens with the hyphen kept.
tmp2 = [] | |
# `ttt` is set after a token has been joined with its successor; presumably it makes
# the next iteration skip that successor -- confirm against the original nesting.
ttt = False | |
# NOTE(review): `next` shadows the builtin. If self.neighborhood yields None as the
# `next` of the last item (not visible here), `item+next` below would raise TypeError.
for prev, item, next in self.neighborhood(tmp): | |
if ttt == False: | |
if item[-1] == "-": | |
if self.wordsfile.__contains__(item[0:-1]+next): | |
# NOTE(review): `+=` with a string extends the list with its individual
# characters; `append` was probably intended (same on the branch below).
tmp2 += item[0:-1]+next | |
ttt = True | |
else: | |
tmp2 += item+next | |
ttt = True | |
else: | |
# NOTE(review): `word` is left over from the earlier loops, not the current
# `item`; `item` was probably intended.
tmp2.append(word) | |
else: | |
ttt = False | |
# Split every entry of tmp2 into runs of word characters/apostrophes and keep only
# pieces found in self.wordsfile, preferring the piece joined with its successor.
tmp3 = [] | |
for t in tmp2: | |
for prev, item, next in self.neighborhood(re.findall(r"[\w']+", t)): | |
# NOTE(review): there is no skip flag in this loop, so after `item+next` is kept
# the successor is still visited on the following iteration.
if self.wordsfile.__contains__(item+next): | |
tmp3.append(item+next) | |
elif self.wordsfile.__contains__(item): | |
tmp3.append(item) | |
# NOTE(review): Python 2 print statement, looks like leftover debug output; its
# nesting level cannot be determined from this snippet.
print item | |
# Join all kept words back into one string and remove all punctuation from it.
# NOTE(review): the words are joined with an empty separator, so they are run
# together without spaces.
exclude = set(string.punctuation) | |
return ''.join(ch for ch in "".join(tmp3) if ch not in exclude) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment