Created
May 8, 2013 01:02
-
-
Save jamiees2/5537441 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| # Enter your code here. Read input from STDIN. Print output to STDOUT | |
| def read_words(): | |
| with open('words.txt') as f: | |
| return [line.strip().lower() for line in f] | |
| def strip_chars(word): | |
| word = word.lower().strip() | |
| if word.startswith('#'): | |
| return word[1:] | |
| if word.startswith('www'): | |
| word = word[4:] | |
| return word[:word.find('.')] | |
| def is_number(s): | |
| try: | |
| int(s) | |
| return True | |
| except ValueError: | |
| return False | |
| def make_trie(words): | |
| root = dict() | |
| for word in words: | |
| current_dict = root | |
| for letter in word: | |
| current_dict = current_dict.setdefault(letter, {}) | |
| current_dict = current_dict.setdefault(_end, _end) | |
| return root | |
| def in_trie(trie, word): | |
| current_dict = trie | |
| for letter in word: | |
| if letter in current_dict: | |
| current_dict = current_dict[letter] | |
| else: | |
| return False | |
| else: | |
| if _end in current_dict: | |
| return True | |
| else: | |
| return False | |
| def possibles(s,index=0,token=''): | |
| if index >= len(s): | |
| return [] | |
| token += s[index] | |
| index += 1 | |
| while not in_trie(trie,token) and index < len(s): | |
| token += s[index] | |
| index += 1 | |
| if not in_trie(trie,token): | |
| return [] | |
| return [token] + possibles(s,index,token) | |
| def tokenize(s): | |
| if len(s) == 0: | |
| return [] | |
| if is_number(s[0]): | |
| i = 1 | |
| token = s[0] | |
| while i < len(s) and is_number(token + s[i]): | |
| token += s[i] | |
| i += 1 | |
| if i < len(s): | |
| return [token] + tokenize(s[i:]) | |
| else: | |
| return [token] | |
| poss = possibles(s) | |
| if not poss: | |
| return [] | |
| largest = poss[-1] | |
| poss.pop() | |
| inp = s[len(largest):] | |
| if len(inp) == 0: | |
| return [largest] | |
| tokenized = tokenize(inp) | |
| while not tokenized and len(poss) > 0: | |
| largest = poss[-1] | |
| poss.pop() | |
| inp = s[len(largest):] | |
| tokenized = tokenize(inp) | |
| if not tokenized: | |
| return [] | |
| return [largest] + tokenized | |
| #words = None | |
| #read_words() | |
| _end = '__end__' | |
| words = read_words() | |
| trie = make_trie(words) | |
| #print tokenize('1234word') | |
| #exit() | |
| n = input() | |
| for x in xrange(n): | |
| word = strip_chars(raw_input()) | |
| print ' '.join(tokenize(word)) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment