Created
May 22, 2014 16:58
-
-
Save yunake/c3ef87455c84b9ac9513 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# -*- coding: utf-8 -*- | |
""" | |
This module provides two functions, transform() & untransform(), and the second | |
one should reverse the output of the first one. Since the transformation is | |
lossy, just provide a list of strings that satisfy the conditions, any one of | |
them might've been the original source and there is no way to know for sure. | |
The transformation is as follows: assume an English-language string as input, | |
in every word shuffle the letters in random order, then remove all whitespace | |
to create one long continuous string that looks like abracadabra, e.g.: | |
"a good cat" -> "adogoatc" or "aoodgcta" | |
If run directly from the terminal, execute the tests. | |
Python 2.5+ and 3.0+ compatible. | |
Copyright (c) 2014, Eugene Yunak <[email protected]> | |
All rights reserved. | |
Redistribution and use in source and binary forms, with or without modification, | |
are permitted provided that the following conditions are met: | |
1. Redistributions of source code must retain the above copyright notice, this | |
list of conditions and the following disclaimer. | |
2. Redistributions in binary form must reproduce the above copyright notice, | |
this list of conditions and the following disclaimer in the documentation and/or | |
other materials provided with the distribution. | |
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND | |
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED | |
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE | |
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR | |
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES | |
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; | |
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON | |
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT | |
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS | |
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |
""" | |
from __future__ import with_statement | |
from random import shuffle | |
from itertools import product | |
import six | |
def transform(string): | |
""" | |
Perform the transformation described above, on the input string. | |
Strip non-alpha characters, make lowercase. | |
Return resulting string. | |
""" | |
# String needs to be all lowercase and alpha+whitespace only. | |
string = string.lower() | |
string = ''.join(filter(lambda c: str.isalpha(c) or str.isspace(c), string)) | |
words_in = string.split() | |
words_out = list() | |
for word in words_in: | |
# random.shuffle only works on a list, does the transformations in-place | |
# and does not return it, so we need to convert to a temporary list | |
word = list(word) | |
shuffle(word) | |
words_out.append(''.join(word)) | |
return ''.join(words_out) | |
def untransform(string): | |
""" | |
Reverse the transformation described above. | |
Input is a string processed via transform(). | |
Assume English language, no non-alpha characters, lowercase. | |
Use a hard-coded system-wide dictionary as a list of English words. | |
Return a list of all possible strings that satisfy the transformation. | |
""" | |
# This function defines a number of helper functions and then just runs | |
# `get_words()` as the starting point. This is used just to isolate the | |
# namespace, the functions aren't real closures since they don't reference | |
# nonlocal variables, in fact this function does not have any. | |
def isword(letters, words={}): | |
""" | |
Checks if the given combination of letters can form a word. | |
Returns a list of all possible words, otherwise None. | |
""" | |
# Populate the words dict once. Since `words` is a default parameter, | |
# so it's initialised on definition, and since it's a mutable object | |
# and I use update() to change it in place, the changes persist between | |
# executions. | |
if not words: | |
words.update(make_words()) | |
try: | |
return words[sort_word(letters)] | |
except KeyError: | |
return False | |
def sort_word(letters): | |
""" | |
Return a string of aphabetically sorted letters. E.g.: | |
'navi' -> 'ainv', 'dendi' -> 'ddein' | |
""" | |
return ''.join(sorted(letters)) | |
def make_words(): | |
""" | |
Populate the word list from a hardcoded systemwide list. | |
Store as dict, key is the word letters sorted alphabetically, | |
value is a list of all words that can be made out of same letters. | |
""" | |
DICTFILE = "/usr/share/dict/words" | |
worddict = {} | |
with open(DICTFILE, 'r') as wordfile: | |
for word in wordfile: | |
# Skip erroneous entries, the dict has extra entries for plural. | |
# Lots of junk in there, have to ignore short words | |
# because it has pretty much all letter combos <= 2 letters. | |
if word[-2:] == "'s": # or len(word) <= 2: # temp turn off | |
continue | |
# Word needs to be lowercase and alpha. | |
word = word.lower() | |
word = ''.join(filter(str.isalpha, word)) | |
key = sort_word(word) | |
try: | |
worddict[key] | |
except KeyError: | |
worddict[key] = [] | |
if word not in worddict[key]: | |
worddict[key].append(word) | |
return worddict | |
def string_product(*args): | |
""" | |
Combine elements of all supplied lists into a list of strings, | |
with elements separated by space, covering all permutations. E.g.: | |
['a', 'b'], ['x', 'y'], ['f', 'h'] -> | |
[ 'a x f', 'a x h', 'a y f', 'a y h', | |
'b x f', 'b x h', 'b y f', 'b y h' ] | |
""" | |
return [' '.join(i) for i in product(*args)] | |
def get_words(string): | |
""" | |
Recursive function that accepts string as input and tries to split | |
it into English words, assumes the transformation described above. | |
Returns a list of strings, each string represents | |
one possible combination of words. | |
Returns an empty list of no possible combinations were found. | |
""" | |
output = [] | |
# Go through slices out of the string, one more character at a time, | |
# and see if there are words that can be formed out of these letters. | |
for index in range(len(string)): | |
found_words = isword(string[:index+1]) | |
# If there are no words out of these letters, just move on and add | |
# one more letter to the slice on the next cycle. | |
if found_words: | |
# Is there more string to process once we cut off our word? | |
if string[index+1:]: | |
# Process the rest of the string. | |
rest_of_words = get_words(string[index+1:]) | |
if rest_of_words: | |
output += string_product(found_words, rest_of_words) | |
# If there is more string to process but no words can be | |
# fitted in there, then the word we found does not fit us, | |
# just move on to the next, bigger, slice. | |
# If we got a slice that equals the lenght of the string, and | |
# we found words that fit that slice, they are our only result. | |
else: | |
output += found_words | |
# If we didn't find any combination of words and rest_of_words to fit | |
# the whole lenght of the string, the output list will remain empty. | |
return output | |
# Run the motherfucker! | |
return get_words(string) | |
if __name__ == '__main__': | |
# Simply run the tests | |
from string import ascii_letters | |
from unittest import TestCase | |
from unittest import main as tests_main | |
class TestTransformations(TestCase): | |
def setUp(self): | |
self.strings = [ | |
"mark these words", | |
"you love this fluffy cat", | |
"special characters", | |
] | |
# TODO: non-alpha, mixed case, non-english, corner-cases | |
self.inputs = {} | |
for str_plain in self.strings: | |
str_transformed = transform(str_plain) | |
self.inputs[str_transformed] = str_plain | |
def test_transform(self): | |
for str_in in self.inputs: | |
plain = list(self.inputs[str_in]) | |
for char in str_in: | |
plain.remove(char) | |
# TODO: need to also check that the order is preserved in words | |
for char in plain: | |
self.assertNotIn(char, ascii_letters) | |
def test_untransform(self): | |
for str_in in self.inputs: | |
self.assertIn(self.inputs[str_in], untransform(str_in)) | |
tests_main() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment