Last active
November 11, 2023 23:31
-
-
Save lcpz/fc02cbf6f0108259302ee4b7d9924dbe to your computer and use it in GitHub Desktop.
Sardinas-Patterson algorithm
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#! /usr/bin/env python
# Copyright 2015 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# original: https://goo.gl/kkF5SY
"""Check whether a given variable-length code is uniquely decodable.
This is a direct/naive implementation of the Sardinas-Patterson algorithm.
It can be used to check if e.g. a given phoneme inventory yields unambiguous
transcriptions.
"""
def LeftQuotientOfWord(ps, w):
  """Yields the suffixes of w after removing any prefix in ps.

  Every prefix in ps that matches w is considered. (An earlier version
  stopped at the first match, which made the result depend on the
  arbitrary iteration order of ps and dropped suffixes needed by the
  Sardinas-Patterson construction.)

  Args:
    ps: iterable of prefix strings.
    w: the word to divide on the left.

  Yields:
    w with a matching prefix removed, once per matching prefix in ps.
  """
  for p in ps:
    if w.startswith(p):
      yield w[len(p):]
def LeftQuotient(ps, ws):
  """Returns the set of suffixes of any word in ws after removing any prefix
  in ps. This is the quotient set which results from dividing ws on the
  left by ps."""
  return {suffix for word in ws for suffix in LeftQuotientOfWord(ps, word)}
def IsUniquelyDecodable(cs):
  """Checks if the set of codewords cs is uniquely decodable via the
  Sardinas-Patterson algorithm.

  Args:
    cs: iterable of codeword strings.

  Returns:
    True iff the code is uniquely decodable. Diagnostic messages are
    printed to stdout.
  """
  def _Quotient(ps, ws):
    # Left quotient: every suffix obtained by stripping ANY matching prefix
    # in ps from any word in ws. Considering all matching prefixes (not just
    # the first one found) is required for Sardinas-Patterson to be correct.
    return {w[len(p):] for w in ws for p in ps if w.startswith(p)}

  cs = set(cs)
  s = _Quotient(cs, cs)
  s.discard('')  # Every codeword trivially divides itself.
  if not s:
    # No codeword is a proper prefix of another.
    print('Uniquely decodable prefix code.')
    return True
  # Every dangling-suffix set is a subset of the finite set of all codeword
  # suffixes, so tracking visited sets guarantees termination. (The previous
  # iteration-count bound was computed from string representations and could
  # wrongly report ambiguous codes such as {0, 1, 000000} as decodable.)
  seen = set()
  while '' not in s and not (s & cs):
    key = frozenset(s)
    if key in seen:
      # The suffix sets cycle without ever producing an ambiguity witness.
      print('Uniquely decodable.')
      return True
    seen.add(key)
    s = _Quotient(cs, s) | _Quotient(s, cs)
  # Reaching here means an ambiguity witness exists: some message has two
  # distinct codeword factorizations.
  if '' in s:
    print('Dangling empty suffix.')
  for x in s & cs:
    print('Dangling suffix: {}'.format(x))
  return False
# TODO: add support for power and star operators, for instance: a^2, (ab)^n, a^*b, (ab)^*
if __name__ == '__main__':
  # Usage:
  #   1.a $ python sp.py
  #   1.b input code words, one per line (hit enter)
  #   1.c hit ctrl+d
  #   2   $ python sp.py code.txt
  #       where code.txt contains one code word per line
  import sys
  cs = set()
  if len(sys.argv) > 1:
    with open(sys.argv[1]) as reader:
      # Iterate the file lazily instead of materializing it via readlines().
      for line in reader:
        cs.update(line.split())
  else:
    for line in sys.stdin:
      cs.update(line.split())
  # Exit status 0 iff the code is uniquely decodable.
  sys.exit(0 if IsUniquelyDecodable(cs) else 1)
@shrumo Thanks, fixed.
Very good job
I think this line :
NL, i = len(cs) * len(max(len(x) for x in cs)), 1
Should be like:
NL, i = len(str(cs)) * len(str(max(len(x) for x in cs))), 1
Thank you
@Tareq-mis done.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
For codewords:
0 1 000000
The answer should be that it is not uniquely decodable. (The word 000000 is ambiguous.)
The solution here marks it as uniquely decodable.
The reason for that is probably Levenshtein's upper bound for termination. I could not find the paper with the mentioned bound, but I think it should be:
NL, i = len(cs) * max(len(x) for x in cs), 1 # Levenstein's upper bound for termination
instead of:
NL, i = len(cs) * len(max(cs)), 1 # Levenstein's upper bound for termination
in line 45.
This change takes the word of biggest length, instead of the word first lexicographically.