Last active
March 17, 2018 19:27
-
-
Save mfbx9da4/e28962237268b3c7f21420957ce44543 to your computer and use it in GitHub Desktop.
Return the length of the longest common prefix for a set of strings
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from timer import Timer | |
import matplotlib.pyplot as plt | |
class Solution(object): | |
def longestCommonPrefix(self, strings): | |
if not len(strings): | |
return 0 | |
if len(strings) == 1: | |
return len(strings[0]) | |
smallest_i = 0 | |
smallest = strings[smallest_i] | |
for i, string in enumerate(strings): | |
if len(string) < len(smallest): | |
smallest_i = i | |
smallest = string | |
lo = 0 | |
hi = len(smallest) - 1 | |
found_prefix = False | |
while lo < hi: | |
mid = ((hi - lo + 1) // 2) + lo | |
broke = False | |
for string in strings: | |
if string[:mid] != smallest[:mid]: | |
hi = mid - 1 | |
broke = True | |
break | |
if not broke: | |
found_prefix = True | |
lo = mid | |
# print('prefix', str2[:lo]) | |
# return str2[:lo] | |
return lo | |
def longestCommonPrefix2(self, strings): | |
smallest = strings[0] | |
smallest_i = 0 | |
for i in range(1, len(strings)): | |
string = strings[i] | |
if len(string) < len(smallest): | |
smallest_i = i | |
smallest = string | |
for s in range(len(smallest)): | |
for i, string in enumerate(strings): | |
if i != smallest_i and string[s] != smallest[s]: | |
return s | |
def longestCommonPrefix3(self, strings): | |
strings_as_bytes = [bytes(string, 'utf-8') for string in strings] | |
smallest = strings_as_bytes[0] | |
smallest_i = 0 | |
for i in range(1, len(strings_as_bytes)): | |
string = strings_as_bytes[i] | |
if len(string) < len(smallest): | |
smallest_i = i | |
smallest = string | |
for s in range(len(smallest)): | |
for i, string in enumerate(strings_as_bytes): | |
if i != smallest_i and string[s] != smallest[s]: | |
return s | |
solution = Solution() | |
string1 = 'aaaabbbbbbbbbbbbbbbbbbbbbbbb' | |
string2 = 'aaaacccccccccccccccccccccccc' | |
with Timer('short prefix v1') as t: | |
for i in range(10000): | |
assert solution.longestCommonPrefix([string1, string2]) == 4 | |
with Timer('short prefix v2') as t: | |
for i in range(10000): | |
assert solution.longestCommonPrefix2([string1, string2]) == 4 | |
with Timer('short prefix v3') as t: | |
for i in range(10000): | |
assert solution.longestCommonPrefix3([string1, string2]) == 4 | |
string1 = 'aaaaaaaaaaaaaaaaaaaaaabbbbbbbbbbbbbbbbbbbbbbbb' | |
string2 = 'aaaaaaaaaaaaaaaaaaaaaacccccccccccccccccccccccc' | |
with Timer('medium prefix v1') as t: | |
for i in range(10000): | |
assert solution.longestCommonPrefix([string1, string2]) == 22 | |
with Timer('medium prefix v2') as t: | |
for i in range(10000): | |
assert solution.longestCommonPrefix2([string1, string2]) == 22 | |
with Timer('medium prefix v3') as t: | |
for i in range(10000): | |
assert solution.longestCommonPrefix3([string1, string2]) == 22 | |
string1 = 'aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaabbbbbbbbbbbbbbbbbbbbbbbb' | |
string2 = 'aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaacccccccccccccccccccccccc' | |
with Timer('long prefix v1') as t: | |
for i in range(10000): | |
assert solution.longestCommonPrefix([string1, string2]) == 224 | |
with Timer('long prefix v2') as t: | |
for i in range(10000): | |
assert solution.longestCommonPrefix2([string1, string2]) == 224 | |
with Timer('long prefix v3') as t: | |
for i in range(10000): | |
assert solution.longestCommonPrefix3([string1, string2]) == 224 | |
v1 = [] | |
v2 = [] | |
v3 = [] | |
for prefix_length in range(1000): | |
if prefix_length % 100 == 0: | |
print(prefix_length) | |
string1 = ('a' * prefix_length) + 'bbbbbbbbbbbbbbbbbbbbbbbb' | |
string2 = ('a' * prefix_length) + 'cccccccccccccccccccccccc' | |
with Timer(print_message=False) as t: | |
for i in range(10): | |
assert solution.longestCommonPrefix([string1, string2]) == prefix_length | |
v1.append(t.delta) | |
with Timer(print_message=False) as t: | |
for i in range(10): | |
assert solution.longestCommonPrefix2([string1, string2]) == prefix_length | |
v2.append(t.delta) | |
with Timer(print_message=False) as t: | |
for i in range(10): | |
assert solution.longestCommonPrefix3([string1, string2]) == prefix_length | |
v3.append(t.delta) | |
plt.plot(v1, label='binary search, native string comparison') | |
plt.plot(v2, label='compare characters') | |
plt.plot(v3, label='compare bytes') | |
plt.ylabel('time (s)') | |
plt.xlabel('prefix_length') | |
plt.legend() | |
plt.show() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
For shorter prefixes
v2
is faster as it returns early. However, for prefixes longer than 22 charactersv1
is faster.This disparity occurs because accessing a character in the string using
string[i]
creates a new string, as seen inv2
. However, native string comparison using==
is much faster as the Python is able to compare the internal byte representations of each character without creating new strings, as seen inv1
.