Skip to content

Instantly share code, notes, and snippets.

@mfbx9da4
Last active March 17, 2018 19:27
Show Gist options
  • Save mfbx9da4/e28962237268b3c7f21420957ce44543 to your computer and use it in GitHub Desktop.
Save mfbx9da4/e28962237268b3c7f21420957ce44543 to your computer and use it in GitHub Desktop.
Return the length of the longest common prefix for a set of strings
from timer import Timer
import matplotlib.pyplot as plt
class Solution(object):
def longestCommonPrefix(self, strings):
if not len(strings):
return 0
if len(strings) == 1:
return len(strings[0])
smallest_i = 0
smallest = strings[smallest_i]
for i, string in enumerate(strings):
if len(string) < len(smallest):
smallest_i = i
smallest = string
lo = 0
hi = len(smallest) - 1
found_prefix = False
while lo < hi:
mid = ((hi - lo + 1) // 2) + lo
broke = False
for string in strings:
if string[:mid] != smallest[:mid]:
hi = mid - 1
broke = True
break
if not broke:
found_prefix = True
lo = mid
# print('prefix', str2[:lo])
# return str2[:lo]
return lo
def longestCommonPrefix2(self, strings):
smallest = strings[0]
smallest_i = 0
for i in range(1, len(strings)):
string = strings[i]
if len(string) < len(smallest):
smallest_i = i
smallest = string
for s in range(len(smallest)):
for i, string in enumerate(strings):
if i != smallest_i and string[s] != smallest[s]:
return s
def longestCommonPrefix3(self, strings):
strings_as_bytes = [bytes(string, 'utf-8') for string in strings]
smallest = strings_as_bytes[0]
smallest_i = 0
for i in range(1, len(strings_as_bytes)):
string = strings_as_bytes[i]
if len(string) < len(smallest):
smallest_i = i
smallest = string
for s in range(len(smallest)):
for i, string in enumerate(strings_as_bytes):
if i != smallest_i and string[s] != smallest[s]:
return s
solution = Solution()
string1 = 'aaaabbbbbbbbbbbbbbbbbbbbbbbb'
string2 = 'aaaacccccccccccccccccccccccc'
with Timer('short prefix v1') as t:
for i in range(10000):
assert solution.longestCommonPrefix([string1, string2]) == 4
with Timer('short prefix v2') as t:
for i in range(10000):
assert solution.longestCommonPrefix2([string1, string2]) == 4
with Timer('short prefix v3') as t:
for i in range(10000):
assert solution.longestCommonPrefix3([string1, string2]) == 4
string1 = 'aaaaaaaaaaaaaaaaaaaaaabbbbbbbbbbbbbbbbbbbbbbbb'
string2 = 'aaaaaaaaaaaaaaaaaaaaaacccccccccccccccccccccccc'
with Timer('medium prefix v1') as t:
for i in range(10000):
assert solution.longestCommonPrefix([string1, string2]) == 22
with Timer('medium prefix v2') as t:
for i in range(10000):
assert solution.longestCommonPrefix2([string1, string2]) == 22
with Timer('medium prefix v3') as t:
for i in range(10000):
assert solution.longestCommonPrefix3([string1, string2]) == 22
string1 = 'aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaabbbbbbbbbbbbbbbbbbbbbbbb'
string2 = 'aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaacccccccccccccccccccccccc'
with Timer('long prefix v1') as t:
for i in range(10000):
assert solution.longestCommonPrefix([string1, string2]) == 224
with Timer('long prefix v2') as t:
for i in range(10000):
assert solution.longestCommonPrefix2([string1, string2]) == 224
with Timer('long prefix v3') as t:
for i in range(10000):
assert solution.longestCommonPrefix3([string1, string2]) == 224
v1 = []
v2 = []
v3 = []
for prefix_length in range(1000):
if prefix_length % 100 == 0:
print(prefix_length)
string1 = ('a' * prefix_length) + 'bbbbbbbbbbbbbbbbbbbbbbbb'
string2 = ('a' * prefix_length) + 'cccccccccccccccccccccccc'
with Timer(print_message=False) as t:
for i in range(10):
assert solution.longestCommonPrefix([string1, string2]) == prefix_length
v1.append(t.delta)
with Timer(print_message=False) as t:
for i in range(10):
assert solution.longestCommonPrefix2([string1, string2]) == prefix_length
v2.append(t.delta)
with Timer(print_message=False) as t:
for i in range(10):
assert solution.longestCommonPrefix3([string1, string2]) == prefix_length
v3.append(t.delta)
plt.plot(v1, label='binary search, native string comparison')
plt.plot(v2, label='compare characters')
plt.plot(v3, label='compare bytes')
plt.ylabel('time (s)')
plt.xlabel('prefix_length')
plt.legend()
plt.show()
@mfbx9da4
Copy link
Author

mfbx9da4 commented Mar 16, 2018

For shorter prefixes v2 is faster as it returns early. However, for prefixes longer than 22 characters v1 is faster.

This disparity occurs because accessing a character in the string using string[i] creates a new string, as seen in v2 . However, native string comparison using == is much faster as the Python is able to compare the internal byte representations of each character without creating new strings, as seen in v1 .

line:60 :: short prefix v1 :: 0.083 secs
line:63 :: short prefix v2 :: 0.042 secs
line:69 :: medium prefix v1 :: 0.092 secs
line:72 :: medium prefix v2 :: 0.089 secs
line:79 :: long prefix v1 :: 0.115 secs
line:82 :: long prefix v2 :: 0.548 secs

longest_common_prefix

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment