Skip to content

Instantly share code, notes, and snippets.

@codebynumbers
Last active December 16, 2015 17:39
Show Gist options
  • Select an option

  • Save codebynumbers/5471412 to your computer and use it in GitHub Desktop.

Select an option

Save codebynumbers/5471412 to your computer and use it in GitHub Desktop.
Two different approaches to substring lookup
# ======================
# Naive search
# ======================
import time
import random
input = 'https://www.facebook.com/teefury?hc_location=stream'
bids = []
with open('/usr/share/dict/words') as fh:
for line in fh:
word = line.strip().lower()
if len(word) > 3:
bids.append(word)
start = time.time()
for i in xrange(0, len(input)):
for keyword in bids:
if input[i:].startswith(keyword):
print keyword, input[i:]
print "Elapsed %0.5f" % (time.time() - start)
# OUTPUT
facebook facebook.com/teefury?hc_location=stream
face facebook.com/teefury?hc_location=stream
book book.com/teefury?hc_location=stream
fury fury?hc_location=stream
location location=stream
cation cation=stream
stream stream
ream ream
Elapsed 2.56454
# ======================
# Trie - (memory usage savings)
# ======================
import time
import random
import marisa_trie
input = 'https://www.facebook.com/teefury?hc_location=stream'
bids = []
with open('/usr/share/dict/words') as fh:
for line in fh:
word = line.strip().lower().decode('utf-8')
if len(word) > 3:
bids.append(word)
trie = marisa_trie.Trie(bids)
start = time.time()
for i in xrange(0, len(input)):
# lookahead max 20 characters or until end of string
for j in xrange(1, min(len(input)-i+1, 20)):
if unicode("".join(input[i:i+j])) in trie:
print input[i:i+j]
print "Elapsed %0.5f" % (time.time() - start)
## OUTPUT
face
facebook
book
fury
location
cation
stream
ream
Elapsed 0.00278
# ======================
# Big hash map (memory usage could be an issue)
# ======================
import time
import random
input = 'https://www.facebook.com/teefury?hc_location=stream'
bids = {}
with open('/usr/share/dict/words') as fh:
for line in fh:
word = line.strip().lower().decode('utf-8')
if len(word) > 3:
bids[word] = True
start = time.time()
for i in xrange(0, len(input)):
for j in xrange(1, min(len(input)-i+1, 20)):
if bids.get(unicode("".join(input[i:i+j]))):
print input[i:i+j]
print "Elapsed %0.5f" % (time.time() - start)
## OUTPUT
face
facebook
book
fury
location
cation
stream
ream
Elapsed 0.00252
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment