Skip to content

Instantly share code, notes, and snippets.

@mtao
Last active March 28, 2018 18:21
Show Gist options
  • Save mtao/9fd800fae6efd39f70154924e9abc3d0 to your computer and use it in GitHub Desktop.
Save mtao/9fd800fae6efd39f70154924e9abc3d0 to your computer and use it in GitHub Desktop.
emulator for some algorithms problem with solution (scrape an entire database when only able to query {prefix -> first N lexicographical matches} for some static N). Base strategy is to use lexicographical depth-first search with some shortcutting (when there's N results we can hope there's more and attach ourselves to a longer common prefix).
if [ ! -e words.txt ]; then
wget https://raw.githubusercontent.com/dwyl/english-words/master/words.txt > /dev/null 2&>1
cat words.txt | tr '[:upper:]' '[:lower:]'| sort -u | grep -v "-" | grep -v "\." | grep -v "'" | grep -v "/" | grep -v "&" > /tmp/words_sorted.txt
fi
rm /tmp/my_dict.txt
prefix="$1"
grep "^$1" words.txt | head -n 5
charset=$( cat /tmp/words_sorted.txt | grep -o . | sort -u | tr -d "\n" )
charset_size=${#charset}
return_size=5
function oracle() {
prefix="$1"
grep "^$1" /tmp/words_sorted.txt | head -n $return_size
}
function get_char() {
echo ${charset:$1:1}
}
function char_index() {
expr $( expr index "$charset" "$1" ) - 1
}
#max similar substring from the beginning
function max_similar_substring_len() {
s1="$1"
s2="$2"
if [ ${#s1} -lt ${#s2} ]; then
len=${#s1}
else
len=${#s2}
fi
for m in $( seq 0 $( expr ${len} - 1 ) )
do
if [ ${s1:${m}:1} != ${s2:${m}:1} ];
then
echo $m
return
fi
done
echo $len
}
#current prefix starts with the first character
current_prefix=${charset:0:1}
#last character in current prefix
function prefix_end() {
prefix_last_index=$( expr ${#current_prefix} - 1 )
#last character of prefix
echo ${current_prefix:${prefix_last_index}:1}
}
#remove last character in prefix
function pop_prefix() {
prefix_last_index=$( expr ${#current_prefix} - 1 )
#Pop a prefix
new_current_prefix="${current_prefix:0:${prefix_last_index}}"
current_prefix="${new_current_prefix}"
}
#lexicographical increment
function increment_prefix() {
start_prefix=${current_prefix}
if [ "${#current_prefix}" -eq 0 ]
then return
fi
tail=$( prefix_end )
pop_prefix
charidx=$( char_index ${tail} )
nextidx=$( expr ${charidx} + 1 )
if [ ${nextidx} -eq ${charset_size} ];
then
increment_prefix
else
nextchar=$( get_char ${nextidx} )
current_prefix="${current_prefix}${nextchar}"
fi
#>&2 echo "${start_prefix} -> ${current_prefix}"
}
function oracle_last() {
prefix="$1"
res=$( oracle $prefix )
>&2 echo $prefix: $res
echo "$res" | tail -n 1
}
function next_prefix() {
echo "Current prefix: ${current_prefix}"
res=$( oracle ${current_prefix} )
#if [ -n "${res}" ]
#then >&2 echo "current:" ${res}
#fi
len=$( echo "$res" | wc -l )
last2=$( echo "$res" | tail -n 2 )
lastnd=$( echo "$last2" | head -n 1 )
last=$( echo "$last2" | tail -n 1 )
if [ $len -eq $return_size ];
then
#write to file
echo "$res" | head -n $(expr ${return_size} - 1) >> /tmp/my_dict.txt
#new prefix is the smallest prefix not shared by the last two elements
new_prefix_len=$( expr $( max_similar_substring_len $last $lastnd ) + 1 )
new_current_prefix="${last:0:${new_prefix_len}}"
>&2 echo "Digging down: ${current_prefix} -> ${new_current_prefix}"
current_prefix="${new_current_prefix}"
else
#write to file but beware of empty newlines by empty newwords
newwords=${res}
if [ -n "${newwords}" ]; then
echo "${newwords}" >> /tmp/my_dict.txt
fi
#jump up until the word-graph until there's something to the right and start searching to that right
myprefix=${current_prefix}
while [ "$( oracle_last ${myprefix} )" = "$last" ]
do
prefix_last_index=$( expr ${#myprefix} - 1 )
#Pop a prefix
new_myprefix="${myprefix:0:${prefix_last_index}}"
myprefix="${new_myprefix}"
done
current_prefix=${current_prefix:0:$( expr ${#myprefix} + 1 )}
increment_prefix
fi
}
#char_loop_str=$( seq 0 $( expr ${charset_size} - 1 ) )
#
#for m in $char_loop_str;
#do echo "$m) $( get_char $m ) $( char_index $( get_char $m ) )"
#done
while [ ${#current_prefix} -gt 0 ]
do next_prefix
done
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment