Last active
March 28, 2018 18:21
-
-
Save mtao/9fd800fae6efd39f70154924e9abc3d0 to your computer and use it in GitHub Desktop.
emulator for some algorithms problem with solution (scrape an entire database when only able to query {prefix -> first N lexicographical matches} for some static N). Base strategy is to use lexicographical depth-first search with some shortcutting (when there's N results we can hope there's more and attach ourselves to a longer common prefix).
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
if [ ! -e words.txt ]; then | |
wget https://raw.githubusercontent.com/dwyl/english-words/master/words.txt > /dev/null 2&>1 | |
cat words.txt | tr '[:upper:]' '[:lower:]'| sort -u | grep -v "-" | grep -v "\." | grep -v "'" | grep -v "/" | grep -v "&" > /tmp/words_sorted.txt | |
fi | |
rm /tmp/my_dict.txt | |
prefix="$1" | |
grep "^$1" words.txt | head -n 5 | |
charset=$( cat /tmp/words_sorted.txt | grep -o . | sort -u | tr -d "\n" ) | |
charset_size=${#charset} | |
return_size=5 | |
function oracle() { | |
prefix="$1" | |
grep "^$1" /tmp/words_sorted.txt | head -n $return_size | |
} | |
function get_char() { | |
echo ${charset:$1:1} | |
} | |
function char_index() { | |
expr $( expr index "$charset" "$1" ) - 1 | |
} | |
#max similar substring from the beginning | |
function max_similar_substring_len() { | |
s1="$1" | |
s2="$2" | |
if [ ${#s1} -lt ${#s2} ]; then | |
len=${#s1} | |
else | |
len=${#s2} | |
fi | |
for m in $( seq 0 $( expr ${len} - 1 ) ) | |
do | |
if [ ${s1:${m}:1} != ${s2:${m}:1} ]; | |
then | |
echo $m | |
return | |
fi | |
done | |
echo $len | |
} | |
#current prefix starts with the first character | |
current_prefix=${charset:0:1} | |
#last character in current prefix | |
function prefix_end() { | |
prefix_last_index=$( expr ${#current_prefix} - 1 ) | |
#last character of prefix | |
echo ${current_prefix:${prefix_last_index}:1} | |
} | |
#remove last character in prefix | |
function pop_prefix() { | |
prefix_last_index=$( expr ${#current_prefix} - 1 ) | |
#Pop a prefix | |
new_current_prefix="${current_prefix:0:${prefix_last_index}}" | |
current_prefix="${new_current_prefix}" | |
} | |
#lexicographical increment | |
function increment_prefix() { | |
start_prefix=${current_prefix} | |
if [ "${#current_prefix}" -eq 0 ] | |
then return | |
fi | |
tail=$( prefix_end ) | |
pop_prefix | |
charidx=$( char_index ${tail} ) | |
nextidx=$( expr ${charidx} + 1 ) | |
if [ ${nextidx} -eq ${charset_size} ]; | |
then | |
increment_prefix | |
else | |
nextchar=$( get_char ${nextidx} ) | |
current_prefix="${current_prefix}${nextchar}" | |
fi | |
#>&2 echo "${start_prefix} -> ${current_prefix}" | |
} | |
function oracle_last() { | |
prefix="$1" | |
res=$( oracle $prefix ) | |
>&2 echo $prefix: $res | |
echo "$res" | tail -n 1 | |
} | |
function next_prefix() { | |
echo "Current prefix: ${current_prefix}" | |
res=$( oracle ${current_prefix} ) | |
#if [ -n "${res}" ] | |
#then >&2 echo "current:" ${res} | |
#fi | |
len=$( echo "$res" | wc -l ) | |
last2=$( echo "$res" | tail -n 2 ) | |
lastnd=$( echo "$last2" | head -n 1 ) | |
last=$( echo "$last2" | tail -n 1 ) | |
if [ $len -eq $return_size ]; | |
then | |
#write to file | |
echo "$res" | head -n $(expr ${return_size} - 1) >> /tmp/my_dict.txt | |
#new prefix is the smallest prefix not shared by the last two elements | |
new_prefix_len=$( expr $( max_similar_substring_len $last $lastnd ) + 1 ) | |
new_current_prefix="${last:0:${new_prefix_len}}" | |
>&2 echo "Digging down: ${current_prefix} -> ${new_current_prefix}" | |
current_prefix="${new_current_prefix}" | |
else | |
#write to file but beware of empty newlines by empty newwords | |
newwords=${res} | |
if [ -n "${newwords}" ]; then | |
echo "${newwords}" >> /tmp/my_dict.txt | |
fi | |
#jump up until the word-graph until there's something to the right and start searching to that right | |
myprefix=${current_prefix} | |
while [ "$( oracle_last ${myprefix} )" = "$last" ] | |
do | |
prefix_last_index=$( expr ${#myprefix} - 1 ) | |
#Pop a prefix | |
new_myprefix="${myprefix:0:${prefix_last_index}}" | |
myprefix="${new_myprefix}" | |
done | |
current_prefix=${current_prefix:0:$( expr ${#myprefix} + 1 )} | |
increment_prefix | |
fi | |
} | |
#char_loop_str=$( seq 0 $( expr ${charset_size} - 1 ) ) | |
# | |
#for m in $char_loop_str; | |
#do echo "$m) $( get_char $m ) $( char_index $( get_char $m ) )" | |
#done | |
while [ ${#current_prefix} -gt 0 ] | |
do next_prefix | |
done |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment