Created
May 29, 2020 02:32
-
-
Save nihilismus/e9c2d0b4c1dffd77d432530cd0ba86c9 to your computer and use it in GitHub Desktop.
Mini Scraper 2 - https://pywombat.com/exercises/e770767b
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env bash
# Mini Scraper 2
# https://pywombat.com/exercises/e770767b
# Author: Antonio Hernández Blas <hba.nihilismus<at>gmail.com>
# DO WHAT THE FUCK YOU WANT TO PUBLIC LICENSE
# Version 2, December 2004
#
# Copyright (C) 2004 Sam Hocevar <sam@hocevar.net>
#
# Everyone is permitted to copy and distribute verbatim or modified
# copies of this license document, and changing it is allowed as long
# as the name is changed.
#
# DO WHAT THE FUCK YOU WANT TO PUBLIC LICENSE
# TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
#
# 0. You just DO WHAT THE FUCK YOU WANT TO.
# Number of pages fetched as background jobs before the script pauses.
readonly NUMBER_OF_THREADS_IN_A_SET_OF_THREADS=20
# Length, in seconds, of the pause between one set of background jobs and the
# next one.
readonly SECONDS_TO_WAIT_BEFORE_EXECUTING_A_NEW_SET_OF_THREADS=5
#######################################
# Extract the page heading from a chunk of HTML.
# Arguments: the HTML text; several arguments are joined with single spaces.
# Outputs:   every line that starts with <h1>, with its first "<h1>" and
#            first "</h1>" removed, on stdout (nothing when no line matches).
#######################################
get_name() {
  # printf instead of echo: the text is arbitrary data and must never be
  # parsed as echo options.
  printf '%s\n' "$*" \
    | grep -E '^<h1>' \
    | sed -e 's_<h1>__' -e 's_</h1>__'
}
#######################################
# Extract the species text from a chunk of HTML.
# Arguments: the HTML text; several arguments are joined with single spaces.
# Outputs:   the contents of every line of the form <td>...</td>, sorted,
#            de-duplicated, stripped of em dashes and joined on a single
#            line with one space between words, on stdout.
#######################################
get_species() {
  # The last stage collapses all whitespace (including newlines) into single
  # spaces. awk is used rather than a bare xargs because xargs gives special
  # meaning to quotes and backslashes and aborts on text such as "Farfetch'd".
  printf '%s\n' "$*" \
    | grep -E '^<td>.*</td>' \
    | sed -e 's_<td>__' -e 's_</td>__' \
    | sort -u \
    | sed 's/—//g' \
    | awk '{ for (i = 1; i <= NF; i++) { printf "%s%s", sep, $i; sep = " " } }
           END { print "" }'
}
#######################################
# Extract the type names from a chunk of HTML.
# Arguments: the HTML text; several arguments are joined with single spaces.
# Outputs:   for every piece of text that ends at a </a> and mentions
#            "type-icon", whatever follows its last '">'; the results are
#            sorted, de-duplicated and joined on a single line with one space
#            between words, on stdout.
#######################################
get_types() {
  # 1. Put each anchor on a line of its own by turning every </a> into a line
  #    break (awk gsub, because "\n" in a sed replacement is GNU-specific).
  # 2. Keep the pieces that mention type-icon and drop everything up to the
  #    end of the opening tag.
  # 3. Collapse the unique values into one space-separated line; awk is used
  #    rather than a bare xargs, which would choke on quotes or backslashes.
  printf '%s\n' "$*" \
    | awk '{ gsub(/<\/a>/, "\n"); print }' \
    | grep 'type-icon' \
    | sed 's/^.*">//' \
    | sort -u \
    | awk '{ for (i = 1; i <= NF; i++) { printf "%s%s", sep, $i; sep = " " } }
           END { print "" }'
}
#######################################
# Download one page and print a one-line summary of it.
# Arguments: $1 - URL of the page to download with lynx.
# Outputs:   "name: <name> - species: <species> - types: <types>" on stdout;
#            a warning on stderr when nothing could be extracted.
# Returns:   0 when a summary was printed, 1 when the download produced no
#            usable lines.
#######################################
get_and_print_info() {
  local url=$1
  local info

  # Keep only the heading, Type and Species rows, each with the two lines
  # that follow it, and discard any of those lines mentioning "Generation".
  info="$(
    lynx -source "$url" \
      | grep -E -A 2 '^<h1>|^<th>Type|^<th>Species' \
      | grep -v Generation
  )"

  # A failed download or an unexpected page layout leaves nothing to report;
  # say so on stderr instead of printing a summary made of empty fields.
  if [[ -z "$info" ]]; then
    printf 'warning: no data could be extracted from %s\n' "$url" >&2
    return 1
  fi

  printf 'name: %s - species: %s - types: %s\n' \
    "$(get_name "$info")" \
    "$(get_species "$info")" \
    "$(get_types "$info")"
}
# Build the list of page URLs: keep the lines of the index dump that contain a
# pokedex URL, drop the index itself and the per-game pokedex pages, strip
# whatever precedes the URL on each line and de-duplicate the result.
pokemones="$(
  lynx -dump https://pokemondb.net/pokedex/all \
    | grep 'https://pokemondb.net/pokedex/' \
    | grep -vE 'pokedex/all|pokedex/game' \
    | sed 's/^.*https/https/' \
    | sort -u
)"

if [[ -z "$pokemones" ]]; then
  printf 'error: no page URLs could be retrieved from the pokedex index\n' >&2
  exit 1
fi

# Fetch the pages in sets of NUMBER_OF_THREADS_IN_A_SET_OF_THREADS background
# jobs. The list is read through file descriptor 3 so that the standard input
# of the background jobs is left alone, and one URL per line is read verbatim
# (no word splitting, no globbing).
i=0
while IFS= read -r pokemon <&3; do
  if (( i >= NUMBER_OF_THREADS_IN_A_SET_OF_THREADS )); then
    # The current set is full: let it finish, pause, then start a new set.
    # The URL in hand is launched below, so it opens the new set instead of
    # being dropped.
    wait
    sleep "$SECONDS_TO_WAIT_BEFORE_EXECUTING_A_NEW_SET_OF_THREADS"
    i=0
  fi
  get_and_print_info "$pokemon" &
  i=$(( i + 1 ))
done 3<<<"$pokemones"

# Do not exit while jobs of the last set are still printing.
wait
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment