Skip to content

Instantly share code, notes, and snippets.

@theagoliveira
Last active April 7, 2020 20:55
Show Gist options
  • Save theagoliveira/9f3aa79f343c440eb03408cbfcbbb305 to your computer and use it in GitHub Desktop.
Save theagoliveira/9f3aa79f343c440eb03408cbfcbbb305 to your computer and use it in GitHub Desktop.
Bash script to get every result in a series of PCI Concursos searches and save to CSV files
#!/bin/bash
#
# Author: Thiago Cavalcante
# github.com/theagoliveira
# Scrape every result page of each PCI Concursos search listed in ./urls.txt
# and append the rows to one CSV file per search.
#
# Input:    ./urls.txt - whitespace-separated search URLs, e.g.
#           https://www.pciconcursos.com.br/provas/biblioteconomista/
# Output:   ./<SEARCH NAME IN UPPER CASE>.csv in the current directory
#           (columns: PROVA,LINK,ANO,INSTITUIÇÃO,ORGANIZADORA,NÍVEL);
#           progress lines (URL/PAGES/DIR/N) on stdout, warnings on stderr.
# Requires: bash 4+, curl, pup, sed, paste
# Exit:     0 when every search was scraped, 1 when anything was skipped.
#
# No `set -e` on purpose: each failure is checked explicitly so that one bad
# URL or page is reported and skipped instead of aborting the whole batch.
#
# TODO: downloading the exam archives themselves (href of 'ul[class="zip"] li a'
# on each exam page) was attempted in an earlier revision and never worked.

readonly PCI_BASE_URL='https://www.pciconcursos.com.br/provas/'
readonly PAGE_LABEL_SELECTOR='span[style="margin:0 30px 0 2px;"] text{}'
readonly CSV_HEADER='PROVA,LINK,ANO,INSTITUIÇÃO,ORGANIZADORA,NÍVEL'

# Print a diagnostic message on stderr.
warn() {
  printf '%s\n' "$*" >&2
}

# Strip every trailing "/" from $1 and print the result.
strip_trailing_slashes() {
  local value=$1
  while [[ "$value" == */ ]]; do
    value=${value%/}
  done
  printf '%s\n' "$value"
}

# Print the CSV path stem ("./NAME" in upper case) for search URL $1.
# Trailing slashes are dropped so that ".../provas/foo/" yields "./FOO"
# rather than "./FOO/", which would point inside a directory that is never
# created.
csv_stem_from_url() {
  local stem=${1/"$PCI_BASE_URL"/}
  stem=$(strip_trailing_slashes "$stem")
  printf './%s\n' "${stem^^}"
}

# Read the pagination label text on stdin ("Mostrando página 1 de N") and
# print N as a plain decimal integer. Returns 1 when no line reduces to an
# integer, so the caller never loops over an empty or garbage page count.
parse_page_count() {
  local line
  while IFS= read -r line || [[ -n "$line" ]]; do
    line=${line//Mostrando página 1 de /}
    line=${line//[[:space:]]/}
    if [[ "$line" =~ ^[0-9]+$ ]]; then
      # 10# forces base 10 so a zero-padded count is not read as octal.
      printf '%s\n' "$((10#$line))"
      return 0
    fi
  done
  return 1
}

# Wrap every stdin line in double quotes, doubling embedded quotes so the
# field stays a single CSV value.
csv_quote_lines() {
  sed -e 's/"/""/g' -e 's/^/"/' -e 's/$/"/'
}

# Run pup selector $2 over HTML document $1; one result per output line.
pup_lines() {
  printf '%s\n' "$1" | pup "$2"
}

# Fetch result page $1 and append one CSV row per listed exam to file $2.
# Returns 1 when the page cannot be fetched, has no rows, or cannot be saved.
append_page_rows() {
  local page_url=$1
  local csv_file=$2
  local html exam link year institution organizer level

  if ! html=$(curl -fsL "$page_url"); then
    warn "skipping page (fetch failed): $page_url"
    return 1
  fi

  # Exam titles are the only quoted column.
  exam=$(pup_lines "$html" 'td[class="ca"] a, td[class="ea"] a text{}' | csv_quote_lines)
  link=$(pup_lines "$html" 'td[class="ca"] a, td[class="ea"] a attr{href}')
  year=$(pup_lines "$html" 'td[class="cb"], td[class="eb"] text{}')
  institution=$(pup_lines "$html" 'td[class="cc"] a, td[class="ec"] a text{}')
  organizer=$(pup_lines "$html" 'td[class="cd"] a, td[class="ed"] a text{}')
  level=$(pup_lines "$html" 'td[class="ce"], td[class="ee"] text{}')

  if [[ -z "$link" ]]; then
    warn "skipping page (no result rows found): $page_url"
    return 1
  fi

  # printf instead of `echo -e`: backslashes in scraped text must stay literal.
  if ! paste -d ',' \
    <(printf '%s\n' "$exam") \
    <(printf '%s\n' "$link") \
    <(printf '%s\n' "$year") \
    <(printf '%s\n' "$institution") \
    <(printf '%s\n' "$organizer") \
    <(printf '%s\n' "$level") >> "$csv_file"; then
    warn "cannot append to $csv_file"
    return 1
  fi
}

# Scrape all result pages of search URL $1 into its CSV file.
# Returns 1 when the search or any of its pages had to be skipped.
scrape_search() {
  local url=$1
  local page_base stem csv_file src pages n
  local rc=0

  # Page N lives at "<search url>/N"; avoid a double slash when the listed
  # URL already ends in one.
  page_base=$(strip_trailing_slashes "$url")
  stem=$(csv_stem_from_url "$url")
  csv_file="${stem}.csv"

  if ! src=$(curl -fsL "$url"); then
    warn "skipping search (fetch failed): $url"
    return 1
  fi
  if ! pages=$(pup_lines "$src" "$PAGE_LABEL_SELECTOR" | parse_page_count); then
    warn "skipping search (page count not found): $url"
    return 1
  fi
  if (( pages < 1 )); then
    warn "skipping search (no result pages): $url"
    return 1
  fi

  printf 'URL: %s\n' "$url"
  printf 'PAGES: %s\n' "$pages"
  printf 'DIR: %s\n' "$stem"

  # Header is appended once per search, ahead of the first page's rows.
  if ! printf '%s\n' "$CSV_HEADER" >> "$csv_file"; then
    warn "cannot write to $csv_file"
    return 1
  fi

  for ((n = 1; n <= pages; n++)); do
    printf 'N: %s\n' "$n"
    append_page_rows "${page_base}/${n}" "$csv_file" || rc=1
  done
  return "$rc"
}

# Entry point: read the URL list and scrape each search in turn.
main() {
  local urls_file=./urls.txt
  local -a urls=() words=()
  local tool url
  local rc=0

  for tool in curl pup sed paste; do
    if ! command -v "$tool" > /dev/null; then
      warn "required tool not found: $tool"
      return 1
    fi
  done
  if [[ ! -r "$urls_file" ]]; then
    warn "cannot read $urls_file"
    return 1
  fi

  # Split on whitespace (and CR, for files saved with Windows line endings)
  # without the glob expansion an unquoted $(cat ...) would apply to URLs.
  while IFS=$' \t\r\n' read -r -a words || (( ${#words[@]} > 0 )); do
    urls+=("${words[@]}")
  done < "$urls_file"

  for url in "${urls[@]}"; do
    scrape_search "$url" || rc=1
  done
  return "$rc"
}

main "$@"
https://www.pciconcursos.com.br/provas/biblioteconomista/
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment