Last active
April 7, 2020 20:55
-
-
Save theagoliveira/9f3aa79f343c440eb03408cbfcbbb305 to your computer and use it in GitHub Desktop.
Bash script to get every result in a series of PCI Concursos searches and save to CSV files
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
#
# Author: Thiago Cavalcante
# github.com/theagoliveira
#
# Scrapes PCI Concursos exam-search result listings into CSV files.
#
# Usage: run from a directory that contains urls.txt, with one search URL per
# line. Every URL must start with https://www.pciconcursos.com.br/provas/.
# For each URL the script fetches the first page to learn the number of result
# pages, then fetches <url>/1 .. <url>/<pages> and writes one row per exam to
# ./<NAME>.csv, where <NAME> is the upper-cased part of the URL that follows
# the prefix above. The CSV is recreated on every run.
#
# Progress (URL/PAGES/DIR/N lines) goes to stdout; problems go to stderr.
# Exit status is non-zero if any URL or page could not be processed.
#
# Requires: bash 4+, curl, pup, sed, paste.
#
# TODO: the original script had a disabled ("NOT WORKING") step that followed
# each exam link, read the 'ul[class="zip"] li a' href and downloaded it with
# wget, adding a DOWNLOAD LINK column. It is still not implemented here.

set -u

readonly URL_LIST="./urls.txt"
readonly BASE_URL="https://www.pciconcursos.com.br/provas/"
# Text that precedes the total page count in the pager element.
readonly PAGE_COUNT_PREFIX="Mostrando página 1 de "
readonly CSV_HEADER="PROVA,LINK,ANO,INSTITUIÇÃO,ORGANIZADORA,NÍVEL"

# Print a warning to stderr.
warn() {
  printf 'warning: %s\n' "$*" >&2
}

# Print an error to stderr and abort the script.
die() {
  printf 'error: %s\n' "$*" >&2
  exit 1
}

#######################################
# Run a pup selector over an HTML document.
# Arguments: $1 - HTML text, $2 - pup selector (including display function)
# Outputs:   whatever pup prints for the selection, on stdout
#######################################
select_nodes() {
  pup "$2" <<<"$1"
}

#######################################
# Count the lines a column will contribute to paste (an empty column still
# contributes one empty line).
# Arguments: $1 - column text
# Outputs:   the line count on stdout
#######################################
line_count() {
  local -a lines
  mapfile -t lines <<<"$1"
  printf '%s\n' "${#lines[@]}"
}

#######################################
# Fetch one result page and append its rows to the CSV.
# Arguments: $1 - page URL, $2 - CSV file to append to
# Returns:   0 if rows were appended, 1 if the page was skipped
#######################################
scrape_page() {
  local page_url=$1 csv=$2
  local html exam link year institution organizer level rows column

  if ! html=$(curl -fsL "$page_url"); then
    warn "could not fetch $page_url; page skipped"
    return 1
  fi

  # Each column comes from a pair of td classes (c?/e?); presumably these are
  # the alternating row styles of the result table -- verify against the site.
  exam=$(select_nodes "$html" 'td[class="ca"] a, td[class="ea"] a text{}')
  link=$(select_nodes "$html" 'td[class="ca"] a, td[class="ea"] a attr{href}')
  year=$(select_nodes "$html" 'td[class="cb"], td[class="eb"] text{}')
  institution=$(select_nodes "$html" 'td[class="cc"] a, td[class="ec"] a text{}')
  organizer=$(select_nodes "$html" 'td[class="cd"] a, td[class="ed"] a text{}')
  level=$(select_nodes "$html" 'td[class="ce"], td[class="ee"] text{}')

  # Without this guard an empty page would still emit a bogus '"",,,,,' row.
  if [[ -z $exam ]]; then
    warn "no exams found at $page_url; page skipped"
    return 1
  fi

  # paste joins the columns line by line, so unequal lengths shift the rows.
  rows=$(line_count "$exam")
  for column in "$link" "$year" "$institution" "$organizer" "$level"; do
    if (( $(line_count "$column") != rows )); then
      warn "column lengths differ at $page_url; rows may be misaligned"
      break
    fi
  done

  # The exam title is always written as a quoted CSV field; embedded double
  # quotes are doubled so the field stays well-formed.
  exam=$(sed -e 's/"/""/g' -e 's/^/"/' -e 's/$/"/' <<<"$exam")

  # printf '%s\n' keeps scraped text literal (echo -e would expand backslashes).
  paste -d "," \
    <(printf '%s\n' "$exam") \
    <(printf '%s\n' "$link") \
    <(printf '%s\n' "$year") \
    <(printf '%s\n' "$institution") \
    <(printf '%s\n' "$organizer") \
    <(printf '%s\n' "$level") >>"$csv"
}

#######################################
# Scrape every result page of one search URL into its CSV file.
# Arguments: $1 - search URL
# Outputs:   progress lines on stdout
# Returns:   0 if every page was written, 1 otherwise
#######################################
scrape_url() {
  local url=$1
  local html pager pages institution_name csv n status=0

  # A trailing slash would yield "<url>//<n>" page URLs and a "./NAME/.csv"
  # output path inside a directory that does not exist.
  while [[ $url == */ ]]; do
    url=${url%/}
  done

  if [[ $url != "$BASE_URL"?* ]]; then
    warn "not a $BASE_URL search URL: $url; URL skipped"
    return 1
  fi

  institution_name=${url#"$BASE_URL"}
  institution_name="./${institution_name^^}"
  csv="$institution_name.csv"

  printf 'URL: %s\n' "$url"

  if ! html=$(curl -fsL "$url"); then
    warn "could not fetch $url; URL skipped"
    return 1
  fi

  pager=$(select_nodes "$html" 'span[style="margin:0 30px 0 2px;"] text{}')
  if [[ $pager =~ "$PAGE_COUNT_PREFIX"([0-9]+) ]]; then
    # 10# forces base 10 so a count with leading zeros is not read as octal.
    pages=$((10#${BASH_REMATCH[1]}))
  else
    warn "could not find the page count for $url; URL skipped"
    return 1
  fi

  printf 'PAGES: %s\n' "$pages"
  printf 'DIR: %s\n' "$institution_name"

  # Start the file afresh so a re-run does not stack a second header and
  # duplicate rows on top of the previous results.
  if ! printf '%s\n' "$CSV_HEADER" >"$csv"; then
    warn "cannot write $csv; URL skipped"
    return 1
  fi

  for ((n = 1; n <= pages; n++)); do
    printf 'N: %s\n' "$n"
    scrape_page "$url/$n" "$csv" || status=1
  done

  return "$status"
}

#######################################
# Entry point: process every URL listed in urls.txt.
# Returns: 0 if all URLs were scraped completely, 1 otherwise
#######################################
main() {
  local tool url status=0

  for tool in curl pup sed paste; do
    command -v "$tool" >/dev/null || die "required tool not found: $tool"
  done
  [[ -r $URL_LIST ]] || die "cannot read $URL_LIST"

  # The list is read on fd 3 so nothing inside the loop can consume it through
  # stdin; the "|| [[ -n ... ]]" part keeps a last line with no newline.
  while IFS= read -r -u 3 url || [[ -n $url ]]; do
    # Trim surrounding whitespace (this also drops a CR from CRLF files).
    url=${url#"${url%%[![:space:]]*}"}
    url=${url%"${url##*[![:space:]]}"}
    [[ -n $url ]] || continue
    scrape_url "$url" || status=1
  done 3<"$URL_LIST"

  return "$status"
}

main "$@"
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
https://www.pciconcursos.com.br/provas/biblioteconomista/
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment