Created
June 7, 2022 14:34
-
-
Save code-simple/6ae99490cba84b595ea76d270d1f352f to your computer and use it in GitHub Desktop.
Web scraping using Bash
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
# Example of web scraping with Bash: downloads a gas bill (sngpl.com.pk) and an
# electricity bill (bill.pitc.com.pk) and extracts the key figures from them.
# Change the two *LINK values below to point at your own bills.
#
# Tools used by this script: wget, curl, html2text, hxnormalize, hxselect,
# grep (with -P support) and awk.

# Local copy of the gas bill, expressed as a URL so it can be fetched with curl.
GASFILE='file:///tmp/gas.html'
GASLINK="https://www.sngpl.com.pk/web/viewbill?consumer=02613930003&proc=viewbill&contype=NewCon"
electricityLINK="https://bill.pitc.com.pk/pescobill/general?refno=08262140100430"
# Plain-text rendering of the electricity bill.
electricityFILE='/tmp/e.txt'

# Fail early with a clear message instead of printing an empty report later.
for required_tool in wget curl html2text hxnormalize hxselect grep awk; do
  if ! command -v "$required_tool" >/dev/null 2>&1; then
    printf 'Missing required tool: %s\n' "$required_tool" >&2
    exit 1
  fi
done

# Download the electricity bill and convert it to text for easy extraction.
# The URLs are quoted: they contain '?', which is a glob character when unquoted.
if ! wget -O /tmp/e.html "$electricityLINK"; then
  printf 'Could not download the electricity bill from %s\n' "$electricityLINK" >&2
  exit 1
fi
if ! html2text /tmp/e.html > "$electricityFILE"; then
  printf 'Could not convert the electricity bill to text\n' >&2
  exit 1
fi

# Download the gas bill once and work on the local copy (preferred: faster than
# fetching the page again for every field).
if ! wget -O "${GASFILE#file://}" "$GASLINK"; then
  printf 'Could not download the gas bill from %s\n' "$GASLINK" >&2
  exit 1
fi
# Print the content of the elements matched by '.txt-bld > td:first-child'
# in the gas bill page.
# Globals: GASFILE (read) - URL of the gas bill page, fetched with curl.
# Outputs: the matched content (without the enclosing tags) on stdout.
g_currentBill() {
  # Quoted so a location containing spaces or glob characters stays one argument.
  curl -s "$GASFILE" |
    hxnormalize -x |
    hxselect -c '.txt-bld > td:first-child'
}
# Print the text of the 5th '<td class="txt-bld">' cell of the normalized
# gas bill page.
# Globals: GASFILE (read) - URL of the gas bill page, fetched with curl.
# Outputs: the cell text on stdout; nothing when fewer than 5 cells match.
g_dueDate() {
  # sed picks exactly row 5. 'head -5 | tail -n 1' would silently return an
  # earlier, unrelated cell when the page has fewer matching rows.
  curl -s "$GASFILE" |
    hxnormalize -x |
    grep -o -P '(?<=<td class="txt-bld">).*(?=</td>)' |
    sed -n '5{p;q;}'
}
# Print the text of the 4th '<td class="txt-bld">' cell of the normalized
# gas bill page.
# Globals: GASFILE (read) - URL of the gas bill page, fetched with curl.
# Outputs: the cell text on stdout; nothing when fewer than 4 cells match.
g_billAfterDueDate() {
  # sed picks exactly row 4. 'head -4 | tail -n 1' would silently return an
  # earlier, unrelated cell when the page has fewer matching rows.
  curl -s "$GASFILE" |
    hxnormalize -x |
    grep -o -P '(?<=<td class="txt-bld">).*(?=</td>)' |
    sed -n '4{p;q;}'
}
# Print the text of the 2nd '<td class="txt-bld">' cell of the normalized
# gas bill page.
# Globals: GASFILE (read) - URL of the gas bill page, fetched with curl.
# Outputs: the cell text on stdout; nothing when fewer than 2 cells match.
g_accountNumber() {
  # sed picks exactly row 2. 'head -2 | tail -n 1' would silently return the
  # first cell when the page has only one matching row.
  curl -s "$GASFILE" |
    hxnormalize -x |
    grep -o -P '(?<=<td class="txt-bld">).*(?=</td>)' |
    sed -n '2{p;q;}'
}
# Print the text of the first '<td class="txt-rt">' cell of the normalized
# gas bill page.
# Globals: GASFILE (read) - URL of the gas bill page, fetched with curl.
# Outputs: the cell text on stdout.
g_dues() {
  # Quoted so a location containing spaces or glob characters stays one argument.
  # -m 1 makes grep stop after the first matching line.
  curl -s "$GASFILE" |
    hxnormalize -x |
    grep -m 1 -o -P '(?<=<td class="txt-rt">).*(?=</td>)'
}
# Print the content of the elements matched by '.data-td-en.bdr-bt' in the
# gas bill page.
# Globals: GASFILE (read) - URL of the gas bill page, fetched with curl.
# Outputs: the matched content (without the enclosing tags) on stdout.
name() {
  # Quoted so a location containing spaces or glob characters stays one argument.
  curl -s "$GASFILE" |
    hxnormalize -x |
    hxselect -c '.data-td-en.bdr-bt'
}
# Print the first 8 characters of the text that follows the
# 'Billing Month</td><td>' marker in the raw (non-normalized) gas bill page.
# Globals: GASFILE (read) - URL of the gas bill page, fetched with curl.
# Outputs: up to 8 characters per matching line on stdout.
g_billingMonth() {
  # The '.*' is greedy and can run past the cell, so the match is trimmed to
  # its first 8 characters to keep only the month and year.
  curl -s "$GASFILE" |
    grep -o -P '(?<=Billing Month</td><td>).*(?=</td></tr><tr height=)' |
    cut -c 1-8
}
# Print every line of the text electricity bill that starts with
# "PAYABLE WITHIN".
# Globals: electricityFILE (read) - path of the text-converted bill.
# Outputs: the matching line(s) on stdout.
e_cb() {
  # Single-quoted literal pattern: the previous "...$***" form expanded $*
  # (the function's arguments) into the regex and left a stray '**' behind.
  grep -- '^PAYABLE WITHIN' "$electricityFILE"
}
# Print every line of the text electricity bill that starts with
# "PAYABLE AFTER".
# Globals: electricityFILE (read) - path of the text-converted bill.
# Outputs: the matching line(s) on stdout.
e_afterDueDate() {
  # Single-quoted literal pattern: the previous "...$***" form expanded $*
  # (the function's arguments) into the regex and left a stray '**' behind.
  grep -- '^PAYABLE AFTER' "$electricityFILE"
}
# Print the label "Last Date :" followed, on the next line, by the last three
# words of the 2nd line of the text electricity bill that contains a digit.
# Globals: electricityFILE (read) - path of the text-converted bill.
# Outputs: the label, then the three words; only the label when that line is
#          missing or has fewer than three words.
e_lastDate() {
  printf '%s\n' 'Last Date :'
  # NR == 2 selects the second digit-bearing line directly. The NF guard keeps
  # awk from aborting on a negative field index when the line is too short.
  grep -- '[0-9]' "$electricityFILE" |
    awk 'NR == 2 { if (NF >= 3) print $(NF-2), $(NF-1), $NF; exit }'
}
# Print the label "Billing Month :" followed, on the next line, by the 11th-
# and 10th-from-last words of the 2nd line of the text electricity bill that
# contains a digit.
# Globals: electricityFILE (read) - path of the text-converted bill.
# Outputs: the label, then the two words; only the label when that line is
#          missing or has fewer than 11 words.
e_bmonth() {
  printf '%s\n' 'Billing Month :'
  # NR == 2 selects the second digit-bearing line directly. The NF guard keeps
  # awk from aborting on a negative field index when the line is too short.
  grep -- '[0-9]' "$electricityFILE" |
    awk 'NR == 2 { if (NF >= 11) print $(NF-10), $(NF-9); exit }'
}
# Print the label "NAME :" followed, on the next line, by the first two words
# of the 24th line of the text electricity bill that matches [A-Z].
# Globals: electricityFILE (read) - path of the text-converted bill.
# Outputs: the label, then the two words; only the label when fewer than 24
#          lines match.
e_name() {
  printf '%s\n' 'NAME :'
  # The pattern is quoted: an unquoted [A-Z] is a shell glob and would turn
  # into a file name if a single-letter file existed in the working directory.
  grep -- '[A-Z]' "$electricityFILE" |
    awk 'NR == 24 { print $1, $2; exit }'
}
# Print the label "Reference No." followed, on the next line, by the 14th line
# of the text electricity bill that matches [A-Z].
# Globals: electricityFILE (read) - path of the text-converted bill.
# Outputs: the label, then the line; only the label when fewer than 14 lines
#          match.
e_ref() {
  printf '%s\n' 'Reference No.'
  # The pattern is quoted: an unquoted [A-Z] is a shell glob and would turn
  # into a file name if a single-letter file existed in the working directory.
  grep -- '[A-Z]' "$electricityFILE" |
    awk 'NR == 14 { print; exit }'
}
# Print one report line made of an optional prefix, the whitespace-separated
# words written by a command, and an optional suffix, joined by single spaces.
# Arguments: $1 - prefix text (may be empty)
#            $2 - suffix text (may be empty)
#            $3... - command (and its arguments) whose output supplies the words
# Outputs:   one line on stdout.
print_field() {
  local prefix=$1 suffix=$2
  shift 2
  local IFS=$' \t\n'
  local -a words=() parts=()
  # read -a splits on whitespace exactly like an unquoted $(...) would, but
  # never glob-expands the words, so a '*' in scraped text stays a '*'.
  read -r -d '' -a words < <("$@")
  if [[ -n $prefix ]]; then
    parts+=("$prefix")
  fi
  parts+=("${words[@]}")
  if [[ -n $suffix ]]; then
    parts+=("$suffix")
  fi
  printf '%s\n' "${parts[*]}"
}

clear
printf '%s\n' '--------------------------- ELECTRICITY BILL ------------------'
print_field '' '' e_name
print_field '' '' e_ref
print_field '' '' e_bmonth
print_field '' '' e_cb
print_field '' '' e_afterDueDate
print_field '' '' e_lastDate
printf '%s\n' '---------------------------------------------------------------'
# Five blank lines between the two bills.
printf '\n\n\n\n\n'
printf '%s\n' '--------------------------- GAS BILL --------------------------'
print_field '' '' name
print_field 'ACCOUNT NUMBER :' '' g_accountNumber
print_field 'Billing Month :' '' g_billingMonth
print_field 'CURRENT BILL : Rs.' '/-' g_currentBill
print_field 'ARREARS : Rs.' '/-' g_dues
print_field 'LATE BILL : Rs.' '/-' g_billAfterDueDate
print_field 'DUE DATE :' '' g_dueDate
printf '%s\n' '----------------------------------------------------------------'
# Remove the downloaded and converted files once the report has been printed.
# -f keeps cleanup quiet if a file is already gone.
rm -f -- /tmp/gas.html /tmp/e.txt /tmp/e.html
# Keep the script running for 50000 seconds after the report is printed.
sleep 50000
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment