Skip to content

Instantly share code, notes, and snippets.

@code-simple
Created June 7, 2022 14:34
Show Gist options
  • Save code-simple/6ae99490cba84b595ea76d270d1f352f to your computer and use it in GitHub Desktop.
Save code-simple/6ae99490cba84b595ea76d270d1f352f to your computer and use it in GitHub Desktop.
Web scraping using Bash
#!/bin/bash
# Example of web scraping from the shell: downloads a gas bill and an
# electricity bill and extracts the necessary information from each.
# Change the LINK variables below to point at your own bills.
#
# Tools used: wget, curl, html2text, hxnormalize, hxselect, grep, awk, cut,
# head, tail.

# Local copy of the gas bill. It is a file:// URL because the g_* functions
# read it through curl.
GASFILE='file:///tmp/gas.html'
GASLINK="https://www.sngpl.com.pk/web/viewbill?consumer=02613930003&proc=viewbill&contype=NewCon"
electricityLINK="https://bill.pitc.com.pk/pescobill/general?refno=08262140100430"
# Plain-text rendering of the electricity bill, read by the e_* functions.
electricityFILE='/tmp/e.txt'

# Download the electricity bill and convert it to text for easy extraction.
# The URL is quoted because it contains '?', which the shell would otherwise
# treat as a glob character.
wget -O /tmp/e.html "$electricityLINK"
html2text /tmp/e.html > "$electricityFILE"

# Download the gas bill so it can be processed offline (preferred method:
# every g_* function re-reads the local copy instead of hitting the network).
wget -O /tmp/gas.html "$GASLINK"
# Print the current gas bill amount.
# Globals: GASFILE (read) - URL of the saved gas bill page, fetched with curl.
# Outputs: the content of every element matching the CSS selector
#          '.txt-bld > td:first-child' in the normalized page, on stdout.
# NOTE(review): that this cell holds the current bill amount depends on the
# bill page layout - confirm against a real page.
g_currentBill() {
  # hxnormalize -x turns the page into well-formed XML, which hxselect needs.
  curl -s "$GASFILE" |
    hxnormalize -x |
    hxselect -c '.txt-bld > td:first-child'
}
# Print the due date of the gas bill.
# Globals: GASFILE (read) - URL of the saved gas bill page, fetched with curl.
# Outputs: the fifth line of <td class="txt-bld"> cell matches found in the
#          normalized page (the last available one if there are fewer than
#          five), on stdout.
# NOTE(review): that the fifth such cell is the due date depends on the bill
# page layout - confirm against a real page.
g_dueDate() {
  curl -s "$GASFILE" |
    hxnormalize -x |
    grep -o -P '(?<=<td class="txt-bld">).*(?=</td>)' |
    head -n 5 |
    tail -n 1
}
# Print the gas bill amount payable after the due date.
# Globals: GASFILE (read) - URL of the saved gas bill page, fetched with curl.
# Outputs: the fourth line of <td class="txt-bld"> cell matches found in the
#          normalized page (the last available one if there are fewer than
#          four), on stdout.
# NOTE(review): that the fourth such cell is the late-payment amount depends
# on the bill page layout - confirm against a real page.
g_billAfterDueDate() {
  curl -s "$GASFILE" |
    hxnormalize -x |
    grep -o -P '(?<=<td class="txt-bld">).*(?=</td>)' |
    head -n 4 |
    tail -n 1
}
# Print the gas account number.
# Globals: GASFILE (read) - URL of the saved gas bill page, fetched with curl.
# Outputs: the second line of <td class="txt-bld"> cell matches found in the
#          normalized page (the only one if there is just one), on stdout.
# NOTE(review): that the second such cell is the account number depends on
# the bill page layout - confirm against a real page.
g_accountNumber() {
  curl -s "$GASFILE" |
    hxnormalize -x |
    grep -o -P '(?<=<td class="txt-bld">).*(?=</td>)' |
    head -n 2 |
    tail -n 1
}
# Print the outstanding dues (arrears) of the gas bill.
# Globals: GASFILE (read) - URL of the saved gas bill page, fetched with curl.
# Outputs: the content of the <td class="txt-rt"> cell on the first line of
#          the normalized page that contains one, on stdout.
# NOTE(review): that this cell holds the arrears depends on the bill page
# layout - confirm against a real page.
g_dues() {
  # -m 1 makes grep stop after the first matching line.
  curl -s "$GASFILE" |
    hxnormalize -x |
    grep -m 1 -o -P '(?<=<td class="txt-rt">).*(?=</td>)'
}
# Print the consumer name shown on the gas bill.
# Globals: GASFILE (read) - URL of the saved gas bill page, fetched with curl.
# Outputs: the content of every element matching the CSS selector
#          '.data-td-en.bdr-bt' in the normalized page, on stdout.
# NOTE(review): that this element holds the consumer name depends on the
# bill page layout - confirm against a real page.
name() {
  # hxnormalize -x turns the page into well-formed XML, which hxselect needs.
  curl -s "$GASFILE" |
    hxnormalize -x |
    hxselect -c '.data-td-en.bdr-bt'
}
# Print the billing month of the gas bill.
# Globals: GASFILE (read) - URL of the saved gas bill page, fetched with curl.
# Outputs: the first 8 characters of the text found between the
#          "Billing Month" label cell and the end of its table row, on stdout.
g_billingMonth() {
  # The regex match can contain more than just the month and year, so the
  # result is cut down to its first 8 characters.
  curl -s "$GASFILE" |
    grep -o -P '(?<=Billing Month</td><td>).*(?=</td></tr><tr height=)' |
    cut -c 1-8
}
# Print the "payable within due date" line(s) of the electricity bill.
# Globals: electricityFILE (read) - path of the text version of the bill.
# Outputs: every line of the file that starts with "PAYABLE WITHIN", on stdout.
e_cb() {
  # The pattern is single-quoted on purpose: inside double quotes a "$*"
  # sequence expands to the positional parameters and corrupts the regex.
  grep -e '^PAYABLE WITHIN' -- "$electricityFILE"
}
# Print the "payable after due date" line(s) of the electricity bill.
# Globals: electricityFILE (read) - path of the text version of the bill.
# Outputs: every line of the file that starts with "PAYABLE AFTER", on stdout.
e_afterDueDate() {
  # The pattern is single-quoted on purpose: inside double quotes a "$*"
  # sequence expands to the positional parameters and corrupts the regex.
  grep -e '^PAYABLE AFTER' -- "$electricityFILE"
}
# Print the last payment date of the electricity bill.
# Globals: electricityFILE (read) - path of the text version of the bill.
# Outputs: the label "Last Date :" on one line, then the last three
#          whitespace-separated words of the second line that contains a
#          digit (of the first such line if there is only one). Nothing is
#          printed after the label when that line has fewer than three words.
# NOTE(review): that those words are the due date depends on the layout
# html2text produces for the bill - confirm against a real bill.
e_lastDate() {
  echo 'Last Date :'
  # grep -m 2 keeps only the first two digit-bearing lines; awk remembers the
  # last line it saw and splits it itself, so the result does not depend on
  # the fields still being available inside END.
  grep -m 2 -e '[0-9]' -- "$electricityFILE" |
    awk '{ last = $0 }
         END {
           n = split(last, word)
           if (n >= 3) print word[n - 2], word[n - 1], word[n]
         }'
}
# Print the billing month of the electricity bill.
# Globals: electricityFILE (read) - path of the text version of the bill.
# Outputs: the label "Billing Month :" on one line, then the 11th-from-last
#          and 10th-from-last whitespace-separated words of the second line
#          that contains a digit (of the first such line if there is only
#          one). Nothing is printed after the label when that line has fewer
#          than eleven words.
# NOTE(review): that those words are the billing month depends on the layout
# html2text produces for the bill - confirm against a real bill.
e_bmonth() {
  echo 'Billing Month :'
  # grep -m 2 keeps only the first two digit-bearing lines; awk remembers the
  # last line it saw and splits it itself, so the result does not depend on
  # the fields still being available inside END.
  grep -m 2 -e '[0-9]' -- "$electricityFILE" |
    awk '{ last = $0 }
         END {
           n = split(last, word)
           if (n >= 11) print word[n - 10], word[n - 9]
         }'
}
# Print the consumer name from the electricity bill.
# Globals: electricityFILE (read) - path of the text version of the bill.
# Outputs: the label "NAME :" on one line, then the first two words of the
#          24th line that contains an upper-case letter (of the last such
#          line if there are fewer than 24).
# NOTE(review): that this line carries the consumer name depends on the
# layout html2text produces for the bill - confirm against a real bill.
e_name() {
  echo 'NAME :'
  # The bracket expression is quoted so the shell cannot expand it as a glob
  # (it would turn into a file name if e.g. a file called "A" existed in the
  # current directory).
  grep -e '[A-Z]' -- "$electricityFILE" |
    head -n 24 |
    tail -n 1 |
    awk '{ print $1, $2 }'
}
# Print the reference number line of the electricity bill.
# Globals: electricityFILE (read) - path of the text version of the bill.
# Outputs: the label "Reference No." on one line, then the whole 14th line
#          that contains an upper-case letter (the last such line if there
#          are fewer than 14).
# NOTE(review): that this line carries the reference number depends on the
# layout html2text produces for the bill - confirm against a real bill.
e_ref() {
  echo 'Reference No.'
  # The bracket expression is quoted so the shell cannot expand it as a glob
  # (it would turn into a file name if e.g. a file called "A" existed in the
  # current directory).
  grep -e '[A-Z]' -- "$electricityFILE" |
    head -n 14 |
    tail -n 1
}
# Report: print both bills on a cleared screen, then remove the downloads.
clear

# The command substitutions below are left unquoted on purpose: several
# helper functions print a label and a value on separate lines, and word
# splitting joins them onto a single output line. Globbing is switched off
# for the duration so that a '*', '?' or '[' in the scraped text is printed
# literally instead of being expanded to file names.
set -f
# shellcheck disable=SC2046
echo --------------------------- ELECTRICITY BILL ------------------
echo $(e_name)
echo $(e_ref)
echo $(e_bmonth)
echo $(e_cb)
echo $(e_afterDueDate)
echo $(e_lastDate)
echo ---------------------------------------------------------------
echo
echo
echo
echo
echo
echo --------------------------- GAS BILL --------------------------
echo $(name)
echo ACCOUNT NUMBER : $(g_accountNumber)
echo Billing Month : $(g_billingMonth)
echo CURRENT BILL : Rs. $(g_currentBill) /-
echo ARREARS : Rs. $(g_dues) /-
echo LATE BILL : Rs. $(g_billAfterDueDate) /-
echo DUE DATE : $(g_dueDate)
echo ----------------------------------------------------------------
set +f

# Delete the downloaded and converted files now that the report is printed.
# -f keeps rm quiet if a download never produced its file.
rm -f -- /tmp/gas.html /tmp/e.txt /tmp/e.html
# NOTE(review): presumably here to keep the terminal window open so the
# report stays readable - confirm before changing.
sleep 50000
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment