"""Extract received-money transactions from an archived Easypaisa SMS dump.

The archive is a plain-text file in which individual messages are separated
by one or more blank lines.  Messages that mention "Received", contain a
number followed by a period, and are not cashback notices are parsed into
``date`` / ``tid`` / ``amount`` / ``sender`` fields and collected into a
pandas DataFrame for later analysis.
"""
import re

import pandas as pd

# Path of Easypaisa archived message text file
easypaisa_file = './easypaisa.txt'

# Placeholder stored whenever a field cannot be found in a message.
NULL = "NULL"

# Column order of the resulting DataFrame (also used when there are no rows).
TRANSACTION_COLUMNS = ["date", "tid", "amount", "sender"]

# All patterns are compiled once here instead of once per message.
_MESSAGE_SPLIT_R = re.compile(r"(?:\r?\n){2,}")
_NUMBER_DOT_R = re.compile(r'\d+\.')
_NON_AMOUNT_CHARS_R = re.compile(r'[^\d.]')
_NON_DIGIT_R = re.compile(r'[^\d]')

# NOTE(review): re.X makes the regex engine ignore the unescaped spaces in
# this pattern, so the second alternative only matches "TrxID<digits>."
# written without a space.  Kept verbatim to preserve behaviour; confirm
# whether falling back to the Trx ID as a "date" was ever intended.
_DATE_R = re.compile(r'\[.*\] | Trx ID \d+\.', flags=re.I | re.X)
# Spaces are backslash-escaped here, so they stay significant under re.X.
_TRX_R = re.compile(r'Trx\ ID\ \d+\.', flags=re.I | re.X)
# Non-greedy ".*?" stops at the FIRST " from" after the amount; the greedy
# form ran to the last " from" in the message and polluted the amount with
# digits from the sender or from later text.
_AMOUNT_R = re.compile(r"Received Rs.?\ \d.*?\ from")
# Left greedy on purpose: it spans every space-separated digit group after
# "from", which keeps numbers that are written with internal spaces intact.
_SENDER_R = re.compile(r"from \w.*\ \d+\ ")


def extract_transaction_messages(data):
    """Return the messages in *data* that describe received money.

    *data* is the full text of the archive.  It is split on runs of two or
    more line breaks; a message is kept when it contains digits followed by
    a period, contains "Received", and does not contain "cashback".
    """
    return [
        sms
        for sms in _MESSAGE_SPLIT_R.split(data.strip())
        if _NUMBER_DOT_R.search(sms)
        and 'cashback' not in sms
        and "Received" in sms
    ]


# Pre-Processing
def clean_date(date):
    """Lower-case *date* and drop the surrounding square brackets.

    Returns ``"NULL"`` when *date* is the missing-value placeholder or when
    nothing is left after cleaning.
    """
    if date == NULL:
        # Do not lower-case the placeholder into "null".
        return NULL
    return date.lower().replace('[', '').replace(']', '').strip() or NULL


def clean_trx(trx):
    """Strip the "Trx ID" label and periods from *trx*, leaving the id.

    Returns ``"NULL"`` when *trx* is the missing-value placeholder or when
    nothing is left after cleaning.
    """
    if trx == NULL:
        # Do not lower-case the placeholder into "null".
        return NULL
    return trx.lower().replace('trx id', '').replace('.', '').strip() or NULL


def clean_amount(amount):
    """Reduce *amount* to its digits and decimal points.

    Returns ``"NULL"`` when no digits or points remain.
    """
    cleaned = _NON_AMOUNT_CHARS_R.sub('', amount)
    # The matched text starts at "Received Rs.", so the period of the "Rs."
    # abbreviation survives the filter as a single leading dot.
    if cleaned.startswith('.'):
        cleaned = cleaned[1:]
    return cleaned or NULL


def clean_sender(sender):
    """Reduce *sender* to its digits; ``"NULL"`` when there are none."""
    return _NON_DIGIT_R.sub('', sender) or NULL


# Check if found else return NULL
def function_extract(reg):
    """Return the matched text of *reg*, or ``"NULL"`` when *reg* is falsy."""
    return reg.group() if reg else NULL


def parse_transaction(sms):
    """Parse one message into a ``date``/``tid``/``amount``/``sender`` dict.

    Every value is a string; fields that cannot be found are ``"NULL"``.
    """
    return {
        "date": clean_date(function_extract(_DATE_R.search(sms))),
        "tid": clean_trx(function_extract(_TRX_R.search(sms))),
        "amount": clean_amount(function_extract(_AMOUNT_R.search(sms))),
        "sender": clean_sender(function_extract(_SENDER_R.search(sms))),
    }


def transactions_to_frame(transactions):
    """Build the DataFrame from a list of parsed transaction dicts.

    The columns are fixed, so an empty list still yields a frame with the
    expected ``date``/``tid``/``amount``/``sender`` columns.
    """
    return pd.DataFrame(transactions, columns=TRANSACTION_COLUMNS)


if __name__ == "__main__":
    # NOTE(review): the archive is read with the platform default encoding;
    # pass an explicit ``encoding=`` here once the file's encoding is known.
    with open(easypaisa_file) as f:
        data = f.read()

    transaction_message = extract_transaction_messages(data)

    # Iterate through the text messages and extract the fields of each one,
    # storing them as a list of dicts for later use in the DataFrame.
    transactions = [parse_transaction(sms) for sms in transaction_message]

    # Convert to DataFrame
    df = transactions_to_frame(transactions)

    # Perform Analysis on DF