|
import re |
|
import sys |
|
from datetime import datetime |
|
from bs4 import BeautifulSoup |
|
from openpyxl import load_workbook |
|
from openpyxl.styles.numbers import FORMAT_DATE_DDMMYY, FORMAT_NUMBER_00 |
|
|
|
class Transaction: |
|
def __init__(self, fund_name, fund_type, isin, folio, txn_data): |
|
self.fund_name = fund_name |
|
self.fund_type = fund_type |
|
|
|
# We only need ISIN for equity funds |
|
self.isin = isin if fund_type == 'MF (Equity)' else None |
|
|
|
self.folio = folio |
|
|
|
serial_no, units, purchase_date, purchase_value, \ |
|
acquisition_value, jan31_value, redemption_date, \ |
|
redemption_value, stcg, ltcg \ |
|
= txn_data |
|
|
|
self.serial_no = int(serial_no) |
|
self.units = float(units) |
|
self.purchase_date = datetime.strptime(purchase_date, '%b %d, %Y').strftime('%d/%m/%Y') |
|
self.purchase_value = float(purchase_value) |
|
self.acquisition_value = float(acquisition_value) |
|
|
|
try: |
|
self.jan31_value = float(jan31_value) |
|
except: |
|
self.jan31_value = None |
|
|
|
self.redemption_date = datetime.strptime(redemption_date, '%b %d, %Y').strftime('%d/%m/%Y') |
|
self.redemption_value = float(redemption_value) |
|
|
|
try: |
|
self.stcg = float(stcg.replace(',', '')) |
|
except: |
|
self.stcg = None |
|
|
|
try: |
|
self.ltcg = float(ltcg.replace(',', '')) |
|
except: |
|
self.ltcg = None |
|
|
|
def get_data(self): |
|
return ( |
|
self.fund_type, |
|
self.isin, |
|
self.fund_name, |
|
self.units, |
|
self.purchase_date, |
|
self.purchase_value, |
|
self.redemption_date, |
|
self.redemption_value / self.units if self.redemption_value else None, |
|
self.jan31_value / self.units if self.jan31_value else None, |
|
0.0 # Set "Transfer expenses (stamp duty, brokerage, etc.) column to 0 |
|
) |
|
|
|
# Reads all transactions from the capital gains report from Kuvera and returns the parsed data |
|
# in a tuple of (transactions, total_stcg, total_ltcg) |
|
def read_transactions(capital_gains_xls_file): |
|
fund_name_pattern = re.compile('(.*)\[ISIN') |
|
isin_pattern = re.compile('\[ISIN: (.*)\]') |
|
folio_pattern = re.compile('Folio No: (.*)') |
|
|
|
with open(capital_gains_xls_file) as f: |
|
data = f.read() |
|
soup = BeautifulSoup(data, 'html.parser') |
|
|
|
# The second table contains capital gains data |
|
capital_gains_table = soup.select('table')[1] |
|
capital_gains_tbody = capital_gains_table.select('tbody')[0] |
|
|
|
current_fund_name = None |
|
current_fund_type = None |
|
current_isin = None |
|
current_folio = None |
|
|
|
equity_subtotal = None |
|
debt_subtotal = None |
|
total_stcg = None |
|
total_ltcg = None |
|
all_transactions = [] |
|
|
|
fund_types = { |
|
'Equity': 'MF (Equity)', |
|
'Others': 'MF (Equity)', # Index funds are marked "Others" in Kuvera's report, for some reason |
|
'Debt': 'MF (Other than Equity)' |
|
} |
|
|
|
for row in capital_gains_tbody.select('tr'): |
|
columns = row.select('td') |
|
if len(columns) == 1: |
|
# This row contains either the ISIN or the Folio number |
|
column_data = columns[0].string |
|
isin_match = isin_pattern.search(column_data) |
|
folio_match = folio_pattern.search(column_data) |
|
if isin_match: |
|
current_isin = isin_match.group(1) |
|
elif folio_match: |
|
current_folio = folio_match.group(1) |
|
|
|
# Extract fund name |
|
fund_name_match = fund_name_pattern.search(column_data) |
|
if fund_name_match: |
|
current_fund_name = fund_name_match.group(1) |
|
|
|
for fund_type in fund_types: |
|
if fund_type in column_data: |
|
current_fund_type = fund_types[fund_type] |
|
break |
|
elif len(columns) == 10: |
|
# This row contains a transaction |
|
txn_data = [col.string for col in columns] |
|
transaction = Transaction(current_fund_name, current_fund_type, current_isin, current_folio, txn_data) |
|
all_transactions.append(transaction) |
|
elif len(columns) == 8: |
|
# Summary of transactions for a fund, we can skip these rows |
|
pass |
|
elif len(columns) == 3: |
|
row_title = columns[0].string |
|
if row_title == 'Total': |
|
total_stcg = float(columns[1].string.split()[1].replace(',', '')) |
|
total_ltcg = float(columns[2].string.split()[1].replace(',', '')) |
|
|
|
return (all_transactions, total_stcg, total_ltcg) |
|
|
|
def write_capital_gains_report(all_transactions, output_xlsx_file, cleartax_template_xlsx_file): |
|
|
|
wb = load_workbook(cleartax_template_xlsx_file) |
|
|
|
# We only populate data in the Mutual Fund Sheet, which is the second one in the template |
|
mf_sheet = wb.worksheets[1] |
|
|
|
# Columns 9, 11 and 13 are computed from the other columns |
|
columns_to_update = [1, 2, 3, 4, 5, 6, 7, 8, 10, 12] |
|
column_types = [ |
|
'string', |
|
'string', |
|
'string', |
|
'float', |
|
'date', |
|
'float', |
|
'date', |
|
'float', |
|
'float', |
|
'string' |
|
] |
|
for i in range(len(all_transactions)): |
|
txn_data = all_transactions[i].get_data() |
|
for (val, j, col_type) in zip(txn_data, columns_to_update, column_types): |
|
cell = mf_sheet.cell(row=i+2, column=j) |
|
if val is not None: |
|
cell.value = val |
|
if col_type == 'float': |
|
cell.number_format = FORMAT_NUMBER_00 |
|
|
|
wb.save(output_xlsx_file) |
|
|
|
def prepare(capital_gains_xls_file, cleartax_template_xlsx_file, output_xlsx_file): |
|
|
|
all_transactions, total_stcg, total_ltcg = read_transactions(capital_gains_xls_file) |
|
|
|
stcg_sum = sum([txn.stcg for txn in all_transactions if txn.stcg is not None]) |
|
ltcg_sum = sum([txn.ltcg for txn in all_transactions if txn.ltcg is not None]) |
|
print('Sum of all STCG across all transactions:', stcg_sum) |
|
print('Total STCG from report:', total_stcg) |
|
print('Sum of all LTCG across all transactions:', ltcg_sum) |
|
print('Total LTCG from report:', total_ltcg) |
|
|
|
write_capital_gains_report(all_transactions, output_xlsx_file, cleartax_template_xlsx_file) |
|
|
|
if __name__ == '__main__': |
|
prepare(sys.argv[1], sys.argv[2], sys.argv[3]) |
The first row value with fund name and ISIN does not have a closing
</tr>
It is commented out in the soup parsed output. And so the script is unable to pick up the value within this first td
Everything else gets picked up correctly. Do you know how to solve that? @foulegg