single-page-report-aws
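A Google Cloud Function that builds a single-page AWS cost report: when an AWS invoice PDF is uploaded to a Cloud Storage bucket, it extracts the per-service line items with pdfplumber, draws a bill-breakup pie chart plus S3/EC2/RDS AWS-vs-GCP cost-comparison bar charts with matplotlib, writes them all to one PDF via PdfPages, and uploads the report to the bucket named by the AWS_REPORT_BUCKET environment variable.
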
import pandas as pd
import numpy as np
import matplotlib
matplotlib.use('Agg')  # force the headless backend; the Cloud Functions runtime has no display
import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages
import os, time
from google.cloud import storage
import re
import pdfplumber
from collections import namedtuple
import functions_framework
# Module-level scratch state shared across a single invocation.
page_lines = []   # per-page word lists extracted by pdfplumber
final_lines = []  # parsed service-level line items
Row = namedtuple('Row', ['service_name', 'region', 'description', 'price', 'page_number'])
dfs = {}          # page index -> DataFrame of extracted words
def extract_line_items(pdf_file_path, csv_file_path, base_file_name):
    """Parse the AWS invoice PDF, draw the charts and return the local path of the report."""
    temp_output_path = '/tmp/awsvsgcp.pdf'
    temp_dir_path = '/tmp/'
    print('Temp output pdf report is saved at {}'.format(temp_output_path))
    report = PdfPages(temp_output_path)
    # Extract every word (with position, font and page metadata) from the invoice.
    with pdfplumber.open(pdf_file_path) as pdf:
        for page in pdf.pages:
            words_extracted = page.extract_words(extra_attrs=['fontname', 'size', 'page_number'])
            page_lines.append(words_extracted)
    for idx, lines in enumerate(page_lines):
        dfs[idx] = pd.DataFrame(lines)
    df = pd.concat(dfs.values())
    # Words that share a page and a baseline ('bottom') form one printed line.
    df2 = df.groupby(['page_number', 'bottom'], as_index=False).agg(
        {'x0': 'first', 'text': lambda x: ' '.join(x), 'fontname': 'first', 'size': 'first'})
    # The row two lines below the 'Details' heading fixes the left margin (x0)
    # at which service-level entries start.
    service_margin = ''
    service_margin_index = 0
    for idx, row in df2.iterrows():
        if 'Details' in row['text']:
            service_margin = df2['x0'][idx + 2]
            service_margin_index = idx + 2
    print('Service Margin:{},{}'.format(service_margin, service_margin_index))
    # Every line flush with that margin is a service line item of the form
    # '<service name> ... $<amount>'.
    for idx, row in df2.loc[service_margin_index:].iterrows():
        page_number = row['page_number']
        if row['x0'] == service_margin:
            service_name = row['text'].split('$')[0].strip()
            region_name = ''
            description = ''
            price = float(row['text'].replace(',', '').split('$')[-1])
            final_lines.append(Row(service_name=service_name, region=region_name,
                                   description=description, price=price, page_number=page_number))
    df3 = pd.DataFrame(final_lines)
    df3.drop_duplicates(subset=['service_name'], keep='first', inplace=True)
    df3.reset_index(inplace=True, drop=True)
    df3.sort_values('price', ascending=False, inplace=True)
    # Set up one column of four axes: the pie chart plus three comparison charts.
    fig, ax = plt.subplots(nrows=4, ncols=1, figsize=(40, 40))
    # Create pie chart of the bill breakup per service
    price_np_array = df3['price'].to_numpy()
    service_np_array = df3['service_name'].to_numpy()
    title = ax[0].set_title('AWS Bill Breakup', fontsize=22, fontweight='bold')
    title.set_ha("center")
    percents = df3['price'] * 100 / price_np_array.sum()
    labels = ['%s, %1.1f %%' % (l, s) for l, s in zip(service_np_array, percents)]
    patches, texts = ax[0].pie(price_np_array, startangle=0, radius=1.2)
    ax[0].legend(patches, labels, title="Services", loc="upper right",
                 fontsize=13, bbox_to_anchor=(0, 0.8), ncol=3)
    # Create service level breakup comparison charts between AWS & GCP
    df = pd.read_csv(csv_file_path)
    df_s3 = df.query('ProductCode=="AmazonS3"')
    df_ec2 = df.query('ProductCode=="AmazonEC2"')
    df_rds = df.query('ProductCode=="AmazonRDS"')
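    # The queries above assume a mapping CSV with at least the columns used
    # below: 'ProductCode', 'ProductType', 'Source Cost' (the AWS cost in USD)
    # and 'GCP Cost' (the equivalent GCP cost in USD).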
    if not df_s3.empty:
        # Grouped bar chart: S3 cost (AWS) vs GCS cost (GCP) per product type
        df_s3_aws_cost = df_s3['Source Cost']
        df_s3_product_type = df_s3['ProductType']
        df_s3_gcp_cost = df_s3['GCP Cost']
        N_S3 = len(df_s3_product_type)
        ind_s3 = np.arange(N_S3)
        width = 0.3
        ax[1].bar(ind_s3, df_s3_aws_cost, width, label='AWS', color='orange')
        ax[1].bar(ind_s3 + width, df_s3_gcp_cost, width, label='GCP', color='royalblue')
        ax[1].set_xlabel('Product Type')
        ax[1].set_ylabel('Price (in USD)')
        ax[1].set_title('AWS S3 vs GCP GCS Cost Comparison', fontsize=22, fontweight='bold')
        ax[1].set_xticks(ind_s3 + width / 2, df_s3_product_type)
        ax[1].legend(loc='best')
    if not df_ec2.empty:
        # Grouped bar chart: EC2 cost (AWS) vs GCE cost (GCP) per instance type
        df_ec2_aws_cost = df_ec2['Source Cost']
        df_ec2_product_type = df_ec2['ProductType'].str.split(':').str[1]
        df_gce_gcp_cost = df_ec2['GCP Cost']
        N_EC2 = len(df_ec2_product_type)
        ind_ec2 = np.arange(N_EC2)
        width = 0.3
        ax[2].bar(ind_ec2, df_ec2_aws_cost, width, label='AWS', color='orange')
        ax[2].bar(ind_ec2 + width, df_gce_gcp_cost, width, label='GCP', color='royalblue')
        ax[2].set_xlabel('Product Type')
        ax[2].set_ylabel('Price (in USD)')
        ax[2].set_title('AWS EC2 vs GCP GCE Cost Comparison', fontsize=22, fontweight='bold')
        ax[2].set_xticks(ind_ec2 + width / 2, df_ec2_product_type)
        ax[2].legend(loc='best')
    if not df_rds.empty:
        # Grouped bar chart: RDS cost (AWS) vs Cloud SQL cost (GCP) per instance type
        df_rds_aws_cost = df_rds['Source Cost']
        df_rds_product_type = df_rds['ProductType'].str.split(':').str[1]
        df_csql_gcp_cost = df_rds['GCP Cost']
        N_RDS = len(df_rds_product_type)
        ind_rds = np.arange(N_RDS)
        width = 0.3
        ax[3].bar(ind_rds, df_rds_aws_cost, width, label='AWS', color='orange')
        ax[3].bar(ind_rds + width, df_csql_gcp_cost, width, label='GCP', color='royalblue')
        ax[3].set_xlabel('Product Type')
        ax[3].set_ylabel('Price (in USD)')
        ax[3].set_title('AWS RDS vs GCP Cloud SQL Cost Comparison', fontsize=22, fontweight='bold')
        ax[3].set_xticks(ind_rds + width / 2, df_rds_product_type)
        ax[3].legend(loc='best')
    # Write all four charts to the single-page report and block until the
    # file is visible on disk.
    report.savefig(fig)
    report.close()
    print('Listing files in dir : {}'.format(os.listdir(temp_dir_path)))
    while not os.path.exists(temp_output_path):
        print('Checking if file exists')
        time.sleep(5)
    if os.path.isfile(temp_output_path):
        print('File exists!')
        return temp_output_path
    return ""

@functions_framework.cloud_event
def extract(cloud_event):
    """Entry point: triggered when an invoice PDF is finalized in the upload bucket."""
    data = cloud_event.data
    event_id = cloud_event["id"]
    event_type = cloud_event["type"]
    bucket = data["bucket"]
    name = data["name"]
    metageneration = data["metageneration"]
    timeCreated = data["timeCreated"]
    updated = data["updated"]
    print(f"Event Data: {data}")
    print(f"Event ID: {event_id}")
    print(f"Event type: {event_type}")
    print(f"Bucket: {bucket}")
    print(f"File: {name}")
    print(f"Metageneration: {metageneration}")
    print(f"Created: {timeCreated}")
    print(f"Updated: {updated}")
    object_path = 'gs://' + bucket + '/' + name
    # Uploaded objects are expected to be named user=<user>/t=<timestamp>/<file>.pdf
    m = re.findall(r"(user=.*)\/(t=.*)\/(.*)$", name)
    user_part = m[0][0]
    time_part = m[0][1]
    file_name = m[0][2]
    file_prefix = user_part + "/" + time_part
    base_file_name = file_name.split('.')[0]
    username = user_part.split('=')[1]
    ui_timestamp = time_part.split('=')[1]
    print(f"Adding new request for processing {object_path}.")
    pdf_file_path = "/tmp/" + file_name
    download_blob_to_file(bucket, name, pdf_file_path)
    # csv_file_path is not provided by the event; the path below is an assumed
    # placeholder for the AWS-to-GCP cost mapping CSV that extract_line_items reads.
    csv_file_path = "/tmp/" + base_file_name + ".csv"
    local_report_location = extract_line_items(pdf_file_path=pdf_file_path,
                                               csv_file_path=csv_file_path,
                                               base_file_name=base_file_name)
    if local_report_location:
        gcs_uri = upload_blob_from_filename(file_prefix, local_report_location)

def download_blob_to_file(bucket_name, source_blob_name, file_path):
    """Downloads a blob from the bucket to a local file."""
    storage_client = storage.Client()
    bucket = storage_client.bucket(bucket_name)
    blob = bucket.blob(source_blob_name)
    blob.download_to_filename(file_path)

def upload_blob_from_filename(file_prefix, f):
    """Uploads the generated report and returns its gs:// URI."""
    # AWS_REPORT_BUCKET must name the bucket that receives the finished reports.
    report_output_bucket = os.environ['AWS_REPORT_BUCKET']
    file_name = file_prefix + "/" + f.split('/')[-1]
    print('File name inside upload blob is {}'.format(file_name))
    print('File inside upload blob is {}'.format(f))
    storage_client = storage.Client()
    bucket = storage_client.bucket(report_output_bucket)
    blob = bucket.blob(file_name)
    print('Before upload begins, listing files in dir : {}'.format(os.listdir('/tmp/')))
    blob.upload_from_filename(f)
    print("Successfully done!")
    return "gs://" + report_output_bucket + "/" + file_name
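
# ---------------------------------------------------------------------------
# Minimal local smoke test (a sketch, not part of the deployed function):
# assumes the cloudevents package is installed, AWS_REPORT_BUCKET is set, and
# the bucket/object names below, which are placeholders, point at a real
# invoice PDF readable with your local credentials.
if __name__ == "__main__":
    from cloudevents.http import CloudEvent

    attributes = {
        "type": "google.cloud.storage.object.v1.finalized",
        "source": "//storage.googleapis.com/projects/_/buckets/my-invoice-bucket",
    }
    data = {
        "bucket": "my-invoice-bucket",                   # placeholder bucket
        "name": "user=alice/t=1665651840/invoice.pdf",   # placeholder object
        "metageneration": "1",
        "timeCreated": "2022-10-13T09:04:00Z",
        "updated": "2022-10-13T09:04:00Z",
    }
    extract(CloudEvent(attributes, data))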