single-page-report-aws
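A Google Cloud Function that builds a single-page AWS cost report: when an AWS invoice PDF is uploaded to a Cloud Storage bucket, it extracts the per-service line items with pdfplumber, draws a bill-breakup pie chart plus S3/EC2/RDS AWS-vs-GCP cost-comparison bar charts with matplotlib, writes them all to one PDF via PdfPages, and uploads the report to the bucket named by the AWS_REPORT_BUCKET environment variable.
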
import pandas as pd
import numpy as np
import matplotlib
matplotlib.use('Agg')  # force the headless backend; the Cloud Functions runtime has no display
import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages
import os, time
from google.cloud import storage
import re
import pdfplumber
from collections import namedtuple
import functions_framework
# Module-level scratch state shared across a single invocation.
page_lines = []   # per-page word lists extracted by pdfplumber
final_lines = []  # parsed service-level line items
Row = namedtuple('Row', ['service_name', 'region', 'description', 'price', 'page_number'])
dfs = {}          # page index -> DataFrame of extracted words
def extract_line_items(pdf_file_path, csv_file_path, base_file_name):
    """Parse the AWS invoice PDF, draw the charts and return the local path of the report."""
    temp_output_path = '/tmp/awsvsgcp.pdf'
    temp_dir_path = '/tmp/'
    print('Temp output pdf report is saved at {}'.format(temp_output_path))
    report = PdfPages(temp_output_path)
    # Extract every word (with position, font and page metadata) from the invoice.
    with pdfplumber.open(pdf_file_path) as pdf:
        for page in pdf.pages:
            words_extracted = page.extract_words(extra_attrs=['fontname', 'size', 'page_number'])
            page_lines.append(words_extracted)
    for idx, lines in enumerate(page_lines):
        dfs[idx] = pd.DataFrame(lines)
    df = pd.concat(dfs.values())
    # Words that share a page and a baseline ('bottom') form one printed line.
    df2 = df.groupby(['page_number', 'bottom'], as_index=False).agg(
        {'x0': 'first', 'text': lambda x: ' '.join(x), 'fontname': 'first', 'size': 'first'})
    # The row two lines below the 'Details' heading fixes the left margin (x0)
    # at which service-level entries start.
    service_margin = ''
    service_margin_index = 0
    for idx, row in df2.iterrows():
        if 'Details' in row['text']:
            service_margin = df2['x0'][idx + 2]
            service_margin_index = idx + 2
    print('Service Margin:{},{}'.format(service_margin, service_margin_index))
    # Every line flush with that margin is a service line item of the form
    # '<service name> ... $<amount>'.
    for idx, row in df2.loc[service_margin_index:].iterrows():
        page_number = row['page_number']
        if row['x0'] == service_margin:
            service_name = row['text'].split('$')[0].strip()
            region_name = ''
            description = ''
            price = float(row['text'].replace(',', '').split('$')[-1])
            final_lines.append(Row(service_name=service_name, region=region_name,
                                   description=description, price=price, page_number=page_number))
    df3 = pd.DataFrame(final_lines)
    df3.drop_duplicates(subset=['service_name'], keep='first', inplace=True)
    df3.reset_index(inplace=True, drop=True)
    df3.sort_values('price', ascending=False, inplace=True)
    # Set up one column of four axes: the pie chart plus three comparison charts.
    fig, ax = plt.subplots(nrows=4, ncols=1, figsize=(40, 40))
    # Create pie chart of the bill breakup per service
    price_np_array = df3['price'].to_numpy()
    service_np_array = df3['service_name'].to_numpy()
    title = ax[0].set_title('AWS Bill Breakup', fontsize=22, fontweight='bold')
    title.set_ha("center")
    percents = df3['price'] * 100 / price_np_array.sum()
    labels = ['%s, %1.1f %%' % (l, s) for l, s in zip(service_np_array, percents)]
    patches, texts = ax[0].pie(price_np_array, startangle=0, radius=1.2)
    ax[0].legend(patches, labels, title="Services", loc="upper right",
                 fontsize=13, bbox_to_anchor=(0, 0.8), ncol=3)
    # Create service level breakup comparison charts between AWS & GCP
    df = pd.read_csv(csv_file_path)
    df_s3 = df.query('ProductCode=="AmazonS3"')
    df_ec2 = df.query('ProductCode=="AmazonEC2"')
    df_rds = df.query('ProductCode=="AmazonRDS"')
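    # The queries above assume a mapping CSV with at least the columns used
    # below: 'ProductCode', 'ProductType', 'Source Cost' (the AWS cost in USD)
    # and 'GCP Cost' (the equivalent GCP cost in USD).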
    if not df_s3.empty:
        # Grouped bar chart: S3 cost (AWS) vs GCS cost (GCP) per product type
        df_s3_aws_cost = df_s3['Source Cost']
        df_s3_product_type = df_s3['ProductType']
        df_s3_gcp_cost = df_s3['GCP Cost']
        N_S3 = len(df_s3_product_type)
        ind_s3 = np.arange(N_S3)
        width = 0.3
        ax[1].bar(ind_s3, df_s3_aws_cost, width, label='AWS', color='orange')
        ax[1].bar(ind_s3 + width, df_s3_gcp_cost, width, label='GCP', color='royalblue')
        ax[1].set_xlabel('Product Type')
        ax[1].set_ylabel('Price (in USD)')
        ax[1].set_title('AWS S3 vs GCP GCS Cost Comparison', fontsize=22, fontweight='bold')
        ax[1].set_xticks(ind_s3 + width / 2, df_s3_product_type)
        ax[1].legend(loc='best')
    if not df_ec2.empty:
        # Grouped bar chart: EC2 cost (AWS) vs GCE cost (GCP) per instance type
        df_ec2_aws_cost = df_ec2['Source Cost']
        df_ec2_product_type = df_ec2['ProductType'].str.split(':').str[1]
        df_gce_gcp_cost = df_ec2['GCP Cost']
        N_EC2 = len(df_ec2_product_type)
        ind_ec2 = np.arange(N_EC2)
        width = 0.3
        ax[2].bar(ind_ec2, df_ec2_aws_cost, width, label='AWS', color='orange')
        ax[2].bar(ind_ec2 + width, df_gce_gcp_cost, width, label='GCP', color='royalblue')
        ax[2].set_xlabel('Product Type')
        ax[2].set_ylabel('Price (in USD)')
        ax[2].set_title('AWS EC2 vs GCP GCE Cost Comparison', fontsize=22, fontweight='bold')
        ax[2].set_xticks(ind_ec2 + width / 2, df_ec2_product_type)
        ax[2].legend(loc='best')
    if not df_rds.empty:
        # Grouped bar chart: RDS cost (AWS) vs Cloud SQL cost (GCP) per instance type
        df_rds_aws_cost = df_rds['Source Cost']
        df_rds_product_type = df_rds['ProductType'].str.split(':').str[1]
        df_csql_gcp_cost = df_rds['GCP Cost']
        N_RDS = len(df_rds_product_type)
        ind_rds = np.arange(N_RDS)
        width = 0.3
        ax[3].bar(ind_rds, df_rds_aws_cost, width, label='AWS', color='orange')
        ax[3].bar(ind_rds + width, df_csql_gcp_cost, width, label='GCP', color='royalblue')
        ax[3].set_xlabel('Product Type')
        ax[3].set_ylabel('Price (in USD)')
        ax[3].set_title('AWS RDS vs GCP Cloud SQL Cost Comparison', fontsize=22, fontweight='bold')
        ax[3].set_xticks(ind_rds + width / 2, df_rds_product_type)
        ax[3].legend(loc='best')
    # Write all four charts to the single-page report and block until the
    # file is visible on disk.
    report.savefig(fig)
    report.close()
    print('Listing files in dir : {}'.format(os.listdir(temp_dir_path)))
    while not os.path.exists(temp_output_path):
        print('Checking if file exists')
        time.sleep(5)
    if os.path.isfile(temp_output_path):
        print('File exists!')
        return temp_output_path
    return ""

@functions_framework.cloud_event
def extract(cloud_event):
    """Entry point: triggered when an invoice PDF is finalized in the upload bucket."""
    data = cloud_event.data
    event_id = cloud_event["id"]
    event_type = cloud_event["type"]
    bucket = data["bucket"]
    name = data["name"]
    metageneration = data["metageneration"]
    timeCreated = data["timeCreated"]
    updated = data["updated"]
    print(f"Event Data: {data}")
    print(f"Event ID: {event_id}")
    print(f"Event type: {event_type}")
    print(f"Bucket: {bucket}")
    print(f"File: {name}")
    print(f"Metageneration: {metageneration}")
    print(f"Created: {timeCreated}")
    print(f"Updated: {updated}")
    object_path = 'gs://' + bucket + '/' + name
    # Uploaded objects are expected to be named user=<user>/t=<timestamp>/<file>.pdf
    m = re.findall(r"(user=.*)\/(t=.*)\/(.*)$", name)
    user_part = m[0][0]
    time_part = m[0][1]
    file_name = m[0][2]
    file_prefix = user_part + "/" + time_part
    base_file_name = file_name.split('.')[0]
    username = user_part.split('=')[1]
    ui_timestamp = time_part.split('=')[1]
    print(f"Adding new request for processing {object_path}.")
    pdf_file_path = "/tmp/" + file_name
    download_blob_to_file(bucket, name, pdf_file_path)
    # csv_file_path is not provided by the event; the path below is an assumed
    # placeholder for the AWS-to-GCP cost mapping CSV that extract_line_items reads.
    csv_file_path = "/tmp/" + base_file_name + ".csv"
    local_report_location = extract_line_items(pdf_file_path=pdf_file_path,
                                               csv_file_path=csv_file_path,
                                               base_file_name=base_file_name)
    if local_report_location:
        gcs_uri = upload_blob_from_filename(file_prefix, local_report_location)

def download_blob_to_file(bucket_name, source_blob_name, file_path):
    """Downloads a blob from the bucket to a local file."""
    storage_client = storage.Client()
    bucket = storage_client.bucket(bucket_name)
    blob = bucket.blob(source_blob_name)
    blob.download_to_filename(file_path)

def upload_blob_from_filename(file_prefix, f):
    """Uploads the generated report and returns its gs:// URI."""
    # AWS_REPORT_BUCKET must name the bucket that receives the finished reports.
    report_output_bucket = os.environ['AWS_REPORT_BUCKET']
    file_name = file_prefix + "/" + f.split('/')[-1]
    print('File name inside upload blob is {}'.format(file_name))
    print('File inside upload blob is {}'.format(f))
    storage_client = storage.Client()
    bucket = storage_client.bucket(report_output_bucket)
    blob = bucket.blob(file_name)
    print('Before upload begins, listing files in dir : {}'.format(os.listdir('/tmp/')))
    blob.upload_from_filename(f)
    print("Successfully done!")
    return "gs://" + report_output_bucket + "/" + file_name
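
# ---------------------------------------------------------------------------
# Minimal local smoke test (a sketch, not part of the deployed function):
# assumes the cloudevents package is installed, AWS_REPORT_BUCKET is set, and
# the bucket/object names below, which are placeholders, point at a real
# invoice PDF readable with your local credentials.
if __name__ == "__main__":
    from cloudevents.http import CloudEvent

    attributes = {
        "type": "google.cloud.storage.object.v1.finalized",
        "source": "//storage.googleapis.com/projects/_/buckets/my-invoice-bucket",
    }
    data = {
        "bucket": "my-invoice-bucket",                   # placeholder bucket
        "name": "user=alice/t=1665651840/invoice.pdf",   # placeholder object
        "metageneration": "1",
        "timeCreated": "2022-10-13T09:04:00Z",
        "updated": "2022-10-13T09:04:00Z",
    }
    extract(CloudEvent(attributes, data))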