Abhijeet Talaulikar abhijeet-talaulikar

Data Scientist @ Best Buy

abhijeet-talaulikar / causalml-create-credit-card-data.py

Created July 12, 2023 01:11

	import numpy as np
	import pandas as pd
	import csv
	import random
	import string

	### Create full dataset of 5 million credit card customers ###

	def generate_cust_id():
	    """Return a random 7-character customer ID.

	    Each character is drawn independently, with replacement, from the
	    uppercase ASCII letters and the digits 0-9. Nothing here checks for
	    uniqueness, so callers that need distinct IDs must de-duplicate.
	    """
	    alphabet = string.ascii_uppercase + string.digits
	    return ''.join(random.choices(alphabet, k=7))

abhijeet-talaulikar / clustering-fin-news-dataclean.py

Created July 28, 2023 21:29

	import pandas as pd
	import numpy as np
	import pandas as pd
	import swifter
	import cleantext

	# Read only the "headline" column of the raw partner-headlines CSV
	data = pd.read_csv(
	    "raw_partner_headlines.csv",
	    usecols=["headline"],
	)

	# Perform basic preprocessing steps using cleantext

abhijeet-talaulikar / clustering-fin-news-finbert.py

Created July 28, 2023 21:33

	from finbert_embedding.embedding import FinbertEmbedding

	# Embed every headline with the pretrained FinBERT model and collect
	# the per-headline vectors into a single NumPy array
	finbert = FinbertEmbedding()
	embeddings = np.array(
	    [finbert.sentence_vector(headline).numpy() for headline in headline_texts]
	)

abhijeet-talaulikar / clustering-fin-news-bertopic.py

Last active July 28, 2023 23:22

	from bertopic import BERTopic
	from umap import UMAP
	from hdbscan import HDBSCAN

	# Clustering model: HDBSCAN
	hdbscan_model = HDBSCAN(
	    min_cluster_size=15,
	    metric='euclidean',
	    cluster_selection_method='eom',
	    prediction_data=True,
	)

	# Dimensionality-reduction model: UMAP (seeded via random_state)
	umap_model = UMAP(
	    n_neighbors=15,
	    n_components=5,
	    min_dist=0.0,
	    metric='cosine',
	    random_state=42,
	)

abhijeet-talaulikar / clustering-fin-news-viz.py

Created July 28, 2023 21:54

	# NOTE(review): the results of these calls are neither assigned nor shown
	# explicitly; presumably this runs in a notebook where each figure is
	# displayed automatically -- confirm before running as a plain script.

	# Intertopic distance map
	topic_model.visualize_topics()

	# Topic similarities
	topic_model.visualize_heatmap()

	# Leading keywords in topics
	topic_model.visualize_barchart()

	# Term score trends

abhijeet-talaulikar / clustering-fin-news-bertopic-gpt.py

Created July 29, 2023 00:27

	# Add GPT 3.5 representation model into BERTopic
	# NOTE(review): "sk-xxx" is a placeholder value; supply the real key at
	# runtime (e.g. from an environment variable) instead of committing it.
	openai.api_key = "sk-xxx"
	# Representation model configured for "gpt-3.5-turbo" with chat=True; it is
	# passed to BERTopic through the representation_model= argument below.
	representation_model = OpenAI(model="gpt-3.5-turbo", chat=True)

	topic_model = BERTopic(
	verbose=True,
	hdbscan_model=hdbscan_model,
	representation_model=representation_model,
	nr_topics=10,
	umap_model=umap_model

abhijeet-talaulikar / gpt-concall-ui-default.py

Last active August 7, 2023 13:08

	### Setup the page configuration ###
	st.set_page_config(layout="wide", page_title="Concall Transcripts with GPT")
	st.title('Concall Transcripts with GPT')
	# The API key is typed into the sidebar by the user instead of being stored in code
	openai_api_key = st.sidebar.text_input('OpenAI API Key', value='')
	# Seed the session entry only when it is absent, so an existing value is never overwritten
	if "response_default" not in st.session_state:
	    st.session_state['response_default'] = None


	### Create UI elements ###
	if "disabled" not in st.session_state:

abhijeet-talaulikar / gpt-concall-loadpdf.py

Created August 6, 2023 14:24

	### Read text from PDF file ###
	def read_pdf(file):
	    """Load a PDF through ``OnlinePDFLoader`` and return the loaded documents.

	    Args:
	        file: Location of the PDF, handed unchanged to ``OnlinePDFLoader``
	            (presumably a URL or path -- confirm against the loader in use).

	    Returns:
	        Whatever ``OnlinePDFLoader(file).load()`` returns.
	    """
	    loader = OnlinePDFLoader(file)
	    return loader.load()

abhijeet-talaulikar / gpt-concall-prompt.py

Last active August 7, 2023 13:07

	prompt_default = '''
	This document is the transcript of an earnings conference call of a company. Assume you are an analyst who
	attended this call. Identify which company this document is talking about. Identify 10 best questions and their
	answers that would help summarize the company's performance.
	Create a report in a markdown format that answers each of those 10 questions. Here is an example of the format.

	Example

	## Insert company name here

abhijeet-talaulikar / gpt-concall-queryfun.py

Last active August 6, 2023 15:16

	### Query function for the app ###
	def get_guery_function(documents):
	# Text splitter
	text_splitter = RecursiveCharacterTextSplitter(chunk_size=512, chunk_overlap=10)
	texts = text_splitter.split_documents(documents)

	# Embeddings
	embeddings = OpenAIEmbeddings(openai_api_key=openai_api_key)
	vectordb = Chroma.from_documents(documents=texts,
	embedding=embeddings,