Skip to content

Instantly share code, notes, and snippets.

View abhijeet-talaulikar's full-sized avatar

Abhijeet Talaulikar abhijeet-talaulikar

View GitHub Profile
import numpy as np
import pandas as pd
import csv
import random
import string
### Create full dataset of 5 million credit card customers ###
def generate_cust_id(length=7):
    """Return a random customer ID made of uppercase letters and digits.

    Args:
        length: Number of characters in the ID. Defaults to 7, the length
            that was previously hard-coded.

    Returns:
        A string of ``length`` characters, each drawn with replacement
        from ``A-Z`` and ``0-9``.

    Raises:
        ValueError: If ``length`` is negative.
    """
    if length < 0:
        raise ValueError(f"length must be non-negative, got {length}")
    # Each call draws independently, so two calls can return the same ID;
    # callers that need unique IDs have to de-duplicate themselves.
    alphabet = string.ascii_uppercase + string.digits
    return ''.join(random.choices(alphabet, k=length))
import pandas as pd
import numpy as np
import pandas as pd
import swifter
import cleantext
# Load finance headlines from the dataset
# Only the "headline" column is read from the CSV (usecols); the path is
# relative to the current working directory.
data = pd.read_csv("raw_partner_headlines.csv", usecols=["headline"])
# Perform basic preprocessing steps using cleantext
from finbert_embedding.embedding import FinbertEmbedding
# Convert text to vectors using pretrained finbert embeddings
finbert = FinbertEmbedding()
# One sentence vector is computed per entry of headline_texts and the
# results are collected into a single NumPy array.
# NOTE(review): headline_texts is not defined in the visible code --
# presumably the cleaned headlines from `data`; confirm before running.
embeddings = np.array([finbert.sentence_vector(i).numpy() for i in headline_texts])
from bertopic import BERTopic
from umap import UMAP
from hdbscan import HDBSCAN
# Use Hierarchical DBSCAN as clustering model
# Configured with a minimum cluster size of 15, Euclidean distance and the
# 'eom' cluster selection method; prediction_data=True is requested as well.
hdbscan_model = HDBSCAN(min_cluster_size=15, metric='euclidean', cluster_selection_method='eom', prediction_data=True)
# Use UMAP for dimensionality reduction
# Projects down to 5 components using cosine distance; random_state is
# pinned to 42 (presumably so repeated runs give the same projection).
umap_model = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine', random_state=42)
# NOTE(review): topic_model is not assigned anywhere above these calls in
# the visible code -- it presumably needs to be a fitted BERTopic instance;
# confirm ordering before running.
# Intertopic distance map
topic_model.visualize_topics()
# Topic similarities
topic_model.visualize_heatmap()
# Leading keywords in topics
topic_model.visualize_barchart()
# Term score trends
# Add GPT 3.5 representation model into BERTopic
# NOTE(review): the API key is a string literal in the source. "sk-xxx"
# looks like a placeholder; a real key should not be committed here --
# consider loading it from the environment or a secrets store instead.
# NOTE(review): neither `openai` nor `OpenAI` is imported in the visible
# code; confirm where they come from.
openai.api_key = "sk-xxx"
representation_model = OpenAI(model="gpt-3.5-turbo", chat=True)
topic_model = BERTopic(
verbose=True,
hdbscan_model=hdbscan_model,
representation_model=representation_model,
nr_topics=10,
umap_model=umap_model
### Setup the page configuration ###
st.set_page_config(layout="wide", page_title="Concall Transcripts with GPT")
st.title('Concall Transcripts with GPT')

# The key is a secret, so the sidebar input is masked rather than echoed
# in clear text while the user types or shares their screen.
openai_api_key = st.sidebar.text_input('OpenAI API Key', value='', type='password')

# Seed the session slot once; later reruns of the script keep whatever
# value has been stored under 'response_default' in the meantime.
if "response_default" not in st.session_state:
    st.session_state['response_default'] = None
### Create UI elements ###
if "disabled" not in st.session_state:
### Read text from PDF file ###
def read_pdf(file):
    """Load a PDF through ``OnlinePDFLoader`` and return the loaded documents.

    ``file`` is passed to ``OnlinePDFLoader`` unchanged, and whatever the
    loader's ``load()`` call produces is returned as is.
    """
    return OnlinePDFLoader(file).load()
prompt_default = '''
This document is the transcript of an earnings conference call of a company. Assume you are an analyst who
attended this call. Identify which company this document is talking about. Identify 10 best questions and their
answers that would help summarize the company's performance.
Create a report in a markdown format that answers each of those 10 questions. Here is an example of the format.
Example
## Insert company name here
### Query function for the app ###
def get_guery_function(documents):
# Text splitter
text_splitter = RecursiveCharacterTextSplitter(chunk_size=512, chunk_overlap=10)
texts = text_splitter.split_documents(documents)
# Embeddings
embeddings = OpenAIEmbeddings(openai_api_key=openai_api_key)
vectordb = Chroma.from_documents(documents=texts,
embedding=embeddings,