This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import numpy as np | |
import pandas as pd | |
import csv | |
import random | |
import string | |
### Create full dataset of 5 million credit card customers ### | |
def generate_cust_id(length=7):
    """Return a random customer ID of uppercase ASCII letters and digits.

    Args:
        length: Number of characters in the ID. Defaults to 7.

    Returns:
        A string of ``length`` characters drawn with replacement from
        ``string.ascii_uppercase + string.digits``.

    Note:
        Each ID is drawn independently, so repeated calls can return the
        same ID; callers that need unique IDs must de-duplicate.
    """
    alphabet = string.ascii_uppercase + string.digits
    return ''.join(random.choices(alphabet, k=length))
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pandas as pd | |
import numpy as np | |
import pandas as pd | |
import swifter | |
import cleantext | |
# Load finance headlines from the dataset.
# Only the "headline" column is read from the CSV; other columns are skipped.
data = pd.read_csv("raw_partner_headlines.csv", usecols=["headline"])
# Perform basic preprocessing steps using cleantext |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from finbert_embedding.embedding import FinbertEmbedding | |
# Convert text to vectors using pretrained finbert embeddings
finbert = FinbertEmbedding()
# Embed each headline separately, convert the result with .numpy(), and
# collect all vectors into one NumPy array.
# NOTE(review): headline_texts is defined outside this snippet; presumably an
# iterable of cleaned headline strings -- confirm.
embeddings = np.array(
    [finbert.sentence_vector(text).numpy() for text in headline_texts]
)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from bertopic import BERTopic | |
from umap import UMAP | |
from hdbscan import HDBSCAN | |
# Use Hierarchical DBSCAN as clustering model
hdbscan_model = HDBSCAN(
    min_cluster_size=15,
    metric='euclidean',
    cluster_selection_method='eom',
    prediction_data=True,
)
# Use UMAP for dimensionality reduction
# random_state is pinned to a fixed seed (42).
umap_model = UMAP(
    n_neighbors=15,
    n_components=5,
    min_dist=0.0,
    metric='cosine',
    random_state=42,
)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# NOTE(review): topic_model is defined outside this snippet. The return values
# of these calls are not stored; presumably each is run as the last expression
# of its own notebook cell so the figure is displayed -- confirm.
# Intertopic distance map
topic_model.visualize_topics()
# Topic similarities
topic_model.visualize_heatmap()
# Leading keywords in topics
topic_model.visualize_barchart()
# Term score trends |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Add GPT 3.5 representation model into BERTopic
# NOTE(review): "sk-xxx" is a placeholder, not a working key. Replace it with
# a real key before running, and avoid committing the real value to source
# control.
openai.api_key = "sk-xxx"
representation_model = OpenAI(model="gpt-3.5-turbo", chat=True)
topic_model = BERTopic( | |
verbose=True, | |
hdbscan_model=hdbscan_model, | |
representation_model=representation_model, | |
nr_topics=10, | |
umap_model=umap_model |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
### Setup the page configuration ###
st.set_page_config(layout="wide", page_title="Concall Transcripts with GPT")
st.title('Concall Transcripts with GPT')
# The key is entered in the sidebar; type='password' masks it on screen
# while the returned value is still the plain string.
openai_api_key = st.sidebar.text_input(
    'OpenAI API Key', value='', type='password'
)
# Make sure the 'response_default' key exists in session state before it is read.
if "response_default" not in st.session_state:
    st.session_state['response_default'] = None
### Create UI elements ### | |
if "disabled" not in st.session_state: |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
### Read text from PDF file ### | |
def read_pdf(file):
    """Load a PDF through ``OnlinePDFLoader`` and return what it loads.

    Args:
        file: Location of the PDF, passed unchanged to ``OnlinePDFLoader``
            (presumably a URL or file path -- confirm against the caller).

    Returns:
        The value returned by the loader's ``load()`` method.
    """
    return OnlinePDFLoader(file).load()
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
prompt_default = ''' | |
This document is the transcript of an earnings conference call of a company. Assume you are an analyst who | |
attended this call. Identify which company this document is talking about. Identify 10 best questions and their | |
answers that would help summarize the company's performance. | |
Create a report in a markdown format that answers each of those 10 questions. Here is an example of the format. | |
Example | |
## Insert company name here |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
### Query function for the app ### | |
def get_guery_function(documents): | |
# Text splitter | |
text_splitter = RecursiveCharacterTextSplitter(chunk_size=512, chunk_overlap=10) | |
texts = text_splitter.split_documents(documents) | |
# Embeddings | |
embeddings = OpenAIEmbeddings(openai_api_key=openai_api_key) | |
vectordb = Chroma.from_documents(documents=texts, | |
embedding=embeddings, |