Skip to content

Instantly share code, notes, and snippets.

@anthonykasza
Created August 26, 2025 19:20
Show Gist options
  • Save anthonykasza/fb4c49eb0035e7491415472987fee96c to your computer and use it in GitHub Desktop.
Save anthonykasza/fb4c49eb0035e7491415472987fee96c to your computer and use it in GitHub Desktop.
hdc for hasbeens
import os
import pandas as pd
import yfinance as yf
from hdc_utils import *
# Symbols to download and cluster.
# NOTE(review): looks like the Nasdaq-100 constituent list (cf. "cubes.csv"
# used as the cache file name) -- confirm and refresh when membership changes.
tickers = [
    "AAPL", "ADBE", "ADI", "ADP", "ADSK", "AEP", "ALGN",
    "AMAT", "AMD", "AMGN", "AMZN", "APP", "ARM", "ASML",
    "AVGO", "AZN", "AXON", "BIIB", "BKNG", "CDNS", "CDW",
    "CEG", "CHTR", "CMCSA", "COST", "CPRT", "CRWD", "CSCO",
    "CSGP", "CSX", "CTAS", "CTSH", "DDOG", "DXCM", "EA",
    "EXC", "FAST", "FANG", "FTNT", "GEHC", "GFS", "GILD",
    "GOOG", "HON", "IDXX", "ILMN", "INTC", "INTU", "ISRG",
    "JD", "KDP", "KHC", "KLAC", "LRCX", "LULU", "MAR",
    "MCHP", "MDLZ", "MELI", "META", "MNST", "MRVL", "MSFT",
    "MSTR", "MU", "NFLX", "NXPI", "ODFL", "ON", "ORLY",
    # PepsiCo's symbol is "PEP"; the previous "PEPS" entry is not a listed
    # ticker, so that constituent was never downloaded.
    "PANW", "PAYX", "PCAR", "PDD", "PEP", "PLTR", "PYPL",
    "QCOM", "REGN", "ROST", "ROP", "SBUX", "SHOP", "SNPS",
    "TEAM", "TMUS", "TTD", "TTWO", "TXN", "VRSK", "VRTX",
    "WBD", "WDAY", "XEL", "ZS",
]
# get data
# Daily closing prices, one column per ticker, indexed by date.  The frame is
# cached in ``csv_filename`` so later runs skip the network download.
csv_filename = "cubes.csv"
if not os.path.exists(csv_filename):
    print(" no local data. downing fresh")
    data = yf.download(
        tickers,
        period="300d",
        interval="1d",
        group_by='ticker',
        auto_adjust=True,
    )
    close_data = pd.DataFrame()
    for ticker in tickers:
        # With group_by='ticker' the first column level is the symbol.
        if (ticker in data.columns.levels[0]) and ('Close' in data[ticker]):
            close_data[ticker] = data[ticker]['Close']
        else:
            print(f" skipping {ticker}")
    # Never cache a failed download: an empty (or all-NaN) CSV would be
    # picked up by every later run instead of triggering a fresh download.
    if close_data.dropna(how="all").empty:
        raise RuntimeError(
            f"download returned no closing prices; not writing {csv_filename}"
        )
    close_data.to_csv(csv_filename)
else:
    print(" loading data from file")
    # index_col=0 / parse_dates restore the date index written by to_csv.
    close_data = pd.read_csv(csv_filename, index_col=0, parse_dates=True)
# close-to-close percent changes vector embeddings
# Scale factor applied to the raw fractional changes (and again when
# embed_percents quantizes them into integer levels).
magic_accuracy = 150

# Wide (one column per ticker) -> long (Date, ticker, price) layout.
wide_df = close_data.reset_index()
long_df = wide_df.melt(id_vars=["Date"], var_name="ticker", value_name="price")

# Per-ticker day-over-day change; the first row of each ticker has no
# predecessor, yields NaN, and is dropped.
prices_by_ticker = long_df.groupby('ticker')['price']
long_df['percent_change'] = (
    prices_by_ticker.pct_change(fill_method=None) * magic_accuracy
)
long_df = long_df.dropna(subset=["percent_change"])

# Largest gain and (sign-flipped) largest loss across all tickers.
max_per_incr = long_df['percent_change'].max()
min_per_decr = -long_df['percent_change'].min()

# Number of levels requested from make_levels, derived from the observed
# range of scaled changes.
dims = int((magic_accuracy + 1) * (max_per_incr + min_per_decr))

# make_levels / new_hv come from hdc_utils (star import, not visible here).
# NOTE(review): presumably returns a dict keyed by 1-tuples ``(level,)``, as
# that is how embed_percents indexes it -- confirm against hdc_utils.
symbols = make_levels(dims, new_hv(dims), new_hv(dims))
print(f' len symbols before embeddings: {len(symbols.keys())}')
def embed_percents(percents, n, global_symbols, min_offset):
    """Embed a series of percent changes as a single hypervector.

    Each value ``p`` is quantized to ``int(magic_accuracy * (p + min_offset))``
    (``magic_accuracy`` is read from module scope).  The quantized series is
    cut into n-grams by ``make_ngrams``; every n-gram is mapped to a symbol
    and all symbols are combined with ``add_hv``.

    Args:
        percents: iterable of numeric percent changes.
        n: n-gram length passed to ``make_ngrams``.
        global_symbols: mapping from tuples of quantized values to symbols.
            It must already contain a ``(value,)`` entry for every quantized
            value; new n-gram entries are added to it in place.
        min_offset: amount added to each value before scaling.

    Returns:
        ``(embedding, global_symbols)`` -- the combined symbol and the same
        (mutated) mapping that was passed in.

    ``make_ngrams``, ``shift_hv``, ``multiply_hv`` and ``add_hv`` come from
    ``hdc_utils``; their exact semantics are not visible in this file.
    """
    levels = [int(magic_accuracy * (p + min_offset)) for p in percents]

    gram_symbols = []
    for gram in make_ngrams(levels, n):
        key = tuple(gram)
        if key not in global_symbols:
            # new subseq, create it: shift each element's symbol by its
            # position inside the n-gram, then multiply them together.
            # this could be waaay more efficient
            # since bind(A,B) = bind(bind(A,B,C),C) search for bind(A,B,C)
            # then bind(A,B) then A
            shifted = [
                shift_hv(global_symbols[(level,)], position)
                for position, level in enumerate(key)
            ]
            global_symbols[key] = multiply_hv(*shifted)
        # subseq exists (or was just created), use it
        gram_symbols.append(global_symbols[key])

    return add_hv(*gram_symbols), global_symbols
# Scaled percent-change series for every ticker, as plain Python lists.
ticker_percent_changes = {}
for ticker, group in long_df.groupby('ticker'):
    ticker_percent_changes[ticker] = group['percent_change'].tolist()

# One embedding per ticker, built from 30-grams.  ``symbols`` is threaded
# through so n-gram symbols created for one ticker are reused by the next.
# (This rebinds ``data``, which previously held the raw download, if any.)
data = {}
for ticker, percents in ticker_percent_changes.items():
    embedding, symbols = embed_percents(percents, 30, symbols, min_per_decr)
    data[ticker] = embedding
print(f' len symbols after embeddings: {len(symbols.keys())}')
# cluster embeddings
import hdbscan
from collections import defaultdict
# Parallel lists: hvs[i] is the embedding of tickers[i].  ``tickers`` is
# rebound here to only the symbols that actually received an embedding.
tickers = [*data]
hvs = [*data.values()]
def dist_hv(a, b):
    """Return ``1 - compare_hv(a, b)``; passed to HDBSCAN as its ``metric``.

    ``compare_hv`` comes from ``hdc_utils`` and is not visible in this file.
    NOTE(review): presumably a similarity score where larger means more
    alike, making this a distance -- confirm.
    """
    similarity = compare_hv(a, b)
    return 1 - similarity
# Density-based clustering of the embeddings using the custom distance.
clusterizer = hdbscan.HDBSCAN(
    min_cluster_size=2,
    metric=dist_hv,
    # cluster_selection_method='eom'
    cluster_selection_method='leaf',
)

# Label value that is excluded from the printed clusters.
noise = -1
labels = clusterizer.fit_predict(hvs)

# Group tickers by cluster label, leaving out the noise label.
clusters = defaultdict(list)
for ticker, label in zip(tickers, labels):
    if label == noise:
        continue
    clusters[label].append(ticker)

print('positive correlation clusters')
for ticker_list in clusters.values():
    for ticker in ticker_list:
        print(f'{ticker}')
    # NOTE(review): the original indentation was lost; both blank-line prints
    # are assumed to separate clusters -- confirm against the original gist.
    print()
    print()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment