hdc for hasbeens
import os
import pandas as pd
import yfinance as yf
from hdc_utils import *
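# NOTE (assumption): hdc_utils is not included in this gist. From the call
# sites below it appears to provide the usual HDC primitives: new_hv (random
# hypervector), multiply_hv (bind), add_hv (bundle), shift_hv (permute),
# compare_hv (similarity), make_ngrams (sliding windows), and make_levels
# (correlated level vectors). A guessed sketch of these functions is
# appended at the end of this file.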
tickers = [
    "AAPL", "ADBE", "ADI", "ADP", "ADSK", "AEP", "ALGN",
    "AMAT", "AMD", "AMGN", "AMZN", "APP", "ARM", "ASML",
    "AVGO", "AZN", "AXON", "BIIB", "BKNG", "CDNS", "CDW",
    "CEG", "CHTR", "CMCSA", "COST", "CPRT", "CRWD", "CSCO",
    "CSGP", "CSX", "CTAS", "CTSH", "DDOG", "DXCM", "EA",
    "EXC", "FAST", "FANG", "FTNT", "GEHC", "GFS", "GILD",
    "GOOG", "HON", "IDXX", "ILMN", "INTC", "INTU", "ISRG",
    "JD", "KDP", "KHC", "KLAC", "LRCX", "LULU", "MAR",
    "MCHP", "MDLZ", "MELI", "META", "MNST", "MRVL", "MSFT",
    "MSTR", "MU", "NFLX", "NXPI", "ODFL", "ON", "ORLY",
"PANW", "PAYX", "PCAR", "PDD", "PEPS", "PLTR", "PYPL", | |
"QCOM", "REGN", "ROST", "ROP", "SBUX", "SHOP", "SNPS", | |
"TEAM", "TMUS", "TTD", "TTWO", "TXN", "VRSK", "VRTX", | |
"WBD", "WDAY", "XEL", "ZS" | |
] | |
# get data
csv_filename = "cubes.csv"
if not os.path.exists(csv_filename):
    print(" no local data. downloading fresh")
    data = yf.download(tickers, period="300d", interval="1d", group_by='ticker', auto_adjust=True)
    close_data = pd.DataFrame()
    for ticker in tickers:
        if (ticker in data.columns.levels[0]) and ('Close' in data[ticker]):
            close_data[ticker] = data[ticker]['Close']
        else:
            print(f" skipping {ticker}")
    close_data.to_csv(csv_filename)
else:
    print(" loading data from file")
    close_data = pd.read_csv(csv_filename, index_col=0, parse_dates=True)
# embed close-to-close percent changes as hypervectors
magic_accuracy = 150
long_df = close_data.reset_index().melt(id_vars=["Date"], var_name="ticker", value_name="price")
long_df['percent_change'] = long_df.groupby('ticker')['price'].pct_change(fill_method=None) * magic_accuracy
long_df = long_df.dropna(subset=["percent_change"])
max_per_incr = long_df['percent_change'].max()
min_per_decr = long_df['percent_change'].min() * -1
dims = int((magic_accuracy+1) * (max_per_incr + min_per_decr))
symbols = make_levels(dims, new_hv(dims), new_hv(dims))
print(f' len symbols before embeddings: {len(symbols.keys())}')
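# NOTE (assumption): this quantizes each scaled percent change into a
# non-negative integer bin (shifting by min_per_decr maps the smallest value
# to bin 0), and make_levels presumably builds one "level" hypervector per
# bin, keyed by single-int tuples, so that nearby bins get similar vectors.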
def embed_percents(percents, n, global_symbols, min_offset):
    sublist_symbols = []
    percents = [int( magic_accuracy * (p + min_offset) ) for p in percents]
    for sublist in make_ngrams(percents, n):
        sublist = tuple(sublist)
        # subseq exists, use it
        if sublist in global_symbols.keys():
            sublist_symbols.append(global_symbols[sublist])
        # new subseq, create it
        # this could be waaay more efficient
        # since bind(A,B) = bind(bind(A,B,C),C) search for bind(A,B,C) then bind(A,B) then A
        else:
            percent_symbols = []
            for idx in range(len(sublist)):
                percent = sublist[idx]
                percent_symbol = global_symbols[(percent,)]
                percent_symbols.append( shift_hv(percent_symbol, idx) )
            sublist_symbol = multiply_hv(*percent_symbols)
            global_symbols[sublist] = sublist_symbol
            sublist_symbols.append(sublist_symbol)
    embedding = add_hv(*sublist_symbols)
    return embedding, global_symbols
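# NOTE (assumption): this looks like a standard HDC n-gram encoding: each
# quantized percent change is permuted by its position in the window
# (shift_hv), the window is bound into one symbol (multiply_hv), and all
# window symbols are bundled into a single ticker embedding (add_hv). The
# efficiency comment above holds because bipolar binding is its own inverse,
# so a shorter n-gram symbol can be recovered by unbinding from a longer one.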
ticker_percent_changes = {
    ticker: group['percent_change'].tolist()
    for ticker, group in long_df.groupby('ticker')
}
data = {}
for ticker, percents in ticker_percent_changes.items():
    data[ticker], symbols = embed_percents(percents, 30, symbols, min_per_decr)
print(f' len symbols after embeddings: {len(symbols.keys())}')
# cluster embeddings
import hdbscan
from collections import defaultdict
tickers = list(data.keys())
hvs = list(data.values())
def dist_hv(a, b):
    return 1 - compare_hv(a, b)
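# NOTE (assumption): compare_hv presumably returns a similarity in [-1, 1]
# (e.g. cosine), so dist_hv is a dissimilarity in [0, 2]. hdbscan accepts a
# callable metric, but it is evaluated pairwise in Python, which gets slow
# as the number of embeddings grows.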
clusterizer = hdbscan.HDBSCAN(
    min_cluster_size=2,
    metric=dist_hv,
    # cluster_selection_method='eom'
    cluster_selection_method='leaf'
)
noise = -1
labels = clusterizer.fit_predict(hvs)
clusters = defaultdict(list)
for idx, label in enumerate(labels):
    if label != noise:
        clusters[label].append(tickers[idx])
print('positive correlation clusters')
for cluster_id, ticker_list in clusters.items():
    for ticker in ticker_list:
        print(f'{ticker}')
    print()
    print()
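# ---------------------------------------------------------------------------
# Appendix (not part of the original gist): hdc_utils is never shown above,
# so the functions below are a guessed, minimal sketch of its API inferred
# from the call sites, assuming bipolar (+1/-1) MAP-style hypervectors. The
# real hdc_utils may differ in representation, tie-breaking, and level
# construction.
import numpy as np

def new_hv(dims):
    # random bipolar hypervector
    return np.random.choice([-1, 1], size=dims)

def multiply_hv(*hvs):
    # binding: elementwise product (self-inverse for bipolar vectors)
    out = hvs[0].copy()
    for hv in hvs[1:]:
        out = out * hv
    return out

def add_hv(*hvs):
    # bundling: elementwise majority vote (sign of the sum, ties go to +1)
    return np.where(np.sum(hvs, axis=0) >= 0, 1, -1)

def shift_hv(hv, k):
    # permutation: circular shift by k, used above to encode n-gram position
    return np.roll(hv, k)

def compare_hv(a, b):
    # cosine similarity in [-1, 1]
    return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))

def make_ngrams(seq, n):
    # sliding windows of length n over a sequence
    return [seq[i:i + n] for i in range(len(seq) - n + 1)]

def make_levels(n, lo_hv, hi_hv):
    # n correlated "level" hypervectors morphing from lo_hv to hi_hv: level i
    # takes its first i/(n-1) fraction of coordinates from hi_hv, so nearby
    # levels stay similar. Keyed by single-int tuples to match the
    # global_symbols lookups above.
    d = len(lo_hv)
    levels = {}
    for i in range(n):
        cut = int(round(d * i / max(n - 1, 1)))
        levels[(i,)] = np.concatenate([hi_hv[:cut], lo_hv[cut:]])
    return levels
# ---------------------------------------------------------------------------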