quantra-go-algo · May 3, 2026 20:47
diff --git a/deepseek_regime_labels.py b/deepseek_regime_labels.py
 def load_cache() -> dict:
    """Return the label cache stored at LLM_CACHE_FILE.

    The file is read as UTF-8 text and decoded with ``json.loads``. When the
    file does not exist, an empty dict is returned instead.
    """
    cache_path = Path(LLM_CACHE_FILE)
    if not cache_path.exists():
        return {}
    raw = cache_path.read_text(encoding="utf-8")
    return json.loads(raw)


 def save_cache(cache: dict) -> None:
    """Write *cache* to LLM_CACHE_FILE as indented, key-sorted JSON (UTF-8).

    The JSON is written to a sibling ``<name>.tmp`` file first and then moved
    over the real cache file, so an interruption in the middle of a write
    cannot leave a truncated cache that ``json.loads`` rejects on the next
    run.
    """
    target = Path(LLM_CACHE_FILE)
    scratch = target.with_name(target.name + ".tmp")
    scratch.write_text(json.dumps(cache, indent=2, sort_keys=True), encoding="utf-8")
    # Path.replace overwrites an existing destination.
    scratch.replace(target)


 def parse_regime(text: str) -> str:
    """Extract a regime label from a DeepSeek reply.

    The reply is expected to be a JSON object with a ``"regime"`` key.
    Parsing is attempted on the whole reply first and then on the outermost
    ``{...}`` span, which covers replies wrapped in Markdown fences or
    surrounded by prose. If no JSON object yields a label from REGIME_SET,
    the first known label found in the upper-cased text is used.

    Args:
        text: Raw message content. ``None`` or an empty string is tolerated.

    Returns:
        The matched label, or ``"UNCERTAIN"`` when nothing matches.
    """
    t = (text or "").strip()

    candidates = [t]
    start, end = t.find("{"), t.rfind("}")
    if 0 <= start < end:
        inner = t[start : end + 1]
        if inner != t:
            candidates.append(inner)

    for candidate in candidates:
        try:
            obj = json.loads(candidate)
        except (ValueError, RecursionError):
            # Not valid JSON (json.JSONDecodeError is a ValueError) or nested
            # too deeply to decode; try the next candidate.
            continue
        if isinstance(obj, dict):
            r = str(obj.get("regime", "")).upper().strip()
            if r in REGIME_SET:
                return r

    m = re.search(r"\b(TREND_UP|TREND_DOWN|RANGE|HIGH_VOL|LOW_VOL|UNCERTAIN)\b", t.upper())
    if m:
        return m.group(1)
    return "UNCERTAIN"


 def deepseek_label(summary: dict) -> str:
    """Ask the DeepSeek chat-completions endpoint to label one window summary.

    Sends a system prompt plus a JSON-encoded user message (the summary, the
    allowed labels and a per-label hint) to
    ``DEEPSEEK_BASE_URL + "/chat/completions"`` and passes the reply text to
    ``parse_regime``.

    Args:
        summary: Numeric window summary; it must be JSON-serialisable because
            it is embedded in the user message with ``json.dumps``.

    Returns:
        Whatever ``parse_regime`` extracts from the reply.

    Raises:
        requests.HTTPError: From ``raise_for_status`` on a 4xx/5xx reply.
        KeyError, IndexError: If the reply JSON lacks
            ``choices[0].message.content``.
    """
    url = DEEPSEEK_BASE_URL.rstrip("/") + "/chat/completions"
    headers = {"Authorization": f"Bearer {DEEPSEEK_API_KEY}", "Content-Type": "application/json"}

    system = (
        "You are a quantitative researcher.\n"
        "Label the market regime using ONLY the numeric summary provided.\n"
        "Do not use outside knowledge.\n"
        "Return STRICT JSON only: {\"regime\": <one label>, \"reason\": <short>}."
    )
    user = {
        "summary": summary,
        # sorted() yields a list, which json.dumps can serialise even when
        # REGIME_SET is a set, and keeps the label order fixed between runs.
        "allowed_labels": sorted(REGIME_SET),
        "hint": {
            "TREND_UP": "trend_score > 0 and not high vol",
            "TREND_DOWN": "trend_score < 0 and not high vol",
            "RANGE": "|trend_score| small; z_last suggests mean reversion",
            "HIGH_VOL": "ann_vol high or atr_norm high or max_dd very negative",
            "LOW_VOL": "ann_vol low and atr_norm low",
            "UNCERTAIN": "if ambiguous",
        },
    }

    payload = {
        "model": DEEPSEEK_MODEL,
        "messages": [
            {"role": "system", "content": system},
            {"role": "user", "content": json.dumps(user)},
        ],
        "temperature": 0.0,
        "max_tokens": 120,
        "stream": False,
    }

    r = requests.post(url, headers=headers, json=payload, timeout=LLM_TIMEOUT_S)
    r.raise_for_status()
    content = r.json()["choices"][0]["message"]["content"]
    # Guard against a null content field reaching parse_regime as None.
    return parse_regime(content or "")


 def label_regimes_llm(df_feat: pd.DataFrame) -> pd.Series:
    """
    Label regimes with DeepSeek for the full history (but only at label dates).

    For each date from ``label_dates(df_feat.index)`` the LOOKBACK_DAYS rows
    before that date are summarised and sent to DeepSeek. Results are cached
    in the JSON cache under a key built from the date, SYMBOL, LOOKBACK_DAYS
    and DEEPSEEK_MODEL, and the cache is saved after every new label so an
    interrupted run keeps its progress.

    Returns:
        A Series on ``df_feat.index`` with the sparse labels forward-filled;
        rows before the first label are ``"UNCERTAIN"``.
    """
    cache = load_cache()
    labels = {}

    for t in label_dates(df_feat.index):
        i = df_feat.index.get_loc(t)
        if i < LOOKBACK_DAYS:
            # Not enough history: a negative slice start would wrap around to
            # the end of the frame and yield a wrong or empty window.
            continue

        cache_key = f"{t.date().isoformat()}::{SYMBOL}::{LOOKBACK_DAYS}::{DEEPSEEK_MODEL}"
        if cache_key in cache:
            labels[t] = cache[cache_key]
            continue

        # Build the window summary only on a cache miss.
        w = df_feat.iloc[i - LOOKBACK_DAYS : i]
        reg = deepseek_label(window_summary(w))
        if reg not in REGIME_SET:
            reg = "UNCERTAIN"

        cache[cache_key] = reg
        save_cache(cache)
        labels[t] = reg
        time.sleep(LLM_RATE_LIMIT_S)

    sparse = pd.Series(labels).sort_index()
    return sparse.reindex(df_feat.index).ffill().fillna("UNCERTAIN")
	def load_cache() -> dict:
	    """Return the label cache stored at LLM_CACHE_FILE.

	    The file is read as UTF-8 text and decoded with ``json.loads``. When the
	    file does not exist, an empty dict is returned instead.
	    """
	    p = Path(LLM_CACHE_FILE)
	    if p.exists():
	        return json.loads(p.read_text(encoding="utf-8"))
	    return {}


	def save_cache(cache: dict) -> None:
	    """Write *cache* to LLM_CACHE_FILE as indented, key-sorted JSON (UTF-8).

	    The JSON is written to a sibling ``<name>.tmp`` file first and then moved
	    over the real cache file, so an interruption in the middle of a write
	    cannot leave a truncated cache that ``json.loads`` rejects on the next
	    run.
	    """
	    target = Path(LLM_CACHE_FILE)
	    scratch = target.with_name(target.name + ".tmp")
	    scratch.write_text(json.dumps(cache, indent=2, sort_keys=True), encoding="utf-8")
	    # Path.replace overwrites an existing destination.
	    scratch.replace(target)


	def parse_regime(text: str) -> str:
	    """Extract a regime label from a DeepSeek reply.

	    The reply is expected to be a JSON object with a ``"regime"`` key.
	    Parsing is attempted on the whole reply first and then on the outermost
	    ``{...}`` span, which covers replies wrapped in Markdown fences or
	    surrounded by prose. If no JSON object yields a label from REGIME_SET,
	    the first known label found in the upper-cased text is used.

	    Args:
	        text: Raw message content. ``None`` or an empty string is tolerated.

	    Returns:
	        The matched label, or ``"UNCERTAIN"`` when nothing matches.
	    """
	    t = (text or "").strip()

	    candidates = [t]
	    start, end = t.find("{"), t.rfind("}")
	    if 0 <= start < end:
	        inner = t[start : end + 1]
	        if inner != t:
	            candidates.append(inner)

	    for candidate in candidates:
	        try:
	            obj = json.loads(candidate)
	        except (ValueError, RecursionError):
	            # Not valid JSON (json.JSONDecodeError is a ValueError) or nested
	            # too deeply to decode; try the next candidate.
	            continue
	        if isinstance(obj, dict):
	            r = str(obj.get("regime", "")).upper().strip()
	            if r in REGIME_SET:
	                return r

	    # Plain "|" alternation: an escaped "\|" would match a literal pipe and
	    # the fallback could never find a label.
	    m = re.search(r"\b(TREND_UP|TREND_DOWN|RANGE|HIGH_VOL|LOW_VOL|UNCERTAIN)\b", t.upper())
	    if m:
	        return m.group(1)
	    return "UNCERTAIN"


	def deepseek_label(summary: dict) -> str:
	    """Ask the DeepSeek chat-completions endpoint to label one window summary.

	    Sends a system prompt plus a JSON-encoded user message (the summary, the
	    allowed labels and a per-label hint) to
	    ``DEEPSEEK_BASE_URL + "/chat/completions"`` and passes the reply text to
	    ``parse_regime``.

	    Args:
	        summary: Numeric window summary; it must be JSON-serialisable because
	            it is embedded in the user message with ``json.dumps``.

	    Returns:
	        Whatever ``parse_regime`` extracts from the reply.

	    Raises:
	        requests.HTTPError: From ``raise_for_status`` on a 4xx/5xx reply.
	        KeyError, IndexError: If the reply JSON lacks
	            ``choices[0].message.content``.
	    """
	    url = DEEPSEEK_BASE_URL.rstrip("/") + "/chat/completions"
	    headers = {"Authorization": f"Bearer {DEEPSEEK_API_KEY}", "Content-Type": "application/json"}

	    system = (
	        "You are a quantitative researcher.\n"
	        "Label the market regime using ONLY the numeric summary provided.\n"
	        "Do not use outside knowledge.\n"
	        "Return STRICT JSON only: {\"regime\": <one label>, \"reason\": <short>}."
	    )
	    user = {
	        "summary": summary,
	        # sorted() yields a list, which json.dumps can serialise even when
	        # REGIME_SET is a set, and keeps the label order fixed between runs.
	        "allowed_labels": sorted(REGIME_SET),
	        "hint": {
	            "TREND_UP": "trend_score > 0 and not high vol",
	            "TREND_DOWN": "trend_score < 0 and not high vol",
	            "RANGE": "|trend_score| small; z_last suggests mean reversion",
	            "HIGH_VOL": "ann_vol high or atr_norm high or max_dd very negative",
	            "LOW_VOL": "ann_vol low and atr_norm low",
	            "UNCERTAIN": "if ambiguous",
	        },
	    }

	    payload = {
	        "model": DEEPSEEK_MODEL,
	        "messages": [
	            {"role": "system", "content": system},
	            {"role": "user", "content": json.dumps(user)},
	        ],
	        "temperature": 0.0,
	        "max_tokens": 120,
	        "stream": False,
	    }

	    r = requests.post(url, headers=headers, json=payload, timeout=LLM_TIMEOUT_S)
	    r.raise_for_status()
	    content = r.json()["choices"][0]["message"]["content"]
	    # Guard against a null content field reaching parse_regime as None.
	    return parse_regime(content or "")


	def label_regimes_llm(df_feat: pd.DataFrame) -> pd.Series:
	    """
	    Label regimes with DeepSeek for the full history (but only at label dates).

	    For each date from ``label_dates(df_feat.index)`` the LOOKBACK_DAYS rows
	    before that date are summarised and sent to DeepSeek. Results are cached
	    in the JSON cache under a key built from the date, SYMBOL, LOOKBACK_DAYS
	    and DEEPSEEK_MODEL, and the cache is saved after every new label so an
	    interrupted run keeps its progress.

	    Returns:
	        A Series on ``df_feat.index`` with the sparse labels forward-filled;
	        rows before the first label are ``"UNCERTAIN"``.
	    """
	    cache = load_cache()
	    labels = {}

	    for t in label_dates(df_feat.index):
	        i = df_feat.index.get_loc(t)
	        if i < LOOKBACK_DAYS:
	            # Not enough history: a negative slice start would wrap around to
	            # the end of the frame and yield a wrong or empty window.
	            continue

	        cache_key = f"{t.date().isoformat()}::{SYMBOL}::{LOOKBACK_DAYS}::{DEEPSEEK_MODEL}"
	        if cache_key in cache:
	            labels[t] = cache[cache_key]
	            continue

	        # Build the window summary only on a cache miss.
	        w = df_feat.iloc[i - LOOKBACK_DAYS : i]
	        reg = deepseek_label(window_summary(w))
	        if reg not in REGIME_SET:
	            reg = "UNCERTAIN"

	        cache[cache_key] = reg
	        save_cache(cache)
	        labels[t] = reg
	        time.sleep(LLM_RATE_LIMIT_S)

	    sparse = pd.Series(labels).sort_index()
	    return sparse.reindex(df_feat.index).ffill().fillna("UNCERTAIN")
No results found