#!/usr/bin/env python3
"""
UOR-Based Titanic Survival Classification
-------------------------------------------

This script demonstrates the integration of the Universal Object Reference (UOR)
framework into a data science pipeline for Titanic survival classification.

The UOR framework unifies representations using:
  • Clifford algebras to encode multimodal data in a single invariant space.
  • Base decomposition: representing each object in multiple "bases" (e.g. numeric & one-hot)
    with consistency enforced by a coherence norm.
  • Lie group symmetry transformations as automorphisms of the Clifford algebra.

The UOR Framework Enables:
  • Unified Multimodal Embedding: UOR uses Clifford algebras to integrate diverse data types
    (numeric, categorical, text, etc.) into a single, consistent representation.
  • Built-in Consistency: The coherence norm enforces that redundant representations (e.g.,
    numeric codes and one-hot encodings) agree, improving data quality and interpretability.
  • Symmetry Invariance: By leveraging Lie group symmetries, UOR ensures that the
    representation is robust to transformations (e.g., rotations or permutations), which can
    lead to models that generalize better.
  • Interpretable Geometry: The geometric structure of the embedding allows for clear
    interpretation of feature interactions and model decisions in a coordinate-free manner.
  • Potential for Advanced Applications: Particularly useful in scenarios involving multi-modal
    data fusion or where inherent symmetries exist (e.g., image recognition, sensor fusion),
    offering a new perspective compared to traditional pipelines.

References:
  - https://github.com/UOR-Foundation/UOR-H1-HPO-Candidate

Learn more about UOR:
  - https://chatgpt.com/g/g-67abc8d515d48191933ccb10df257338-universal-object-reference-uor-guide

Requirements:
  Python 3.8+
  numpy, pandas, scikit-learn
"""
|
|
|
import math |
|
import pandas as pd |
|
from sklearn.linear_model import LogisticRegression |
|
|
|
############################## |
|
# 1. Clifford Algebra Setup |
|
############################## |
|
|
|
# Basis vectors of the underlying vector space V, one per feature:
#   numeric:           Age, Fare, SibSp, Parch, PclassNum, SexNum, EmbarkedNum
#   categorical (1-hot): Pclass1, Pclass2, Pclass3, Male, EmbarkedC, EmbarkedQ, EmbarkedS
# Every feature name maps to its 1-based basis index, in listing order.
basis_index = {
    feature: position
    for position, feature in enumerate(
        (
            'Age', 'Fare', 'SibSp', 'Parch',
            'PclassNum', 'SexNum', 'EmbarkedNum',
            'Pclass1', 'Pclass2', 'Pclass3',
            'Male',
            'EmbarkedC', 'EmbarkedQ', 'EmbarkedS',
        ),
        start=1,
    )
}

n = len(basis_index)  # Dimension of V
|
|
|
# All basis vectors have Q(e_i)=1 (Euclidean metric) |
|
|
|
# Helper: a basis blade is identified by the ascending tuple of its basis
# indices, e.g. () for the scalar 1, (5,) for e5, (2, 8) for e2*e8.
def blade_tuple(*indices):
    """Return the given basis indices as a tuple in ascending order."""
    ordered = list(indices)
    ordered.sort()
    return tuple(ordered)
|
|
|
############################## |
|
# 2. Multivector Encoding Functions |
|
############################## |
|
|
|
def encode_passenger(passenger):
    """
    Build the grade-1 multivector that represents one passenger.

    passenger: dict with keys
        'Age', 'Fare', 'SibSp', 'Parch', 'Pclass', 'Sex', 'Embarked'
        - 'Age' / 'Fare': stored only when present and not None
        - 'SibSp' / 'Parch': default to 0 when absent
        - 'Pclass': integer 1, 2 or 3 (required)
        - 'Sex': 'male' or 'female', compared case-insensitively (required)
        - 'Embarked': 'C', 'Q' or 'S'; a missing or empty value zeroes every
          Embarked component

    Returns:
        A multivector represented as a dict: {blade (tuple): coefficient}.

    Raises:
        KeyError: if 'Pclass' or 'Sex' is missing, or if 'Pclass' or a
            non-empty 'Embarked' has no matching basis vector.
    """
    def slot(feature):
        # Blade key of the basis vector assigned to `feature`.
        return blade_tuple(basis_index[feature])

    encoded = {}

    # Numeric features: optional ones first, then those with a default.
    for feature in ('Age', 'Fare'):
        if passenger.get(feature) is not None:
            encoded[slot(feature)] = passenger[feature]
    for feature in ('SibSp', 'Parch'):
        encoded[slot(feature)] = passenger.get(feature, 0)

    # Pclass: numeric code plus the matching one-hot component.
    ticket_class = passenger['Pclass']
    encoded[slot('PclassNum')] = float(ticket_class)
    encoded[slot(f'Pclass{ticket_class}')] = 1.0

    # Sex: the numeric code and the one-hot 'Male' flag carry the same value.
    male_flag = 1.0 if passenger['Sex'].lower() == 'male' else 0.0
    encoded[slot('SexNum')] = male_flag
    encoded[slot('Male')] = male_flag

    # Embarked: numeric code plus one-hot components.
    port_codes = {'C': 1.0, 'Q': 2.0, 'S': 3.0}
    port = passenger.get('Embarked', '')
    if not port:
        for feature in ('EmbarkedNum', 'EmbarkedC', 'EmbarkedQ', 'EmbarkedS'):
            encoded[slot(feature)] = 0.0
        return encoded

    encoded[slot('EmbarkedNum')] = port_codes[port]
    encoded[slot(f'Embarked{port}')] = 1.0
    # The remaining ports are written out explicitly as zeros.
    for other in ('C', 'Q', 'S'):
        if other != port:
            encoded.setdefault(slot(f'Embarked{other}'), 0.0)
    return encoded
|
|
|
############################## |
|
# 3. Clifford Algebra Operations |
|
############################## |
|
|
|
# Bit mask per basis index: index i owns bit i - 1.
basis_mask = dict((i, 2 ** (i - 1)) for i in range(1, n + 1))
|
|
|
def _blade_mask(blade):
    """Return the bit mask of a blade: basis index i sets bit i - 1."""
    mask = 0
    for index in blade:
        mask |= 1 << (index - 1)
    return mask


def _mask_blade(mask):
    """Return the ascending tuple of 1-based basis indices set in mask."""
    indices = []
    while mask:
        lowest = mask & -mask
        indices.append(lowest.bit_length())
        mask ^= lowest
    return tuple(indices)


def _reorder_sign(mask1, mask2):
    """Return +1 or -1 for sorting the concatenated vectors of two blades."""
    swaps = 0
    shifted = mask1 >> 1
    while shifted:
        # After k shifts, a set bit of `shifted & mask2` marks a pair with
        # the left index exactly k above the right index.
        swaps += bin(shifted & mask2).count("1")
        shifted >>= 1
    return -1 if swaps % 2 else 1


def multiply_mv(mv1, mv2):
    """
    Return the geometric product mv1 * mv2 under the Euclidean metric.

    Multivectors are dicts {blade (ascending tuple of 1-based basis indices):
    coefficient}; the empty tuple is the scalar blade. Every basis vector
    squares to +1 and distinct basis vectors anticommute.

    The sign of each blade product is the parity of the number of pairs
    (i in blade1, j in blade2) with i > j, counted over the FULL blades.
    Basis vectors shared by both blades must take part in that count: a
    shared vector still has to be moved past the vectors between its two
    occurrences before it can be squared away (e.g. e1e2 * e1 = -e2).

    Term pairs whose coefficient product is exactly 0 are skipped;
    contributions to the same resulting blade are summed.

    Raises:
        ValueError: if a blade contains an index smaller than 1 (negative
            shift count).
    """
    # Masks of the right operand do not depend on the left term: build once.
    right_terms = [(_blade_mask(blade), c2) for blade, c2 in mv2.items()]

    result = {}
    for blade1, c1 in mv1.items():
        mask1 = _blade_mask(blade1)
        for mask2, c2 in right_terms:
            coeff = c1 * c2 * _reorder_sign(mask1, mask2)
            if coeff != 0:
                # Shared vectors square to +1 and vanish from the blade (XOR).
                new_blade = _mask_blade(mask1 ^ mask2)
                result[new_blade] = result.get(new_blade, 0) + coeff
    return result
|
|
|
# Smoke test: two distinct basis vectors anticommute.
e_age = {(basis_index['Age'],): 1.0}
e_fare = {(basis_index['Fare'],): 1.0}

prod = multiply_mv(e_age, e_fare)
print("e_Age * e_Fare =", prod)  # Expect: {(1,2): 1.0}

prod2 = multiply_mv(e_fare, e_age)
print("e_Fare * e_Age =", prod2)  # Expect: {(1,2): -1.0}
|
|
|
############################## |
|
# 4. Coherence Norm for Base Decomposition |
|
############################## |
|
|
|
def coherence_norm(mv):
    """
    Return the coherence norm of multivector mv.

    The norm is the sum of five squared residuals, each measuring how far a
    numeric encoding and its one-hot counterpart disagree:
      - SexNum versus Male
      - PclassNum versus the class implied by Pclass1..Pclass3
      - Pclass1 + Pclass2 + Pclass3 versus 1
      - EmbarkedNum versus the port code implied by EmbarkedC/Q/S
      - EmbarkedC + EmbarkedQ + EmbarkedS versus 1
    Components absent from mv count as 0.0.
    """
    def component(feature):
        # Coefficient of the basis vector assigned to `feature`.
        return mv.get(blade_tuple(basis_index[feature]), 0.0)

    sex_gap = component('SexNum') - component('Male')

    implied_class = (1 * component('Pclass1') + 2 * component('Pclass2')
                     + 3 * component('Pclass3'))
    class_gap = component('PclassNum') - implied_class
    class_onehot_gap = (component('Pclass1') + component('Pclass2')
                        + component('Pclass3') - 1)

    implied_port = (1 * component('EmbarkedC') + 2 * component('EmbarkedQ')
                    + 3 * component('EmbarkedS'))
    port_gap = component('EmbarkedNum') - implied_port
    port_onehot_gap = (component('EmbarkedC') + component('EmbarkedQ')
                       + component('EmbarkedS') - 1)

    # Plain left-to-right addition keeps the floating-point result stable.
    return (sex_gap ** 2 + class_gap ** 2 + class_onehot_gap ** 2
            + port_gap ** 2 + port_onehot_gap ** 2)
|
|
|
# Example: a consistently encoded passenger has coherence norm 0.
sample_passenger = dict(
    Pclass=3, Sex="female", Age=25.0, SibSp=0, Parch=1, Fare=7.25, Embarked="S",
)
mv_sample = encode_passenger(sample_passenger)
print("Coherence norm (should be 0):", coherence_norm(mv_sample))
|
|
|
############################## |
|
# 5. Symmetry Transformations (Example: Rotation in Age-Fare Plane) |
|
############################## |
|
|
|
# Rotor for a 45-degree rotation in the (Age, Fare) plane, stored as
# {blade: coefficient}: a scalar part plus an Age-Fare bivector part.
theta = math.pi / 4  # 45 degrees
a, b = math.cos(theta / 2), math.sin(theta / 2)
rotor = dict([
    ((), a),
    (blade_tuple(basis_index['Age'], basis_index['Fare']), b),
])
# Inverse rotor: same scalar part, negated bivector part.
rotor_inv = {
    blade: (-coefficient if blade else coefficient)
    for blade, coefficient in rotor.items()
}
|
|
|
def rotate_age_fare(mv):
    """Return mv conjugated by the Age-Fare rotor: rotor * mv * rotor_inv."""
    return multiply_mv(multiply_mv(rotor, mv), rotor_inv)
|
|
|
# Show the Age/Fare components before and after the rotation, then re-check
# the coherence norm on the rotated multivector.
rotated_sample = rotate_age_fare(mv_sample)
print(
    "Original Age coeff:", mv_sample.get((basis_index['Age'],), 0.0),
    "Fare coeff:", mv_sample.get((basis_index['Fare'],), 0.0),
)
print(
    "Rotated Age coeff:", rotated_sample.get((basis_index['Age'],), 0.0),
    "Fare coeff:", rotated_sample.get((basis_index['Fare'],), 0.0),
)
print("Coherence norm after rotation:", coherence_norm(rotated_sample))
|
|
|
############################## |
|
# 6. Data Preparation and Logistic Regression |
|
############################## |
|
|
|
# Titanic data (a handful of example rows for demonstration); every record
# is a dict keyed by column name.
data = [
    dict(zip(
        ("Survived", "Pclass", "Sex", "Age", "SibSp", "Parch", "Fare", "Embarked"),
        record,
    ))
    for record in (
        (0, 3, "male", 22.0, 1, 0, 7.25, "S"),
        (1, 1, "female", 38.0, 1, 0, 71.28, "C"),
        (1, 3, "female", 26.0, 0, 0, 7.92, "S"),
        (1, 1, "female", 35.0, 1, 0, 53.1, "S"),
        (0, 3, "male", 35.0, 0, 0, 8.05, "S"),
    )
]
df = pd.DataFrame(data)
|
|
|
# Encode passengers into multivectors and extract feature vectors
X_list = []
y_list = []
for _, row in df.iterrows():
    passenger = {
        "Pclass": int(row.Pclass),
        "Sex": row.Sex,
        "Age": float(row.Age) if not pd.isnull(row.Age) else None,
        "SibSp": int(row.SibSp),
        "Parch": int(row.Parch),
        "Fare": float(row.Fare) if not pd.isnull(row.Fare) else None,
        "Embarked": str(row.Embarked) if pd.notnull(row.Embarked) else ''
    }
    mv = encode_passenger(passenger)
    # Ensure coherence. Raised explicitly instead of via `assert` so the
    # check still runs under `python -O`; `not (x < tol)` also rejects NaN.
    if not abs(coherence_norm(mv)) < 1e-8:
        raise AssertionError("Data inconsistency found!")
    # Extract features in order: basis indices 1..n
    feat_vec = [mv.get(blade_tuple(i), 0.0) for i in range(1, n + 1)]
    X_list.append(feat_vec)
    y_list.append(int(row.Survived))

# Create feature matrix and label vector
X = pd.DataFrame(X_list, columns=[f"b{i}" for i in range(1, n + 1)])
y = pd.Series(y_list)
print("Feature matrix:\n", X)
print("Labels:", y.values)
|
|
|
# Train logistic regression model
model = LogisticRegression(solver='lbfgs', fit_intercept=True)
model.fit(X, y)
coefficients = model.coef_[0]
intercept = model.intercept_[0]

# Map coefficients to basis names for interpretability.
# Feature column k holds basis index k + 1, so the names are ordered by
# their basis index rather than by dict insertion order; the labels stay
# correct even if basis_index is ever declared in a different order.
basis_names = sorted(basis_index, key=basis_index.get)
coef_dict = dict(zip(basis_names, coefficients))
print("\nIntercept:", intercept)
print("Coefficients per basis:")
for name, coef in coef_dict.items():
    print(f" {name}: {coef:.3f}")
|
|
|
|
|
# Completion banner, printed only when the file is executed as a script.
if __name__ == "__main__":
    print("\nUOR-Based Titanic Survival Classification complete.")