Skip to content

Instantly share code, notes, and snippets.

@atremblay
Last active February 4, 2016 18:04
Show Gist options
  • Save atremblay/cc78e45d6a70a3f43393 to your computer and use it in GitHub Desktop.
Save atremblay/cc78e45d6a70a3f43393 to your computer and use it in GitHub Desktop.
Joint Entropy calculated with SQL for discrete values
# This is part of the implementation of this paper
# https://www.aaai.org/Papers/FLAIRS/1999/FLAIRS99-042.pdf
# This was written for MLDB (www.mldb.ai). Not tested for other databases
# Conditional entropy $H(Y|X) = -\sum_x{P(x)\sum_y{P(y|x)\log(P(y|x))}}$
from pandas import DataFrame
def conditional_entropy(Y, X, table):
df = mldb.query("""
SELECT -sum(PX.p_x*A.right)
FROM (
SELECT PXY.X as X, sum(PXY.p_xy * ln(PXY.p_xy)) as right
FROM (
SELECT SUB.Y as Y, SUB.X as X, SUB.sub_total/TOT.total as p_xy
FROM (
SELECT {Y} as Y, {X} as X, count(*) as sub_total
FROM {table}
GROUP BY {Y}, {X}) as SUB
JOIN (
SELECT {X} as X, count(*) as total
FROM {table}
GROUP BY {X}) as TOT
ON SUB.X = TOT.X
) as PXY
GROUP BY PXY.X
) as A
JOIN (
SELECT SUB.X as X, SUB.sub_total/TOT.total as p_x
FROM (
SELECT {X} as X, count(*) as sub_total
FROM {table}
GROUP BY {X}) as SUB
JOIN (
SELECT count(*) as total
FROM {table}
) as TOT
) as PX
ON PX.X = A.X
""".format(table=table, X=X, Y=Y)
)
if not isinstance(df, DataFrame):
print df.content
else:
return df.values[0]
# This was written for MLDB (www.mldb.ai). Not tested for other databases
# Entropy $H(X) = -\sum_x{P(x)\log(P(x))}}$
from pandas import DataFrame
def entropy(X, table):
df = mldb.query("""
SELECT -sum(p_x*ln(p_x))
FROM (
SELECT SUB.sub_total/TOT.total as p_x
FROM (
SELECT {X} as X, count(*) as sub_total
FROM {table}
GROUP BY {X}) as SUB
JOIN (
SELECT count(*) as total
FROM {table}
) as TOT
)
""".format(table=table, X=X)
)
if not isinstance(df, DataFrame):
print df.content
else:
return df.values[0][0]
# This was written for MLDB (www.mldb.ai). Not tested for other databases
# Joint entropy $H(X, Y) = -\sum_{x,y}{P(x,y)\log(P(x,y))}$
from pandas import DataFrame
def joint_entropy(Y, X, table):
df = mldb.query("""
SELECT -sum(p_xy*ln(p_xy))
FROM (
SELECT SUB.sub_total/TOT.total as p_xy
FROM (
SELECT {X} as X, {Y} as Y, count(*) as sub_total
FROM {table}
GROUP BY {X}, {Y}) as SUB
JOIN (
SELECT count(*) as total
FROM {table}
) as TOT
)
""".format(table=table, X=X, Y=Y)
)
if not isinstance(df, DataFrame):
print df.content
else:
return df.values[0][0]
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment