Last active
February 4, 2016 18:04
-
-
Save atremblay/cc78e45d6a70a3f43393 to your computer and use it in GitHub Desktop.
Joint Entropy calculated with SQL for discrete values
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# This is part of the implementation of this paper | |
# https://www.aaai.org/Papers/FLAIRS/1999/FLAIRS99-042.pdf | |
# This was written for MLDB (www.mldb.ai). Not tested for other databases | |
# Conditional entropy $H(Y|X) = -\sum_x{P(x)\sum_y{P(y|x)\log(P(y|x))}}$ | |
from pandas import DataFrame | |
def conditional_entropy(Y, X, table): | |
df = mldb.query(""" | |
SELECT -sum(PX.p_x*A.right) | |
FROM ( | |
SELECT PXY.X as X, sum(PXY.p_xy * ln(PXY.p_xy)) as right | |
FROM ( | |
SELECT SUB.Y as Y, SUB.X as X, SUB.sub_total/TOT.total as p_xy | |
FROM ( | |
SELECT {Y} as Y, {X} as X, count(*) as sub_total | |
FROM {table} | |
GROUP BY {Y}, {X}) as SUB | |
JOIN ( | |
SELECT {X} as X, count(*) as total | |
FROM {table} | |
GROUP BY {X}) as TOT | |
ON SUB.X = TOT.X | |
) as PXY | |
GROUP BY PXY.X | |
) as A | |
JOIN ( | |
SELECT SUB.X as X, SUB.sub_total/TOT.total as p_x | |
FROM ( | |
SELECT {X} as X, count(*) as sub_total | |
FROM {table} | |
GROUP BY {X}) as SUB | |
JOIN ( | |
SELECT count(*) as total | |
FROM {table} | |
) as TOT | |
) as PX | |
ON PX.X = A.X | |
""".format(table=table, X=X, Y=Y) | |
) | |
if not isinstance(df, DataFrame): | |
print df.content | |
else: | |
return df.values[0] |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# This was written for MLDB (www.mldb.ai). Not tested for other databases | |
# Entropy $H(X) = -\sum_x{P(x)\log(P(x))}}$ | |
from pandas import DataFrame | |
def entropy(X, table): | |
df = mldb.query(""" | |
SELECT -sum(p_x*ln(p_x)) | |
FROM ( | |
SELECT SUB.sub_total/TOT.total as p_x | |
FROM ( | |
SELECT {X} as X, count(*) as sub_total | |
FROM {table} | |
GROUP BY {X}) as SUB | |
JOIN ( | |
SELECT count(*) as total | |
FROM {table} | |
) as TOT | |
) | |
""".format(table=table, X=X) | |
) | |
if not isinstance(df, DataFrame): | |
print df.content | |
else: | |
return df.values[0][0] |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# This was written for MLDB (www.mldb.ai). Not tested for other databases | |
# Joint entropy $H(X, Y) = -\sum_{x,y}{P(x,y)\log(P(x,y))}$ | |
from pandas import DataFrame | |
def joint_entropy(Y, X, table): | |
df = mldb.query(""" | |
SELECT -sum(p_xy*ln(p_xy)) | |
FROM ( | |
SELECT SUB.sub_total/TOT.total as p_xy | |
FROM ( | |
SELECT {X} as X, {Y} as Y, count(*) as sub_total | |
FROM {table} | |
GROUP BY {X}, {Y}) as SUB | |
JOIN ( | |
SELECT count(*) as total | |
FROM {table} | |
) as TOT | |
) | |
""".format(table=table, X=X, Y=Y) | |
) | |
if not isinstance(df, DataFrame): | |
print df.content | |
else: | |
return df.values[0][0] |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment