Created
November 30, 2021 09:16
-
-
Save billju/8379aa09a87812255a126a44881b1739 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Define the calculation methods
import numpy as np


def log2(x):
    """Return the base-2 logarithm of ``x``, yielding 0 wherever ``x`` is 0.

    Entropy terms have the form ``p * log2(p)``; by convention the term is 0
    when ``p == 0``.  ``np.log2(..., where=mask)`` without an explicit ``out``
    leaves the masked-out slots uninitialized, so the output buffer is
    pre-filled with zeros here.

    Args:
        x: A scalar or array-like of numbers.

    Returns:
        A NumPy float scalar for scalar input, otherwise a float ndarray.
    """
    values = np.asarray(x, dtype=float)
    result = np.zeros_like(values)
    np.log2(values, out=result, where=values != 0)
    # Indexing with () unwraps a 0-d array to a scalar; n-d arrays pass through.
    return result[()]


def I(*args):
    """Return the Shannon entropy (in bits) of a distribution given as counts.

    Args:
        *args: Count of samples in each class.

    Returns:
        ``sum(-p * log2(p))`` with ``p = count / total``; 0 when no counts
        are given.
    """
    total = sum(args)
    return sum(-arg / total * log2(arg / total) for arg in args)


def G(*args):
    """Return the Gini impurity ``1 - sum(p ** 2)`` of class counts.

    Args:
        *args: Count of samples in each class.

    Returns:
        The impurity, or 0 when no counts are given.
    """
    if not args:
        return 0
    total = sum(args)
    return 1 - sum((arg / total) ** 2 for arg in args)


def Cum(F, P, y):
    """Return the impurity of one partition weighted by its relative size.

    Args:
        F: Impurity function taking per-class counts (e.g. ``I`` or ``G``).
        P: Boolean mask Series selecting the rows of the partition.
        y: Target Series aligned with ``P``.

    Returns:
        ``P.mean()`` (the fraction of rows selected) times ``F`` applied to
        the count of every distinct target value inside the partition.
    """
    counts = [(P & (y == v)).sum() for v in y.unique()]
    return P.mean() * F(*counts)


def CumSum(F, S, y):
    """Return the weighted impurity summed over every distinct value of ``S``.

    Args:
        F: Impurity function taking per-class counts.
        S: Feature Series to split on.
        y: Target Series aligned with ``S``.
    """
    return sum(Cum(F, S == v, y) for v in S.unique())


def E(S, y):
    """Return the expected entropy of ``y`` after splitting on ``S``."""
    return CumSum(I, S, y)


def Gain(S, y):
    """Return the information gain of splitting ``y`` on feature ``S``."""
    return I(*y.value_counts()) - E(S, y)


def SplitGain(S):
    """Return the entropy of the value distribution of feature ``S``."""
    return I(*S.value_counts())


def GainRatio(S, y):
    """Return ``Gain(S, y)`` divided by ``SplitGain(S)``."""
    return Gain(S, y) / SplitGain(S)


def Gini(S, y):
    """Return the reduction in Gini impurity from splitting ``y`` on ``S``."""
    return G(*y.value_counts()) - CumSum(G, S, y)
# Define the printing methods
def printI(*args):
    """Return the entropy formula for the given class counts as text.

    Each count contributes one ``-c/total*log2(c/total)`` term; the terms are
    concatenated without a separator because each starts with its own sign.
    """
    total = sum(args)
    return ''.join(f'-{arg}/{total}*log2({arg}/{total})' for arg in args)


def printCum(F, P, y):
    """Return the weighted-impurity formula of one partition as text.

    Args:
        F: Formatter taking per-class counts and returning a string
            (e.g. ``printI``).
        P: Boolean mask Series selecting the rows of the partition.
        y: Target Series aligned with ``P``.

    Returns:
        ``"<selected>/<rows>*(<F output>)"``.
    """
    counts = [(P & (y == v)).sum() for v in y.unique()]
    return f'{P.sum()}/{P.count()}' + '*(' + F(*counts) + ')'


def printCumSum(F, S, y):
    """Return one ``printCum`` line per distinct value of ``S``, joined by ' + '."""
    return ' + \n'.join(printCum(F, S == v, y) for v in S.unique())


def printGain(S, y):
    """Return the information-gain formula for ``S`` followed by its value."""
    return (
        printI(*y.value_counts())
        + ' - (\n'
        + printCumSum(printI, S, y)
        + f')\n= {Gain(S,y)}'
    )


def printGainRatio(S, y):
    """Return the gain-ratio formula for ``S`` followed by its value."""
    return (
        printGain(S, y)
        + ' / ('
        + printI(*S.value_counts())
        + f')\n= {GainRatio(S,y)}'
    )
# Load the data
import io

import pandas as pd

# Space-separated table; the first column is a row id that carries no signal.
CSV = """客戶編號 最近三個月被銀行查詢家數 收入 最近三個月他行使用循環註記 是否違約
1 0次 高 無 No
2 0次 高 無 No
3 1~2次 高 無 Yes
4 3次以上 中等 無 Yes
5 3次以上 低 有 Yes
6 1~2次 低 有 Yes
7 1~2次 低 有 Yes
8 0次 中等 無 Yes
9 0次 低 有 No
10 3次以上 中等 有 Yes
11 0次 中等 有 Yes
12 1~2次 中等 無 No
13 1~2次 高 有 No
14 3次以上 低 無 No"""
df = pd.read_csv(io.StringIO(CSV), sep=' ').drop(columns=['客戶編號'])
# Boolean target: True where the last column reads 'Yes'.
y = df['是否違約']=='Yes'

# Report the information gain for every feature first, then the gain ratio,
# keeping the features in the same order for both passes.
feature_columns = ['最近三個月被銀行查詢家數', '收入', '最近三個月他行使用循環註記']
for report in (printGain, printGainRatio):
    for feature in feature_columns:
        print(report(df[feature], y))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment