Created
December 8, 2017 08:42
-
-
Save porimol/57c1a10a0624166aa8aff0218d797d73 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pandas as pd

# Fill the gaps in 'Column2' of test.csv and write the result to data.csv.
#
# Rule: a missing 'Column2' value becomes the column mean (rounded to two
# decimals) when the first column of that row says "yes", and -1 otherwise.
# Values that are already present are left untouched.
data = pd.read_csv('test.csv')

# The mean does not depend on the row, so compute it once instead of once per
# row inside an apply().
column2_mean = round(data['Column2'].mean(), 2)

# The flag is read from the first column by position (presumably 'Column1' --
# confirm against the CSV header).  The comparison is case-insensitive so that
# "Yes", "YES" and "yes" are all treated as a positive answer.
first_col_is_yes = data.iloc[:, 0].astype(str).str.lower() == 'yes'

# Per-row replacement: the mean for "yes" rows, the -1 marker for the rest.
fill_values = pd.Series(-1.0, index=data.index).mask(first_col_is_yes, column2_mean)

# fillna() only consults fill_values where 'Column2' is actually missing.
data['Column2'] = data['Column2'].fillna(fill_values)
data.to_csv(r'data.csv', index=False)
@PP227, have a look. I have added the solution here, along with two sample outputs.
from scipy import stats as st
from sklearn.utils import shuffle
import numpy as np
import pandas as pd
# Impute missing 'Diabetes_medicine_years' for diabetic patients in
# training.csv and write the result to data.csv.
#
# Rule: when Diabetes is "yes" (any casing) and the number of years is missing
# or not a number, use the 5% trimmed mean of the known values.  Rows for
# patients without diabetes keep NaN.
data = pd.read_csv('training.csv')

# Placeholders such as 'VALID' or 'No' are not numbers; coerce them to NaN
# rather than special-casing one string and letting astype(float) raise on
# the others.
years = pd.to_numeric(data['Diabetes_medicine_years'], errors='coerce')

# Trimmed mean over the observed values only.  dropna() removes exactly the
# missing entries, so a genuine value of -1 is not mistaken for "missing".
trimmed = st.trim_mean(years.dropna().to_numpy(), .05)

# Rows to impute: diabetic patients whose years value is missing.
has_diabetes = data['Diabetes'].astype(str).str.lower() == 'yes'
data['Diabetes_medicine_years'] = years.mask(has_diabetes & years.isna(), trimmed)

# To blank out the remaining NaN cells (non-diabetic rows) instead of keeping
# NaN, convert the column to object dtype and replace NaN with '' before
# saving.
data.to_csv('data.csv', index=False)
data
Sample Output 1
Hypertension | Hypertension_medicine_years | Diabetes | Diabetes_medicine_years | |
---|---|---|---|---|
0 | Yes | 1 | No | NaN |
1 | Yes | 6 | Yes | 13.00 |
2 | Yes | VALID | No | NaN |
3 | NaN | NaN | NaN | NaN |
4 | Yes | 2 | No | NaN |
5 | Yes | 6 | No | NaN |
6 | Yes | 14 | Yes | 14.00 |
7 | Yes | 5 | No | NaN |
8 | Yes | 0.5 | Yes | 2.00 |
9 | No | NaN | No | NaN |
10 | Yes | VALID | Yes | 3.00 |
11 | No | NaN | No | NaN |
12 | No | NaN | No | NaN |
13 | Yes | VALID | No | NaN |
14 | No | NaN | No | NaN |
15 | Yes | 2 | No | NaN |
16 | No | NaN | No | NaN |
17 | No | NaN | No | NaN |
18 | Yes | VALID | Yes | 7.00 |
19 | Yes | VALID | Yes | 5.95 |
20 | No | NaN | Yes | 7.00 |
21 | No | NaN | No | NaN |
22 | Yes | 4 | No | NaN |
23 | No | NaN | Yes | 3.00 |
24 | No | NaN | No | NaN |
25 | Yes | 2 | Yes | 5.95 |
26 | Yes | 5 | No | NaN |
27 | Yes | 3 | Yes | 2.00 |
28 | Yes | 10 | No | NaN |
29 | Yes | 2 | Yes | 8.00 |
30 | Yes | 0.5 | Yes | 0.50 |
31 | No | NaN | No | NaN |
32 | Yes | 10 | No | NaN |
33 | No | NaN | No | NaN |
from scipy import stats as st
from sklearn.utils import shuffle
import numpy as np
import pandas as pd
# Impute missing 'Diabetes_medicine_years' for diabetic patients in
# training.csv, blank out the rest, and write the result to data.csv.
#
# Rule: when Diabetes is "yes" (any casing) and the number of years is missing
# or not a number, use the 5% trimmed mean of the known values.  Every other
# missing cell is replaced by an empty string.
data = pd.read_csv('training.csv')

# Placeholders such as 'VALID' or 'No' are not numbers; coerce them to NaN
# rather than special-casing one string and letting astype(float) raise on
# the others.
years = pd.to_numeric(data['Diabetes_medicine_years'], errors='coerce')

# Trimmed mean over the observed values only.  dropna() removes exactly the
# missing entries, so a genuine value of -1 is not mistaken for "missing".
trimmed = st.trim_mean(years.dropna().to_numpy(), .05)

# Rows to impute: diabetic patients whose years value is missing.
has_diabetes = data['Diabetes'].astype(str).str.lower() == 'yes'
imputed = years.mask(has_diabetes & years.isna(), trimmed)

# Whatever is still missing belongs to non-diabetic rows; show those as ''.
# The column has to become object dtype to hold numbers and strings together.
data['Diabetes_medicine_years'] = imputed.astype(object).where(imputed.notna(), '')

data.to_csv('data.csv', index=False)
data
Sample Output 2
Hypertension | Hypertension_medicine_years | Diabetes | Diabetes_medicine_years | |
---|---|---|---|---|
0 | Yes | 1 | No | |
1 | Yes | 6 | Yes | 13 |
2 | Yes | VALID | No | |
3 | NaN | NaN | NaN | |
4 | Yes | 2 | No | |
5 | Yes | 6 | No | |
6 | Yes | 14 | Yes | 14 |
7 | Yes | 5 | No | |
8 | Yes | 0.5 | Yes | 2 |
9 | No | NaN | No | |
10 | Yes | VALID | Yes | 3 |
11 | No | NaN | No | |
12 | No | NaN | No | |
13 | Yes | VALID | No | |
14 | No | NaN | No | |
15 | Yes | 2 | No | |
16 | No | NaN | No | |
17 | No | NaN | No | |
18 | Yes | VALID | Yes | 7 |
19 | Yes | VALID | Yes | 5.95 |
20 | No | NaN | Yes | 7 |
21 | No | NaN | No | |
22 | Yes | 4 | No | |
23 | No | NaN | Yes | 3 |
24 | No | NaN | No | |
25 | Yes | 2 | Yes | 5.95 |
26 | Yes | 5 | No | |
27 | Yes | 3 | Yes | 2 |
28 | Yes | 10 | No | |
29 | Yes | 2 | Yes | 8 |
30 | Yes | 0.5 | Yes | 0.5 |
31 | No | NaN | No | |
32 | Yes | 10 | No | |
33 | No | NaN | No |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
File "", line 14
data.loc[mask,'Cholesterol medicine years']=pd.to_numeric(data['Cholesterol medicine years'],errors='coerce').fillna(trimmed)
^
SyntaxError: invalid syntax