Created
December 8, 2017 08:42
-
-
Save porimol/57c1a10a0624166aa8aff0218d797d73 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pandas as pd

# Fill the missing entries of 'Column2' in test.csv and save the result to
# data.csv. A missing entry becomes the column mean (rounded to 2 decimals)
# when the flag column says "yes" for that row, and -1 otherwise.

data = pd.read_csv('test.csv')

# The column whose value decides how a gap is filled. It defaults to the
# first column of the file; set it to any other column name (for example
# 'Column1') to compare against a different column.
flag_column = data.columns[0]
target_column = 'Column2'

# The mean does not depend on the row, so compute it once up front.
column_mean = round(data[target_column].mean(), 2)

# Case-insensitive match, so that 'Yes', 'YES' and 'yes' are all accepted.
# Missing flags become the string 'nan' and therefore never match.
is_yes = data[flag_column].astype(str).str.lower() == 'yes'

# Per-row replacement value: the mean where the flag is "yes", else -1.
fill_values = pd.Series(column_mean, index=data.index).where(is_yes, -1)

# fillna only touches rows where the target is missing; existing values
# are left as they are.
data[target_column] = data[target_column].fillna(fill_values)

data.to_csv(r'data.csv', index=False)
In my code I calculated the trimmed mean of the values in Column2. I want to use that trimmed mean as the replacement value.
I am getting a syntax error. Why?
@PP227,
can you show me the error(s)?
File "", line 14
data.loc[mask,'Cholesterol medicine years']=pd.to_numeric(data['Cholesterol medicine years'],errors='coerce').fillna(trimmed)
^
SyntaxError: invalid syntax
@PP227, have a look. I have added the solution here, with two sample outputs.
from scipy import stats as st
from sklearn.utils import shuffle
import numpy as np
import pandas as pd
# Fill missing 'Diabetes_medicine_years' values in training.csv with the 5%
# trimmed mean of the known values, but only for rows whose 'Diabetes' flag
# is "yes" (case-insensitive). All other missing values stay NaN.
# The result is written to data.csv.
data = pd.read_csv('training.csv')

# 'VALID' is a non-numeric placeholder; turn it into NaN so that the column
# can be converted to float.
data['Diabetes_medicine_years'] = data['Diabetes_medicine_years'].replace('VALID', np.nan)
data['Diabetes_medicine_years'] = data['Diabetes_medicine_years'].astype(float)

# Trimmed mean over the known (non-missing) values only.
known_years = data['Diabetes_medicine_years'].dropna().to_numpy()
trimmed = st.trim_mean(known_years, .05)

# Rows to fill: the diabetes flag is "yes" and the number of years is missing.
# Missing flags become the string 'nan' and therefore never match.
has_diabetes = data['Diabetes'].astype(str).str.lower() == 'yes'
needs_fill = has_diabetes & data['Diabetes_medicine_years'].isna()
data.loc[needs_fill, 'Diabetes_medicine_years'] = trimmed

data.to_csv('data.csv', index=False)

# Bare expression: displays the frame when run as a notebook cell.
data
Sample Output 1
Hypertension | Hypertension_medicine_years | Diabetes | Diabetes_medicine_years | |
---|---|---|---|---|
0 | Yes | 1 | No | NaN |
1 | Yes | 6 | Yes | 13.00 |
2 | Yes | VALID | No | NaN |
3 | NaN | NaN | NaN | NaN |
4 | Yes | 2 | No | NaN |
5 | Yes | 6 | No | NaN |
6 | Yes | 14 | Yes | 14.00 |
7 | Yes | 5 | No | NaN |
8 | Yes | 0.5 | Yes | 2.00 |
9 | No | NaN | No | NaN |
10 | Yes | VALID | Yes | 3.00 |
11 | No | NaN | No | NaN |
12 | No | NaN | No | NaN |
13 | Yes | VALID | No | NaN |
14 | No | NaN | No | NaN |
15 | Yes | 2 | No | NaN |
16 | No | NaN | No | NaN |
17 | No | NaN | No | NaN |
18 | Yes | VALID | Yes | 7.00 |
19 | Yes | VALID | Yes | 5.95 |
20 | No | NaN | Yes | 7.00 |
21 | No | NaN | No | NaN |
22 | Yes | 4 | No | NaN |
23 | No | NaN | Yes | 3.00 |
24 | No | NaN | No | NaN |
25 | Yes | 2 | Yes | 5.95 |
26 | Yes | 5 | No | NaN |
27 | Yes | 3 | Yes | 2.00 |
28 | Yes | 10 | No | NaN |
29 | Yes | 2 | Yes | 8.00 |
30 | Yes | 0.5 | Yes | 0.50 |
31 | No | NaN | No | NaN |
32 | Yes | 10 | No | NaN |
33 | No | NaN | No | NaN |
from scipy import stats as st
from sklearn.utils import shuffle
import numpy as np
import pandas as pd
# Fill missing 'Diabetes_medicine_years' values in training.csv:
#   * rows whose 'Diabetes' flag is "yes" (case-insensitive) get the 5%
#     trimmed mean of the known values;
#   * every other row with a missing value gets an empty string.
# The result is written to data.csv.
data = pd.read_csv('training.csv')

# 'VALID' is a non-numeric placeholder; turn it into NaN so that the column
# can be converted to float.
data['Diabetes_medicine_years'] = data['Diabetes_medicine_years'].replace('VALID', np.nan)
data['Diabetes_medicine_years'] = data['Diabetes_medicine_years'].astype(float)

# Trimmed mean over the known (non-missing) values only.
known_years = data['Diabetes_medicine_years'].dropna().to_numpy()
trimmed = st.trim_mean(known_years, .05)

# Missing flags become the string 'nan' and therefore never match "yes".
has_diabetes = data['Diabetes'].astype(str).str.lower() == 'yes'
is_missing = data['Diabetes_medicine_years'].isna()

# The column will hold both floats and empty strings, so switch it to the
# object dtype explicitly instead of relying on an implicit upcast.
years = data['Diabetes_medicine_years'].astype(object)
years[is_missing & has_diabetes] = trimmed
years[is_missing & ~has_diabetes] = ''
data['Diabetes_medicine_years'] = years

data.to_csv('data.csv', index=False)

# Bare expression: displays the frame when run as a notebook cell.
data
Sample Output 2
Hypertension | Hypertension_medicine_years | Diabetes | Diabetes_medicine_years | |
---|---|---|---|---|
0 | Yes | 1 | No | |
1 | Yes | 6 | Yes | 13 |
2 | Yes | VALID | No | |
3 | NaN | NaN | NaN | |
4 | Yes | 2 | No | |
5 | Yes | 6 | No | |
6 | Yes | 14 | Yes | 14 |
7 | Yes | 5 | No | |
8 | Yes | 0.5 | Yes | 2 |
9 | No | NaN | No | |
10 | Yes | VALID | Yes | 3 |
11 | No | NaN | No | |
12 | No | NaN | No | |
13 | Yes | VALID | No | |
14 | No | NaN | No | |
15 | Yes | 2 | No | |
16 | No | NaN | No | |
17 | No | NaN | No | |
18 | Yes | VALID | Yes | 7 |
19 | Yes | VALID | Yes | 5.95 |
20 | No | NaN | Yes | 7 |
21 | No | NaN | No | |
22 | Yes | 4 | No | |
23 | No | NaN | Yes | 3 |
24 | No | NaN | No | |
25 | Yes | 2 | Yes | 5.95 |
26 | Yes | 5 | No | |
27 | Yes | 3 | Yes | 2 |
28 | Yes | 10 | No | |
29 | Yes | 2 | Yes | 8 |
30 | Yes | 0.5 | Yes | 0.5 |
31 | No | NaN | No | |
32 | Yes | 10 | No | |
33 | No | NaN | No |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Where do I specify Column1? If I put data['column1'] in place of cols[0], I get an error. I want to make it dynamic, so that I can compare Column2 against any column, not only the column at index zero.