Skip to content

Instantly share code, notes, and snippets.

@hygull
Last active November 11, 2018 12:11
Show Gist options
  • Save hygull/5a09c4a295a9d3f11cba4aaf0e44363c to your computer and use it in GitHub Desktop.
Save hygull/5a09c4a295a9d3f11cba4aaf0e44363c to your computer and use it in GitHub Desktop.
DataFrame, fillna, skipna, resample
>>> import pandas as pd
>>>
>>> df = pd.DataFrame({'A': range(1, 6)}, index=pd.date_range('2018-12-28', periods=5))
>>>
>>> df2 = pd.DataFrame({'B': range(1, 13, 2)}, index=pd.date_range('2018-12-1', periods=6))
>>>
>>> df
            A
2018-12-28  1
2018-12-29  2
2018-12-30  3
2018-12-31  4
2019-01-01  5
>>>
>>> df2
             B
2018-12-01   1
2018-12-02   3
2018-12-03   5
2018-12-04   7
2018-12-05   9
2018-12-06  11
>>>
>>> df3 = pd.DataFrame({'items_sold': range(1, 19, 3)}, index=pd.date_range('2018-12-1', periods=6))
>>>
>>> df3
            items_sold
2018-12-01           1
2018-12-02           4
2018-12-03           7
2018-12-04          10
2018-12-05          13
2018-12-06          16
>>>
>>> df3.resample("3D")
DatetimeIndexResampler [freq=<3 * Days>, axis=0, closed=left, label=left, convention=start, base=0]
>>>
>>> df3.resample("3D").pipe(lambda d: d.max() - d.min())
            items_sold
2018-12-01           6
2018-12-04           6
>>>

>>> df3.resample('3D').agg(['sum'])
           items_sold
                  sum
2018-12-01         12
2018-12-04         39
>>>
>>> df3.resample('3D').agg(['mean'])
           items_sold
                 mean
2018-12-01          4
2018-12-04         13
>>>
>>> df3.resample('3D').agg(['std'])
           items_sold
                  std
2018-12-01        3.0
2018-12-04        3.0
>>>
>>> df3.resample('3D').agg(['std', 'mean', 'sum'])
           items_sold
                  std mean sum
2018-12-01        3.0    4  12
2018-12-04        3.0   13  39
>>>
>>> mu = 12 / 3
>>> mu
4.0
>>>
>>> (4 - 1) ** 2 + (4-4) + (4-7)
6
>>> (4 - 1) ** 2 + (4-4) + (4-7) ** 2
18
>>>
>>> 24
24
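>>> pow(18/3, 1/2)  # population standard deviation: divides by N = 3
2.449489742783178
>>> pow(18/(3 - 1), 1/2)  # sample standard deviation: divides by N - 1; pandas' std() uses this by default (ddof=1), hence the 3.0 above
3.0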
>>>

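Note: the mean deviation is also called the Mean Absolute Deviation (MAD) because it is the mean of the absolute deviations from the mean. `DataFrame.mad()` was deprecated in pandas 1.5 and removed in pandas 2.0; on newer versions the equivalent is `(df - df.mean()).abs().mean()`.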

>>>
>>> df4 = pd.DataFrame({'a': [3, 6, 6, 7, 8, 11, 15, 16]})
>>> df4
    a
0   3
1   6
2   6
3   7
4   8
5  11
6  15
7  16
>>>
>>> df4.mad()
a    3.75
dtype: float64
>>>
>>> df4.mad(axis=1) # mean absolute deviation
0    0.0
1    0.0
2    0.0
3    0.0
4    0.0
5    0.0
6    0.0
7    0.0
dtype: float64
>>>
>>>
>>> import numpy as np
>>> df5 = pd.DataFrame({'a': [3, 6, 6, 7, 8, 11, 15, 16, np.nan, np.nan]})
>>> df5
      a
0   3.0
1   6.0
2   6.0
3   7.0
4   8.0
5  11.0
6  15.0
7  16.0
8   NaN
9   NaN
>>>
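>>> # NOTE: df4 as defined earlier has 8 rows and no NaN, but the outputs below (NaN with skipna=False, 10 rows) match the NaN-containing data of df5. df4 was evidently reassigned in a step missing from this transcript (e.g. df4 = df5.copy()).
>>> df4.mad()  # skipna=True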
a    3.75
dtype: float64
>>>
>>> df4.mad(skipna=False)
a   NaN
dtype: float64
>>>
>>> df4.fillna(0)
      a
0   3.0
1   6.0
2   6.0
3   7.0
4   8.0
5  11.0
6  15.0
7  16.0
8   0.0
9   0.0
>>>
>>> df4
      a
0   3.0
1   6.0
2   6.0
3   7.0
4   8.0
5  11.0
6  15.0
7  16.0
8   NaN
9   NaN
>>>
>>> df4.fillna(0, inplace=True)
>>> df4
      a
0   3.0
1   6.0
2   6.0
3   7.0
4   8.0
5  11.0
6  15.0
7  16.0
8   0.0
9   0.0
>>>
>>> df4.mad(skipna=False)
a    4.24
dtype: float64
>>>
>>> df4['a'].sum()
72.0
>>>
>>> df4['a'].count()
10
>>> df4['a'].sum(skipna=False)
72.0
>>>
>>> df4['a'].mean()
7.2
>>>
>>> df4['a'].mean(skipna=False)
7.2
>>> df4['a'].mean(skipna=True)
7.2
>>> df4['a'].sum() / 10
7.2
>>> df4['a'].sum() / 8
9.0
>>> df4
      a
0   3.0
1   6.0
2   6.0
3   7.0
4   8.0
5  11.0
6  15.0
7  16.0
8   0.0
9   0.0
>>>
>>> df4
      a
0   3.0
1   6.0
2   6.0
3   7.0
4   8.0
5  11.0
6  15.0
7  16.0
8   0.0
9   0.0
>>>
>>> df5.mean()
a    9.0
dtype: float64
>>>
>>> df5.mean(skipna=False)
a   NaN
dtype: float64
>>>
>>> df5.fillna(0)
      a
0   3.0
1   6.0
2   6.0
3   7.0
4   8.0
5  11.0
6  15.0
7  16.0
8   0.0
9   0.0
>>>
>>> df5.fillna(0, inplace=True)
>>> df5.mean()
a    7.2
dtype: float64
>>>
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment