Skip to content

Instantly share code, notes, and snippets.

@sancau
Created December 18, 2018 21:44
Show Gist options
  • Save sancau/8cdc1dc768507057e2f70aee39828ac7 to your computer and use it in GitHub Desktop.
Save sancau/8cdc1dc768507057e2f70aee39828ac7 to your computer and use it in GitHub Desktop.
Sparse pd.DataFrame
import pandas as pd

# Sample records used to demonstrate the sparse rendering below.
# The first row holds the column names; every following row is one file
# owned by a user, with its content type and size in the 'Mb' column.
data = [
    ['User', 'Content Type', 'File', 'Mb'],
    ['User 1', 'Text', 'F1', 0.1],
    ['User 1', 'Text', 'F2', 0.5],
    ['User 1', 'Video', 'V1', 100],
    ['User 2', 'Text', 'F1', 2],
    ['User 2', 'Video', 'V1', 75],
    ['User 3', 'Video', 'V2', 60],
]

# Split the header row from the records when building the frame.
df = pd.DataFrame(data[1:], columns=data[0])
class DFSparser:
    """Blank out values that repeat within each group of a DataFrame.

    Rows are grouped by ``group_by_column``. Inside every group, and
    independently for every column, only the first occurrence of a value is
    kept; later repeats are masked. The grouping column itself is treated
    like any other column, so its label shows only on the first row of each
    group.
    """

    def __init__(self, df, group_by_column):
        """Store the frame to sparsify and the grouping key.

        Args:
            df: The source ``pandas.DataFrame``. It is never modified.
            group_by_column: Passed unchanged to ``DataFrame.groupby``.
        """
        self.df = df
        self.group_by_column = group_by_column

    def get_mask(self):
        """Return a boolean frame that is True where a value is a repeat.

        A cell is True when the same value already appeared earlier in the
        same column within the same group. Rows come out in group iteration
        order, which may differ from the row order of ``self.df``; the
        original index labels are preserved.

        When there are no groups at all (for example an empty frame), an
        all-False mask with the index and columns of ``self.df`` is returned
        instead of failing on an empty concatenation.

        NOTE(review): ``groupby`` drops missing group keys by default, so
        rows with a NaN key presumably get no entry in the mask -- confirm
        that the resulting behaviour of ``apply`` for such rows is intended.
        """
        # Split once and reuse the pieces for every column.
        groups = [part for _, part in self.df.groupby(self.group_by_column)]
        if not groups:
            return pd.DataFrame(
                False, index=self.df.index, columns=self.df.columns
            )

        per_column = [
            pd.concat([part[column].duplicated() for part in groups])
            for column in self.df.columns
        ]
        return pd.concat(per_column, axis=1)

    def apply(self, fill=None):
        """Return a copy of the frame with in-group repeats blanked out.

        Args:
            fill: Replacement for the blanked cells. When ``None`` (the
                default) they are left as missing values. Because this uses
                ``fillna``, missing values already present in the frame are
                replaced as well.

        Returns:
            A new ``pandas.DataFrame``.
        """
        sparse = self.df.where(~self.get_mask())
        if fill is None:
            return sparse
        return sparse.fillna(fill)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment