Skip to content

Instantly share code, notes, and snippets.

@hamletbatista
Last active April 9, 2019 18:58
Show Gist options
  • Select an option

  • Save hamletbatista/0abb6561f297e1068fa3136fb1907260 to your computer and use it in GitHub Desktop.

Select an option

Save hamletbatista/0abb6561f297e1068fa3136fb1907260 to your computer and use it in GitHub Desktop.
def img_size_group(size, max_size=50000, bin_width=1000):
    """Return the label of the file-size bin that *size* falls into.

    Bins are half-open intervals ``[lower, upper)`` of ``bin_width`` each,
    covering ``0`` up to ``max_size``. Anything at or above ``max_size`` is
    collected in a single open-ended bin.

    Args:
        size: Image file size (same unit as ``max_size``/``bin_width``).
        max_size: Lower edge of the open-ended top bin.
        bin_width: Width of every regular bin; must be positive.

    Returns:
        ``"<lower>-<upper>"`` for sizes in ``[0, max_size)``,
        ``"<max_size>+"`` for sizes ``>= max_size``, and ``None`` when the
        size is negative or NaN (no bin applies).

    Raises:
        ValueError: If ``bin_width`` is not positive.
    """
    if bin_width <= 0:
        raise ValueError("bin_width must be positive")

    if size >= max_size:
        return "{}+".format(max_size)

    # Written as a negated comparison so that NaN (for which every
    # comparison is False) is rejected together with negative sizes.
    if not size >= 0:
        return None

    # Floor to the bin's lower edge. Using arithmetic instead of scanning a
    # list of (lower, upper) pairs guarantees that edge values such as 1000
    # and the final bin below max_size are always assigned a label.
    lower = int(size // bin_width) * bin_width
    upper = min(lower + bin_width, max_size)
    return "{}-{}".format(lower, upper)
# Tag every image row with the label of its file-size bin.
img_counts["filesize_group"] = img_counts["filesize"].apply(img_size_group)

# Turn the bin labels into indicator columns (drop_first=True omits the
# first label's column), then add them up per URL so each cell holds the
# number of images from that URL in that bin.
onehot_img = (
    img_counts[["url"]]
    .join(pd.get_dummies(img_counts["filesize_group"], drop_first=True))
    .groupby("url")
    .sum()
    .reset_index()
)

# Re-attach the per-URL counts to the URL column of img_counts and collapse
# the repeated rows, leaving a single row per URL.
onehot_img = pd.merge(
    img_counts[["url"]], onehot_img, on="url", how="left"
).drop_duplicates()

# Resulting layout:
#   row    -> one URL
#   column -> one image-size bin
#   value  -> count of that URL's images in the bin
onehot_img.drop("url", axis=1).head()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment