Last active
April 9, 2019 18:58
-
-
Save hamletbatista/0abb6561f297e1068fa3136fb1907260 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def img_size_group(size):
    """Return the label of the 1000-wide size bin that contains *size*.

    Bins are half-open, ``[lower, upper)``, and cover ``0`` up to
    ``max_size``: ``0 -> "0-1000"``, ``999 -> "0-1000"``,
    ``1000 -> "1000-2000"``, ..., ``49999 -> "49000-50000"``.
    Any value at or above ``max_size`` is labelled ``"50000+"``.

    Args:
        size: Numeric image size (int or float).

    Returns:
        The bin label as a string, or ``None`` when *size* is negative
        or NaN (such values belong to no bin).
    """
    max_size = 50000
    bin_width = 1000

    # Overflow bucket: everything from max_size upwards.
    if size >= max_size:
        return str(max_size) + "+"

    # Written as "not >=" so that NaN (for which every comparison is
    # False) is rejected together with negative sizes.
    if not size >= 0:
        return None

    # Compute the bin directly instead of scanning a list of ranges;
    # int() keeps the label free of a trailing ".0" for float input.
    lower = int(size // bin_width) * bin_width
    return "{}-{}".format(lower, lower + bin_width)
# Label every image row with its file-size bin.
img_counts["filesize_group"] = img_counts["filesize"].apply(img_size_group)

# One-hot encode the bin labels next to each row's URL, then add the
# indicator columns up per URL so each cell holds an image count.
# NOTE: drop_first=True omits the column for the first bin level.
onehot_img = (
    img_counts[["url"]]
    .join(pd.get_dummies(img_counts["filesize_group"], drop_first=True))
    .groupby("url")
    .sum()
    .reset_index()
)

# Re-attach the per-URL counts to the URL column of img_counts and
# collapse the repeated rows so that a single row per URL remains.
onehot_img = (
    img_counts[["url"]]
    .merge(onehot_img, on="url", how="left")
    .drop_duplicates()
)

# Row    = URL
# Column = image-size bucket
# Value  = number of images from that URL falling in that bucket
onehot_img.drop("url", axis=1).head()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment