Created
April 16, 2021 23:21
-
-
Save tmbdev/c7fe7a3241ff6aae7ffe7646697b2f11 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"id": "first-hundred", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Populating the interactive namespace from numpy and matplotlib\n" | |
] | |
} | |
], | |
"source": [ | |
"%pylab inline" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"id": "political-controversy", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"import json\n", | |
"import webdataset as wds\n", | |
"import dbm\n", | |
"import dbm.ndbm\n", | |
"import shelve\n", | |
"import os\n", | |
"import os.path\n", | |
"import glob\n", | |
"import random" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"id": "photographic-cream", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"def load_meta(fname=\"train.json\"):\n", | |
" with open(fname) as stream:\n", | |
" meta = json.load(stream)\n", | |
" categories = {c[\"id\"]: c[\"name\"] for c in meta[\"categories\"]}\n", | |
" images = {x[\"file_name\"]: x for x in meta[\"images\"]}\n", | |
" images_by_id = {x[\"id\"]: x for x in images.values()}\n", | |
" for x in images.values():\n", | |
" x[\"annotations\"] = []\n", | |
" for x in meta[\"annotations\"]:\n", | |
" x[\"category_name\"] = categories[x[\"category_id\"]]\n", | |
" images_by_id[x[\"image_id\"]][\"annotations\"].append(x)\n", | |
" return images\n", | |
"\n", | |
"# Index the training annotations (default file: train.json) by image file name.\n", | |
"meta = load_meta()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 5, | |
"id": "living-cheese", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"{'file_name': 'PMC3866684_00003.jpg', 'height': 811, 'id': 0, 'width': 613, 'annotations': [{'segmentation': [[52.38, 444.87, 291.97, 444.87, 291.97, 456.42, 291.97, 456.42, 291.97, 465.8, 291.97, 465.8, 291.97, 476.27, 291.97, 476.27, 291.97, 487.8, 291.97, 487.8, 291.97, 498.26, 86.41, 498.26, 86.41, 508.73, 40.42, 508.73, 40.42, 497.17, 40.42, 497.17, 40.42, 486.71, 40.42, 486.71, 40.42, 476.27, 40.42, 476.27, 40.42, 466.88, 40.42, 466.88, 40.42, 455.33, 52.38, 455.33, 52.38, 444.87]], 'area': 13787.433082525036, 'iscrowd': 0, 'image_id': 0, 'bbox': [40.42, 444.87, 251.55, 63.86], 'category_id': 1, 'id': 0, 'category_name': 'text'}, {'segmentation': [[309.91, 444.91, 561.46, 444.91, 561.46, 456.42, 345.43, 456.42, 345.43, 466.88, 309.91, 466.88, 309.91, 456.42, 309.91, 456.42, 309.91, 444.91]], 'area': 3266.252633666969, 'iscrowd': 0, 'image_id': 0, 'bbox': [309.91, 444.91, 251.55, 21.97], 'category_id': 1, 'id': 1, 'category_name': 'text'}, {'segmentation': [[52.38, 507.66, 291.97, 507.66, 291.97, 519.19, 291.97, 519.19, 291.97, 529.65, 291.97, 529.65, 291.97, 540.11, 40.42, 540.11, 40.42, 528.56, 40.42, 528.56, 40.42, 518.1, 52.38, 518.1, 52.38, 507.66]], 'area': 8037.095046702656, 'iscrowd': 0, 'image_id': 0, 'bbox': [40.42, 507.66, 251.55, 32.45], 'category_id': 1, 'id': 2, 'category_name': 'text'}, {'segmentation': [[321.87, 465.8, 561.46, 465.8, 561.46, 477.34, 561.46, 477.34, 561.46, 486.71, 561.46, 486.71, 561.46, 498.26, 561.46, 498.26, 561.46, 508.73, 561.46, 508.73, 561.46, 519.19, 561.46, 519.19, 561.46, 529.65, 561.46, 529.65, 561.46, 540.11, 309.91, 540.11, 309.91, 529.65, 309.91, 529.65, 309.91, 518.09, 309.91, 518.09, 309.91, 507.63, 309.91, 507.63, 309.91, 497.18, 309.91, 497.18, 309.91, 487.8, 309.91, 487.8, 309.91, 476.26, 321.87, 476.26, 321.87, 465.8]], 'area': 18566.00016906159, 'iscrowd': 0, 'image_id': 0, 'bbox': [309.91, 465.8, 251.55, 74.3], 'category_id': 1, 'id': 3, 'category_name': 'text'}, {'segmentation': [[40.42, 397.09, 561.44, 
397.09, 561.44, 405.57, 561.44, 405.57, 561.44, 414.78, 561.44, 414.78, 561.44, 423.35, 443.73, 423.35, 443.73, 431.92, 40.42, 431.92, 40.42, 422.7, 40.42, 422.7, 40.42, 414.78, 40.42, 414.78, 40.42, 405.57, 40.42, 405.57, 40.42, 397.09]], 'area': 17140.38018086704, 'iscrowd': 0, 'image_id': 0, 'bbox': [40.42, 397.09, 521.02, 34.83], 'category_id': 1, 'id': 4, 'category_name': 'text'}, {'segmentation': [[40.42, 569.85, 561.44, 569.85, 561.44, 579.09, 76.16, 579.09, 76.16, 587.66, 40.42, 587.66, 40.42, 578.53, 40.42, 578.53, 40.42, 569.85]], 'area': 5122.1154595029075, 'iscrowd': 0, 'image_id': 0, 'bbox': [40.42, 569.85, 521.02, 17.81], 'category_id': 1, 'id': 5, 'category_name': 'text'}, {'segmentation': [[40.42, 591.95, 561.47, 591.95, 561.47, 753.25, 40.42, 753.25, 40.42, 591.95]], 'area': 84042.63561600004, 'iscrowd': 0, 'image_id': 0, 'bbox': [40.42, 591.95, 521.05, 161.3], 'category_id': 4, 'id': 6, 'category_name': 'table'}, {'segmentation': [[40.42, 561.19, 63.16, 561.19, 63.16, 570.52, 40.42, 570.52, 40.42, 561.19]], 'area': 212.07894347510592, 'iscrowd': 0, 'image_id': 0, 'bbox': [40.42, 561.19, 22.74, 9.33], 'category_id': 2, 'id': 7, 'category_name': 'title'}, {'segmentation': [[120.94, 55.57, 480.94, 55.57, 480.94, 392.77, 120.94, 392.77, 120.94, 55.57]], 'area': 121394.99978910788, 'iscrowd': 0, 'image_id': 0, 'bbox': [120.94, 55.57, 360.0, 337.21], 'category_id': 5, 'id': 8, 'category_name': 'figure'}]}\n" | |
] | |
} | |
], | |
"source": [ | |
"print(list(meta.values())[0])" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 6, | |
"id": "standard-bahamas", | |
"metadata": { | |
"scrolled": true, | |
"tags": [] | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"# writing publaynet-train-000000.tar 0 0.0 GB 0\n", | |
"# writing publaynet-train-000001.tar 985 0.3 GB 985\n", | |
"# writing publaynet-train-000002.tar 988 0.3 GB 1973\n", | |
"# writing publaynet-train-000003.tar 979 0.3 GB 2952\n", | |
"# writing publaynet-train-000004.tar 995 0.3 GB 3947\n", | |
"# writing publaynet-train-000005.tar 976 0.3 GB 4923\n", | |
"# writing publaynet-train-000006.tar 1001 0.3 GB 5924\n", | |
"# writing publaynet-train-000007.tar 989 0.3 GB 6913\n", | |
"# writing publaynet-train-000008.tar 977 0.3 GB 7890\n", | |
"# writing publaynet-train-000009.tar 967 0.3 GB 8857\n", | |
"# writing publaynet-train-000010.tar 988 0.3 GB 9845\n", | |
"# writing publaynet-train-000011.tar 1003 0.3 GB 10848\n", | |
"# writing publaynet-train-000012.tar 985 0.3 GB 11833\n", | |
"# writing publaynet-train-000013.tar 973 0.3 GB 12806\n", | |
"# writing publaynet-train-000014.tar 981 0.3 GB 13787\n", | |
"# writing publaynet-train-000015.tar 981 0.3 GB 14768\n", | |
"# writing publaynet-train-000016.tar 989 0.3 GB 15757\n", | |
"# writing publaynet-train-000017.tar 989 0.3 GB 16746\n", | |
"# writing publaynet-train-000018.tar 1000 0.3 GB 17746\n", | |
"# writing publaynet-train-000019.tar 981 0.3 GB 18727\n", | |
"# writing publaynet-train-000020.tar 1008 0.3 GB 19735\n", | |
"# writing publaynet-train-000021.tar 979 0.3 GB 20714\n", | |
"# writing publaynet-train-000022.tar 982 0.3 GB 21696\n", | |
"# writing publaynet-train-000023.tar 994 0.3 GB 22690\n", | |
"# writing publaynet-train-000024.tar 989 0.3 GB 23679\n", | |
"# writing publaynet-train-000025.tar 985 0.3 GB 24664\n", | |
"# writing publaynet-train-000026.tar 991 0.3 GB 25655\n", | |
"# writing publaynet-train-000027.tar 989 0.3 GB 26644\n", | |
"# writing publaynet-train-000028.tar 985 0.3 GB 27629\n", | |
"# writing publaynet-train-000029.tar 1002 0.3 GB 28631\n", | |
"# writing publaynet-train-000030.tar 976 0.3 GB 29607\n", | |
"# writing publaynet-train-000031.tar 989 0.3 GB 30596\n", | |
"# writing publaynet-train-000032.tar 994 0.3 GB 31590\n", | |
"# writing publaynet-train-000033.tar 988 0.3 GB 32578\n", | |
"# writing publaynet-train-000034.tar 979 0.3 GB 33557\n", | |
"# writing publaynet-train-000035.tar 974 0.3 GB 34531\n", | |
"# writing publaynet-train-000036.tar 992 0.3 GB 35523\n", | |
"# writing publaynet-train-000037.tar 981 0.3 GB 36504\n", | |
"# writing publaynet-train-000038.tar 986 0.3 GB 37490\n", | |
"# writing publaynet-train-000039.tar 988 0.3 GB 38478\n", | |
"# writing publaynet-train-000040.tar 981 0.3 GB 39459\n", | |
"# writing publaynet-train-000041.tar 988 0.3 GB 40447\n", | |
"# writing publaynet-train-000042.tar 1000 0.3 GB 41447\n", | |
"# writing publaynet-train-000043.tar 982 0.3 GB 42429\n", | |
"# writing publaynet-train-000044.tar 980 0.3 GB 43409\n", | |
"# writing publaynet-train-000045.tar 985 0.3 GB 44394\n", | |
"# writing publaynet-train-000046.tar 980 0.3 GB 45374\n", | |
"# writing publaynet-train-000047.tar 976 0.3 GB 46350\n", | |
"# writing publaynet-train-000048.tar 985 0.3 GB 47335\n", | |
"# writing publaynet-train-000049.tar 991 0.3 GB 48326\n", | |
"# writing publaynet-train-000050.tar 1001 0.3 GB 49327\n", | |
"# writing publaynet-train-000051.tar 978 0.3 GB 50305\n", | |
"# writing publaynet-train-000052.tar 962 0.3 GB 51267\n", | |
"# writing publaynet-train-000053.tar 998 0.3 GB 52265\n", | |
"# writing publaynet-train-000054.tar 983 0.3 GB 53248\n", | |
"# writing publaynet-train-000055.tar 1000 0.3 GB 54248\n", | |
"# writing publaynet-train-000056.tar 979 0.3 GB 55227\n", | |
"# writing publaynet-train-000057.tar 987 0.3 GB 56214\n", | |
"# writing publaynet-train-000058.tar 974 0.3 GB 57188\n", | |
"# writing publaynet-train-000059.tar 978 0.3 GB 58166\n", | |
"# writing publaynet-train-000060.tar 980 0.3 GB 59146\n", | |
"# writing publaynet-train-000061.tar 1007 0.3 GB 60153\n", | |
"# writing publaynet-train-000062.tar 990 0.3 GB 61143\n", | |
"# writing publaynet-train-000063.tar 993 0.3 GB 62136\n", | |
"# writing publaynet-train-000064.tar 983 0.3 GB 63119\n", | |
"# writing publaynet-train-000065.tar 981 0.3 GB 64100\n", | |
"# writing publaynet-train-000066.tar 974 0.3 GB 65074\n", | |
"# writing publaynet-train-000067.tar 981 0.3 GB 66055\n", | |
"# writing publaynet-train-000068.tar 986 0.3 GB 67041\n", | |
"# writing publaynet-train-000069.tar 994 0.3 GB 68035\n", | |
"# writing publaynet-train-000070.tar 970 0.3 GB 69005\n", | |
"# writing publaynet-train-000071.tar 980 0.3 GB 69985\n", | |
"# writing publaynet-train-000072.tar 984 0.3 GB 70969\n", | |
"# writing publaynet-train-000073.tar 989 0.3 GB 71958\n", | |
"# writing publaynet-train-000074.tar 986 0.3 GB 72944\n", | |
"# writing publaynet-train-000075.tar 974 0.3 GB 73918\n", | |
"# writing publaynet-train-000076.tar 997 0.3 GB 74915\n", | |
"# writing publaynet-train-000077.tar 981 0.3 GB 75896\n", | |
"# writing publaynet-train-000078.tar 977 0.3 GB 76873\n", | |
"# writing publaynet-train-000079.tar 995 0.3 GB 77868\n", | |
"# writing publaynet-train-000080.tar 983 0.3 GB 78851\n", | |
"# writing publaynet-train-000081.tar 981 0.3 GB 79832\n", | |
"# writing publaynet-train-000082.tar 985 0.3 GB 80817\n", | |
"# writing publaynet-train-000083.tar 985 0.3 GB 81802\n", | |
"# writing publaynet-train-000084.tar 993 0.3 GB 82795\n", | |
"# writing publaynet-train-000085.tar 988 0.3 GB 83783\n", | |
"# writing publaynet-train-000086.tar 994 0.3 GB 84777\n", | |
"# writing publaynet-train-000087.tar 983 0.3 GB 85760\n", | |
"# writing publaynet-train-000088.tar 982 0.3 GB 86742\n", | |
"# writing publaynet-train-000089.tar 988 0.3 GB 87730\n", | |
"# writing publaynet-train-000090.tar 986 0.3 GB 88716\n", | |
"# writing publaynet-train-000091.tar 983 0.3 GB 89699\n", | |
"# writing publaynet-train-000092.tar 987 0.3 GB 90686\n", | |
"# writing publaynet-train-000093.tar 980 0.3 GB 91666\n", | |
"# writing publaynet-train-000094.tar 976 0.3 GB 92642\n", | |
"# writing publaynet-train-000095.tar 987 0.3 GB 93629\n", | |
"# writing publaynet-train-000096.tar 992 0.3 GB 94621\n", | |
"# writing publaynet-train-000097.tar 978 0.3 GB 95599\n", | |
"# writing publaynet-train-000098.tar 989 0.3 GB 96588\n", | |
"# writing publaynet-train-000099.tar 970 0.3 GB 97558\n", | |
"# writing publaynet-train-000100.tar 992 0.3 GB 98550\n", | |
"# writing publaynet-train-000101.tar 998 0.3 GB 99548\n", | |
"# writing publaynet-train-000102.tar 986 0.3 GB 100534\n", | |
"# writing publaynet-train-000103.tar 993 0.3 GB 101527\n", | |
"# writing publaynet-train-000104.tar 974 0.3 GB 102501\n", | |
"# writing publaynet-train-000105.tar 971 0.3 GB 103472\n", | |
"# writing publaynet-train-000106.tar 982 0.3 GB 104454\n", | |
"# writing publaynet-train-000107.tar 985 0.3 GB 105439\n", | |
"# writing publaynet-train-000108.tar 982 0.3 GB 106421\n", | |
"# writing publaynet-train-000109.tar 983 0.3 GB 107404\n", | |
"# writing publaynet-train-000110.tar 998 0.3 GB 108402\n", | |
"# writing publaynet-train-000111.tar 981 0.3 GB 109383\n", | |
"# writing publaynet-train-000112.tar 980 0.3 GB 110363\n", | |
"# writing publaynet-train-000113.tar 984 0.3 GB 111347\n", | |
"# writing publaynet-train-000114.tar 976 0.3 GB 112323\n", | |
"# writing publaynet-train-000115.tar 971 0.3 GB 113294\n", | |
"# writing publaynet-train-000116.tar 993 0.3 GB 114287\n", | |
"# writing publaynet-train-000117.tar 988 0.3 GB 115275\n", | |
"# writing publaynet-train-000118.tar 988 0.3 GB 116263\n", | |
"# writing publaynet-train-000119.tar 977 0.3 GB 117240\n", | |
"# writing publaynet-train-000120.tar 994 0.3 GB 118234\n", | |
"# writing publaynet-train-000121.tar 984 0.3 GB 119218\n", | |
"# writing publaynet-train-000122.tar 985 0.3 GB 120203\n", | |
"# writing publaynet-train-000123.tar 990 0.3 GB 121193\n", | |
"# writing publaynet-train-000124.tar 977 0.3 GB 122170\n", | |
"# writing publaynet-train-000125.tar 1003 0.3 GB 123173\n", | |
"# writing publaynet-train-000126.tar 992 0.3 GB 124165\n", | |
"# writing publaynet-train-000127.tar 1005 0.3 GB 125170\n", | |
"# writing publaynet-train-000128.tar 990 0.3 GB 126160\n", | |
"# writing publaynet-train-000129.tar 968 0.3 GB 127128\n", | |
"# writing publaynet-train-000130.tar 1002 0.3 GB 128130\n", | |
"# writing publaynet-train-000131.tar 985 0.3 GB 129115\n", | |
"# writing publaynet-train-000132.tar 983 0.3 GB 130098\n", | |
"# writing publaynet-train-000133.tar 994 0.3 GB 131092\n", | |
"# writing publaynet-train-000134.tar 989 0.3 GB 132081\n", | |
"# writing publaynet-train-000135.tar 996 0.3 GB 133077\n", | |
"# writing publaynet-train-000136.tar 990 0.3 GB 134067\n", | |
"# writing publaynet-train-000137.tar 982 0.3 GB 135049\n", | |
"# writing publaynet-train-000138.tar 975 0.3 GB 136024\n", | |
"# writing publaynet-train-000139.tar 993 0.3 GB 137017\n", | |
"# writing publaynet-train-000140.tar 986 0.3 GB 138003\n", | |
"# writing publaynet-train-000141.tar 990 0.3 GB 138993\n", | |
"# writing publaynet-train-000142.tar 995 0.3 GB 139988\n", | |
"# writing publaynet-train-000143.tar 976 0.3 GB 140964\n", | |
"# writing publaynet-train-000144.tar 1009 0.3 GB 141973\n", | |
"# writing publaynet-train-000145.tar 981 0.3 GB 142954\n", | |
"# writing publaynet-train-000146.tar 990 0.3 GB 143944\n", | |
"# writing publaynet-train-000147.tar 984 0.3 GB 144928\n", | |
"# writing publaynet-train-000148.tar 981 0.3 GB 145909\n", | |
"# writing publaynet-train-000149.tar 982 0.3 GB 146891\n", | |
"# writing publaynet-train-000150.tar 979 0.3 GB 147870\n", | |
"# writing publaynet-train-000151.tar 991 0.3 GB 148861\n", | |
"# writing publaynet-train-000152.tar 972 0.3 GB 149833\n", | |
"# writing publaynet-train-000153.tar 992 0.3 GB 150825\n", | |
"# writing publaynet-train-000154.tar 998 0.3 GB 151823\n", | |
"# writing publaynet-train-000155.tar 989 0.3 GB 152812\n", | |
"# writing publaynet-train-000156.tar 987 0.3 GB 153799\n", | |
"# writing publaynet-train-000157.tar 962 0.3 GB 154761\n", | |
"# writing publaynet-train-000158.tar 1009 0.3 GB 155770\n", | |
"# writing publaynet-train-000159.tar 977 0.3 GB 156747\n", | |
"# writing publaynet-train-000160.tar 993 0.3 GB 157740\n", | |
"# writing publaynet-train-000161.tar 993 0.3 GB 158733\n", | |
"# writing publaynet-train-000162.tar 988 0.3 GB 159721\n", | |
"# writing publaynet-train-000163.tar 982 0.3 GB 160703\n", | |
"# writing publaynet-train-000164.tar 985 0.3 GB 161688\n", | |
"# writing publaynet-train-000165.tar 978 0.3 GB 162666\n", | |
"# writing publaynet-train-000166.tar 992 0.3 GB 163658\n", | |
"# writing publaynet-train-000167.tar 985 0.3 GB 164643\n", | |
"# writing publaynet-train-000168.tar 970 0.3 GB 165613\n", | |
"# writing publaynet-train-000169.tar 998 0.3 GB 166611\n", | |
"# writing publaynet-train-000170.tar 977 0.3 GB 167588\n", | |
"# writing publaynet-train-000171.tar 992 0.3 GB 168580\n", | |
"# writing publaynet-train-000172.tar 991 0.3 GB 169571\n", | |
"# writing publaynet-train-000173.tar 981 0.3 GB 170552\n", | |
"# writing publaynet-train-000174.tar 990 0.3 GB 171542\n", | |
"# writing publaynet-train-000175.tar 988 0.3 GB 172530\n", | |
"# writing publaynet-train-000176.tar 977 0.3 GB 173507\n", | |
"# writing publaynet-train-000177.tar 981 0.3 GB 174488\n", | |
"# writing publaynet-train-000178.tar 991 0.3 GB 175479\n", | |
"# writing publaynet-train-000179.tar 990 0.3 GB 176469\n", | |
"# writing publaynet-train-000180.tar 987 0.3 GB 177456\n", | |
"# writing publaynet-train-000181.tar 998 0.3 GB 178454\n", | |
"# writing publaynet-train-000182.tar 991 0.3 GB 179445\n", | |
"# writing publaynet-train-000183.tar 985 0.3 GB 180430\n", | |
"# writing publaynet-train-000184.tar 977 0.3 GB 181407\n", | |
"# writing publaynet-train-000185.tar 992 0.3 GB 182399\n", | |
"# writing publaynet-train-000186.tar 988 0.3 GB 183387\n", | |
"# writing publaynet-train-000187.tar 983 0.3 GB 184370\n", | |
"# writing publaynet-train-000188.tar 976 0.3 GB 185346\n", | |
"# writing publaynet-train-000189.tar 975 0.3 GB 186321\n", | |
"# writing publaynet-train-000190.tar 977 0.3 GB 187298\n", | |
"# writing publaynet-train-000191.tar 982 0.3 GB 188280\n", | |
"# writing publaynet-train-000192.tar 991 0.3 GB 189271\n", | |
"# writing publaynet-train-000193.tar 987 0.3 GB 190258\n", | |
"# writing publaynet-train-000194.tar 980 0.3 GB 191238\n", | |
"# writing publaynet-train-000195.tar 973 0.3 GB 192211\n", | |
"# writing publaynet-train-000196.tar 991 0.3 GB 193202\n", | |
"# writing publaynet-train-000197.tar 976 0.3 GB 194178\n", | |
"# writing publaynet-train-000198.tar 989 0.3 GB 195167\n", | |
"# writing publaynet-train-000199.tar 984 0.3 GB 196151\n", | |
"# writing publaynet-train-000200.tar 993 0.3 GB 197144\n", | |
"# writing publaynet-train-000201.tar 983 0.3 GB 198127\n", | |
"# writing publaynet-train-000202.tar 972 0.3 GB 199099\n", | |
"# writing publaynet-train-000203.tar 990 0.3 GB 200089\n", | |
"# writing publaynet-train-000204.tar 989 0.3 GB 201078\n", | |
"# writing publaynet-train-000205.tar 999 0.3 GB 202077\n", | |
"# writing publaynet-train-000206.tar 981 0.3 GB 203058\n", | |
"# writing publaynet-train-000207.tar 994 0.3 GB 204052\n", | |
"# writing publaynet-train-000208.tar 983 0.3 GB 205035\n", | |
"# writing publaynet-train-000209.tar 981 0.3 GB 206016\n", | |
"# writing publaynet-train-000210.tar 963 0.3 GB 206979\n", | |
"# writing publaynet-train-000211.tar 984 0.3 GB 207963\n", | |
"# writing publaynet-train-000212.tar 974 0.3 GB 208937\n", | |
"# writing publaynet-train-000213.tar 993 0.3 GB 209930\n", | |
"# writing publaynet-train-000214.tar 989 0.3 GB 210919\n", | |
"# writing publaynet-train-000215.tar 983 0.3 GB 211902\n", | |
"# writing publaynet-train-000216.tar 969 0.3 GB 212871\n", | |
"# writing publaynet-train-000217.tar 974 0.3 GB 213845\n", | |
"# writing publaynet-train-000218.tar 976 0.3 GB 214821\n", | |
"# writing publaynet-train-000219.tar 982 0.3 GB 215803\n", | |
"# writing publaynet-train-000220.tar 975 0.3 GB 216778\n", | |
"# writing publaynet-train-000221.tar 990 0.3 GB 217768\n", | |
"# writing publaynet-train-000222.tar 993 0.3 GB 218761\n", | |
"# writing publaynet-train-000223.tar 981 0.3 GB 219742\n", | |
"# writing publaynet-train-000224.tar 977 0.3 GB 220719\n", | |
"# writing publaynet-train-000225.tar 986 0.3 GB 221705\n", | |
"# writing publaynet-train-000226.tar 1000 0.3 GB 222705\n", | |
"# writing publaynet-train-000227.tar 984 0.3 GB 223689\n", | |
"# writing publaynet-train-000228.tar 986 0.3 GB 224675\n", | |
"# writing publaynet-train-000229.tar 983 0.3 GB 225658\n", | |
"# writing publaynet-train-000230.tar 977 0.3 GB 226635\n", | |
"# writing publaynet-train-000231.tar 980 0.3 GB 227615\n", | |
"# writing publaynet-train-000232.tar 990 0.3 GB 228605\n", | |
"# writing publaynet-train-000233.tar 990 0.3 GB 229595\n", | |
"# writing publaynet-train-000234.tar 985 0.3 GB 230580\n", | |
"# writing publaynet-train-000235.tar 992 0.3 GB 231572\n", | |
"# writing publaynet-train-000236.tar 997 0.3 GB 232569\n", | |
"# writing publaynet-train-000237.tar 988 0.3 GB 233557\n", | |
"# writing publaynet-train-000238.tar 990 0.3 GB 234547\n", | |
"# writing publaynet-train-000239.tar 1001 0.3 GB 235548\n", | |
"# writing publaynet-train-000240.tar 960 0.3 GB 236508\n", | |
"# writing publaynet-train-000241.tar 987 0.3 GB 237495\n", | |
"# writing publaynet-train-000242.tar 982 0.3 GB 238477\n", | |
"# writing publaynet-train-000243.tar 985 0.3 GB 239462\n", | |
"# writing publaynet-train-000244.tar 977 0.3 GB 240439\n", | |
"# writing publaynet-train-000245.tar 996 0.3 GB 241435\n", | |
"# writing publaynet-train-000246.tar 980 0.3 GB 242415\n", | |
"# writing publaynet-train-000247.tar 1001 0.3 GB 243416\n", | |
"# writing publaynet-train-000248.tar 999 0.3 GB 244415\n", | |
"# writing publaynet-train-000249.tar 992 0.3 GB 245407\n", | |
"# writing publaynet-train-000250.tar 1007 0.3 GB 246414\n", | |
"# writing publaynet-train-000251.tar 997 0.3 GB 247411\n", | |
"# writing publaynet-train-000252.tar 1002 0.3 GB 248413\n", | |
"# writing publaynet-train-000253.tar 993 0.3 GB 249406\n", | |
"# writing publaynet-train-000254.tar 977 0.3 GB 250383\n", | |
"# writing publaynet-train-000255.tar 983 0.3 GB 251366\n", | |
"# writing publaynet-train-000256.tar 977 0.3 GB 252343\n", | |
"# writing publaynet-train-000257.tar 1000 0.3 GB 253343\n", | |
"# writing publaynet-train-000258.tar 1005 0.3 GB 254348\n", | |
"# writing publaynet-train-000259.tar 981 0.3 GB 255329\n", | |
"# writing publaynet-train-000260.tar 984 0.3 GB 256313\n", | |
"# writing publaynet-train-000261.tar 969 0.3 GB 257282\n", | |
"# writing publaynet-train-000262.tar 974 0.3 GB 258256\n", | |
"# writing publaynet-train-000263.tar 990 0.3 GB 259246\n", | |
"# writing publaynet-train-000264.tar 994 0.3 GB 260240\n", | |
"# writing publaynet-train-000265.tar 973 0.3 GB 261213\n", | |
"# writing publaynet-train-000266.tar 999 0.3 GB 262212\n", | |
"# writing publaynet-train-000267.tar 995 0.3 GB 263207\n", | |
"# writing publaynet-train-000268.tar 995 0.3 GB 264202\n", | |
"# writing publaynet-train-000269.tar 981 0.3 GB 265183\n", | |
"# writing publaynet-train-000270.tar 1002 0.3 GB 266185\n", | |
"# writing publaynet-train-000271.tar 977 0.3 GB 267162\n", | |
"# writing publaynet-train-000272.tar 968 0.3 GB 268130\n", | |
"# writing publaynet-train-000273.tar 991 0.3 GB 269121\n", | |
"# writing publaynet-train-000274.tar 988 0.3 GB 270109\n", | |
"# writing publaynet-train-000275.tar 984 0.3 GB 271093\n", | |
"# writing publaynet-train-000276.tar 995 0.3 GB 272088\n", | |
"# writing publaynet-train-000277.tar 989 0.3 GB 273077\n", | |
"# writing publaynet-train-000278.tar 997 0.3 GB 274074\n", | |
"# writing publaynet-train-000279.tar 986 0.3 GB 275060\n", | |
"# writing publaynet-train-000280.tar 983 0.3 GB 276043\n", | |
"# writing publaynet-train-000281.tar 986 0.3 GB 277029\n", | |
"# writing publaynet-train-000282.tar 978 0.3 GB 278007\n", | |
"# writing publaynet-train-000283.tar 982 0.3 GB 278989\n", | |
"# writing publaynet-train-000284.tar 979 0.3 GB 279968\n", | |
"# writing publaynet-train-000285.tar 984 0.3 GB 280952\n", | |
"# writing publaynet-train-000286.tar 979 0.3 GB 281931\n", | |
"# writing publaynet-train-000287.tar 997 0.3 GB 282928\n", | |
"# writing publaynet-train-000288.tar 992 0.3 GB 283920\n", | |
"# writing publaynet-train-000289.tar 1004 0.3 GB 284924\n", | |
"# writing publaynet-train-000290.tar 988 0.3 GB 285912\n", | |
"# writing publaynet-train-000291.tar 994 0.3 GB 286906\n", | |
"# writing publaynet-train-000292.tar 980 0.3 GB 287886\n", | |
"# writing publaynet-train-000293.tar 998 0.3 GB 288884\n", | |
"# writing publaynet-train-000294.tar 1002 0.3 GB 289886\n", | |
"# writing publaynet-train-000295.tar 992 0.3 GB 290878\n", | |
"# writing publaynet-train-000296.tar 989 0.3 GB 291867\n", | |
"# writing publaynet-train-000297.tar 993 0.3 GB 292860\n", | |
"# writing publaynet-train-000298.tar 981 0.3 GB 293841\n", | |
"# writing publaynet-train-000299.tar 999 0.3 GB 294840\n", | |
"# writing publaynet-train-000300.tar 990 0.3 GB 295830\n", | |
"# writing publaynet-train-000301.tar 974 0.3 GB 296804\n", | |
"# writing publaynet-train-000302.tar 989 0.3 GB 297793\n", | |
"# writing publaynet-train-000303.tar 978 0.3 GB 298771\n", | |
"# writing publaynet-train-000304.tar 992 0.3 GB 299763\n", | |
"# writing publaynet-train-000305.tar 989 0.3 GB 300752\n", | |
"# writing publaynet-train-000306.tar 990 0.3 GB 301742\n", | |
"# writing publaynet-train-000307.tar 981 0.3 GB 302723\n", | |
"# writing publaynet-train-000308.tar 998 0.3 GB 303721\n", | |
"# writing publaynet-train-000309.tar 989 0.3 GB 304710\n", | |
"# writing publaynet-train-000310.tar 985 0.3 GB 305695\n", | |
"# writing publaynet-train-000311.tar 974 0.3 GB 306669\n", | |
"# writing publaynet-train-000312.tar 997 0.3 GB 307666\n", | |
"# writing publaynet-train-000313.tar 974 0.3 GB 308640\n", | |
"# writing publaynet-train-000314.tar 968 0.3 GB 309608\n", | |
"# writing publaynet-train-000315.tar 1008 0.3 GB 310616\n", | |
"# writing publaynet-train-000316.tar 998 0.3 GB 311614\n", | |
"# writing publaynet-train-000317.tar 992 0.3 GB 312606\n", | |
"# writing publaynet-train-000318.tar 980 0.3 GB 313586\n", | |
"# writing publaynet-train-000319.tar 987 0.3 GB 314573\n", | |
"# writing publaynet-train-000320.tar 978 0.3 GB 315551\n", | |
"# writing publaynet-train-000321.tar 994 0.3 GB 316545\n", | |
"# writing publaynet-train-000322.tar 985 0.3 GB 317530\n", | |
"# writing publaynet-train-000323.tar 984 0.3 GB 318514\n", | |
"# writing publaynet-train-000324.tar 989 0.3 GB 319503\n", | |
"# writing publaynet-train-000325.tar 977 0.3 GB 320480\n", | |
"# writing publaynet-train-000326.tar 1000 0.3 GB 321480\n", | |
"# writing publaynet-train-000327.tar 988 0.3 GB 322468\n", | |
"# writing publaynet-train-000328.tar 993 0.3 GB 323461\n", | |
"# writing publaynet-train-000329.tar 994 0.3 GB 324455\n", | |
"# writing publaynet-train-000330.tar 979 0.3 GB 325434\n", | |
"# writing publaynet-train-000331.tar 994 0.3 GB 326428\n", | |
"# writing publaynet-train-000332.tar 1004 0.3 GB 327432\n", | |
"# writing publaynet-train-000333.tar 988 0.3 GB 328420\n", | |
"# writing publaynet-train-000334.tar 980 0.3 GB 329400\n", | |
"# writing publaynet-train-000335.tar 990 0.3 GB 330390\n", | |
"# writing publaynet-train-000336.tar 988 0.3 GB 331378\n", | |
"# writing publaynet-train-000337.tar 968 0.3 GB 332346\n", | |
"# writing publaynet-train-000338.tar 987 0.3 GB 333333\n", | |
"# writing publaynet-train-000339.tar 985 0.3 GB 334318\n", | |
"# writing publaynet-train-000340.tar 977 0.3 GB 335295\n" | |
] | |
} | |
], | |
"source": [ | |
"sink = wds.ShardWriter(\"publaynet-train-%06d.tar\", maxsize=3e8)\n", | |
"images = glob.glob(\"train/*\")\n", | |
"random.shuffle(images)\n", | |
"for path in images:\n", | |
" base = os.path.basename(path)\n", | |
" root = os.path.splitext(base)[0]\n", | |
" with open(path, \"rb\") as stream:\n", | |
" png = stream.read()\n", | |
" json = meta[base]\n", | |
" sample = dict(__key__=root, png=png, json=json)\n", | |
" sink.write(sample)\n", | |
"sink.close()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 9, | |
"id": "respective-reality", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"# writing publaynet-val-000000.tar 0 0.0 GB 0\n", | |
"# writing publaynet-val-000001.tar 967 0.3 GB 967\n", | |
"# writing publaynet-val-000002.tar 966 0.3 GB 1933\n", | |
"# writing publaynet-val-000003.tar 973 0.3 GB 2906\n", | |
"# writing publaynet-val-000004.tar 974 0.3 GB 3880\n", | |
"# writing publaynet-val-000005.tar 966 0.3 GB 4846\n", | |
"# writing publaynet-val-000006.tar 971 0.3 GB 5817\n", | |
"# writing publaynet-val-000007.tar 970 0.3 GB 6787\n", | |
"# writing publaynet-val-000008.tar 960 0.3 GB 7747\n", | |
"# writing publaynet-val-000009.tar 964 0.3 GB 8711\n", | |
"# writing publaynet-val-000010.tar 972 0.3 GB 9683\n", | |
"# writing publaynet-val-000011.tar 953 0.3 GB 10636\n" | |
] | |
} | |
], | |
"source": [ | |
"import json\n", | |
"\n", | |
"meta = load_meta(\"val.json\")\n", | |
"\n", | |
"sink = wds.ShardWriter(\"publaynet-val-%06d.tar\", maxsize=3e8)\n", | |
"images = glob.glob(\"val/*\")\n", | |
"random.shuffle(images)\n", | |
"for path in images:\n", | |
" base = os.path.basename(path)\n", | |
" root = os.path.splitext(base)[0]\n", | |
" with open(path, \"rb\") as stream:\n", | |
" png = stream.read()\n", | |
" info = meta[base]\n", | |
" sample = dict(__key__=root, png=png, json=info)\n", | |
" sink.write(sample)\n", | |
"sink.close()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 10, | |
"id": "thermal-produce", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"# writing publaynet-test-000000.tar 0 0.0 GB 0\n", | |
"# writing publaynet-test-000001.tar 1014 0.3 GB 1014\n", | |
"# writing publaynet-test-000002.tar 1013 0.3 GB 2027\n", | |
"# writing publaynet-test-000003.tar 1009 0.3 GB 3036\n", | |
"# writing publaynet-test-000004.tar 1008 0.3 GB 4044\n", | |
"# writing publaynet-test-000005.tar 1023 0.3 GB 5067\n", | |
"# writing publaynet-test-000006.tar 1006 0.3 GB 6073\n", | |
"# writing publaynet-test-000007.tar 1010 0.3 GB 7083\n", | |
"# writing publaynet-test-000008.tar 1008 0.3 GB 8091\n", | |
"# writing publaynet-test-000009.tar 992 0.3 GB 9083\n", | |
"# writing publaynet-test-000010.tar 1000 0.3 GB 10083\n", | |
"# writing publaynet-test-000011.tar 1006 0.3 GB 11089\n" | |
] | |
} | |
], | |
"source": [ | |
"sink = wds.ShardWriter(\"publaynet-test-%06d.tar\", maxsize=3e8)\n", | |
"images = glob.glob(\"test/*\")\n", | |
"random.shuffle(images)\n", | |
"for path in images:\n", | |
" base = os.path.basename(path)\n", | |
" root = os.path.splitext(base)[0]\n", | |
" with open(path, \"rb\") as stream:\n", | |
" png = stream.read()\n", | |
" #json = db[base]\n", | |
" sample = dict(__key__=root, png=png)\n", | |
" sink.write(sample)\n", | |
"sink.close()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"id": "binary-creation", | |
"metadata": {}, | |
"outputs": [], | |
"source": [] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.8.5" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 5 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment