Skip to content

Instantly share code, notes, and snippets.

@tmbdev
Created April 16, 2021 23:21
Show Gist options
  • Save tmbdev/c7fe7a3241ff6aae7ffe7646697b2f11 to your computer and use it in GitHub Desktop.
Save tmbdev/c7fe7a3241ff6aae7ffe7646697b2f11 to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "first-hundred",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Populating the interactive namespace from numpy and matplotlib\n"
]
}
],
"source": [
"%pylab inline"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "political-controversy",
"metadata": {},
"outputs": [],
"source": [
"import json\n",
"import webdataset as wds\n",
"import dbm\n",
"import dbm.ndbm\n",
"import shelve\n",
"import os\n",
"import os.path\n",
"import glob\n",
"import random"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "photographic-cream",
"metadata": {},
"outputs": [],
"source": [
"def load_meta(fname=\"train.json\"):\n",
" with open(fname) as stream:\n",
" meta = json.load(stream)\n",
" categories = {c[\"id\"]: c[\"name\"] for c in meta[\"categories\"]}\n",
" images = {x[\"file_name\"]: x for x in meta[\"images\"]}\n",
" images_by_id = {x[\"id\"]: x for x in images.values()}\n",
" for x in images.values():\n",
" x[\"annotations\"] = []\n",
" for x in meta[\"annotations\"]:\n",
" x[\"category_name\"] = categories[x[\"category_id\"]]\n",
" images_by_id[x[\"image_id\"]][\"annotations\"].append(x)\n",
" return images\n",
"\n",
"meta = load_meta()"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "living-cheese",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"{'file_name': 'PMC3866684_00003.jpg', 'height': 811, 'id': 0, 'width': 613, 'annotations': [{'segmentation': [[52.38, 444.87, 291.97, 444.87, 291.97, 456.42, 291.97, 456.42, 291.97, 465.8, 291.97, 465.8, 291.97, 476.27, 291.97, 476.27, 291.97, 487.8, 291.97, 487.8, 291.97, 498.26, 86.41, 498.26, 86.41, 508.73, 40.42, 508.73, 40.42, 497.17, 40.42, 497.17, 40.42, 486.71, 40.42, 486.71, 40.42, 476.27, 40.42, 476.27, 40.42, 466.88, 40.42, 466.88, 40.42, 455.33, 52.38, 455.33, 52.38, 444.87]], 'area': 13787.433082525036, 'iscrowd': 0, 'image_id': 0, 'bbox': [40.42, 444.87, 251.55, 63.86], 'category_id': 1, 'id': 0, 'category_name': 'text'}, {'segmentation': [[309.91, 444.91, 561.46, 444.91, 561.46, 456.42, 345.43, 456.42, 345.43, 466.88, 309.91, 466.88, 309.91, 456.42, 309.91, 456.42, 309.91, 444.91]], 'area': 3266.252633666969, 'iscrowd': 0, 'image_id': 0, 'bbox': [309.91, 444.91, 251.55, 21.97], 'category_id': 1, 'id': 1, 'category_name': 'text'}, {'segmentation': [[52.38, 507.66, 291.97, 507.66, 291.97, 519.19, 291.97, 519.19, 291.97, 529.65, 291.97, 529.65, 291.97, 540.11, 40.42, 540.11, 40.42, 528.56, 40.42, 528.56, 40.42, 518.1, 52.38, 518.1, 52.38, 507.66]], 'area': 8037.095046702656, 'iscrowd': 0, 'image_id': 0, 'bbox': [40.42, 507.66, 251.55, 32.45], 'category_id': 1, 'id': 2, 'category_name': 'text'}, {'segmentation': [[321.87, 465.8, 561.46, 465.8, 561.46, 477.34, 561.46, 477.34, 561.46, 486.71, 561.46, 486.71, 561.46, 498.26, 561.46, 498.26, 561.46, 508.73, 561.46, 508.73, 561.46, 519.19, 561.46, 519.19, 561.46, 529.65, 561.46, 529.65, 561.46, 540.11, 309.91, 540.11, 309.91, 529.65, 309.91, 529.65, 309.91, 518.09, 309.91, 518.09, 309.91, 507.63, 309.91, 507.63, 309.91, 497.18, 309.91, 497.18, 309.91, 487.8, 309.91, 487.8, 309.91, 476.26, 321.87, 476.26, 321.87, 465.8]], 'area': 18566.00016906159, 'iscrowd': 0, 'image_id': 0, 'bbox': [309.91, 465.8, 251.55, 74.3], 'category_id': 1, 'id': 3, 'category_name': 'text'}, {'segmentation': [[40.42, 397.09, 561.44, 397.09, 561.44, 405.57, 561.44, 405.57, 561.44, 414.78, 561.44, 414.78, 561.44, 423.35, 443.73, 423.35, 443.73, 431.92, 40.42, 431.92, 40.42, 422.7, 40.42, 422.7, 40.42, 414.78, 40.42, 414.78, 40.42, 405.57, 40.42, 405.57, 40.42, 397.09]], 'area': 17140.38018086704, 'iscrowd': 0, 'image_id': 0, 'bbox': [40.42, 397.09, 521.02, 34.83], 'category_id': 1, 'id': 4, 'category_name': 'text'}, {'segmentation': [[40.42, 569.85, 561.44, 569.85, 561.44, 579.09, 76.16, 579.09, 76.16, 587.66, 40.42, 587.66, 40.42, 578.53, 40.42, 578.53, 40.42, 569.85]], 'area': 5122.1154595029075, 'iscrowd': 0, 'image_id': 0, 'bbox': [40.42, 569.85, 521.02, 17.81], 'category_id': 1, 'id': 5, 'category_name': 'text'}, {'segmentation': [[40.42, 591.95, 561.47, 591.95, 561.47, 753.25, 40.42, 753.25, 40.42, 591.95]], 'area': 84042.63561600004, 'iscrowd': 0, 'image_id': 0, 'bbox': [40.42, 591.95, 521.05, 161.3], 'category_id': 4, 'id': 6, 'category_name': 'table'}, {'segmentation': [[40.42, 561.19, 63.16, 561.19, 63.16, 570.52, 40.42, 570.52, 40.42, 561.19]], 'area': 212.07894347510592, 'iscrowd': 0, 'image_id': 0, 'bbox': [40.42, 561.19, 22.74, 9.33], 'category_id': 2, 'id': 7, 'category_name': 'title'}, {'segmentation': [[120.94, 55.57, 480.94, 55.57, 480.94, 392.77, 120.94, 392.77, 120.94, 55.57]], 'area': 121394.99978910788, 'iscrowd': 0, 'image_id': 0, 'bbox': [120.94, 55.57, 360.0, 337.21], 'category_id': 5, 'id': 8, 'category_name': 'figure'}]}\n"
]
}
],
"source": [
"print(list(meta.values())[0])"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "standard-bahamas",
"metadata": {
"scrolled": true,
"tags": []
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"# writing publaynet-train-000000.tar 0 0.0 GB 0\n",
"# writing publaynet-train-000001.tar 985 0.3 GB 985\n",
"# writing publaynet-train-000002.tar 988 0.3 GB 1973\n",
"# writing publaynet-train-000003.tar 979 0.3 GB 2952\n",
"# writing publaynet-train-000004.tar 995 0.3 GB 3947\n",
"# writing publaynet-train-000005.tar 976 0.3 GB 4923\n",
"# writing publaynet-train-000006.tar 1001 0.3 GB 5924\n",
"# writing publaynet-train-000007.tar 989 0.3 GB 6913\n",
"# writing publaynet-train-000008.tar 977 0.3 GB 7890\n",
"# writing publaynet-train-000009.tar 967 0.3 GB 8857\n",
"# writing publaynet-train-000010.tar 988 0.3 GB 9845\n",
"# writing publaynet-train-000011.tar 1003 0.3 GB 10848\n",
"# writing publaynet-train-000012.tar 985 0.3 GB 11833\n",
"# writing publaynet-train-000013.tar 973 0.3 GB 12806\n",
"# writing publaynet-train-000014.tar 981 0.3 GB 13787\n",
"# writing publaynet-train-000015.tar 981 0.3 GB 14768\n",
"# writing publaynet-train-000016.tar 989 0.3 GB 15757\n",
"# writing publaynet-train-000017.tar 989 0.3 GB 16746\n",
"# writing publaynet-train-000018.tar 1000 0.3 GB 17746\n",
"# writing publaynet-train-000019.tar 981 0.3 GB 18727\n",
"# writing publaynet-train-000020.tar 1008 0.3 GB 19735\n",
"# writing publaynet-train-000021.tar 979 0.3 GB 20714\n",
"# writing publaynet-train-000022.tar 982 0.3 GB 21696\n",
"# writing publaynet-train-000023.tar 994 0.3 GB 22690\n",
"# writing publaynet-train-000024.tar 989 0.3 GB 23679\n",
"# writing publaynet-train-000025.tar 985 0.3 GB 24664\n",
"# writing publaynet-train-000026.tar 991 0.3 GB 25655\n",
"# writing publaynet-train-000027.tar 989 0.3 GB 26644\n",
"# writing publaynet-train-000028.tar 985 0.3 GB 27629\n",
"# writing publaynet-train-000029.tar 1002 0.3 GB 28631\n",
"# writing publaynet-train-000030.tar 976 0.3 GB 29607\n",
"# writing publaynet-train-000031.tar 989 0.3 GB 30596\n",
"# writing publaynet-train-000032.tar 994 0.3 GB 31590\n",
"# writing publaynet-train-000033.tar 988 0.3 GB 32578\n",
"# writing publaynet-train-000034.tar 979 0.3 GB 33557\n",
"# writing publaynet-train-000035.tar 974 0.3 GB 34531\n",
"# writing publaynet-train-000036.tar 992 0.3 GB 35523\n",
"# writing publaynet-train-000037.tar 981 0.3 GB 36504\n",
"# writing publaynet-train-000038.tar 986 0.3 GB 37490\n",
"# writing publaynet-train-000039.tar 988 0.3 GB 38478\n",
"# writing publaynet-train-000040.tar 981 0.3 GB 39459\n",
"# writing publaynet-train-000041.tar 988 0.3 GB 40447\n",
"# writing publaynet-train-000042.tar 1000 0.3 GB 41447\n",
"# writing publaynet-train-000043.tar 982 0.3 GB 42429\n",
"# writing publaynet-train-000044.tar 980 0.3 GB 43409\n",
"# writing publaynet-train-000045.tar 985 0.3 GB 44394\n",
"# writing publaynet-train-000046.tar 980 0.3 GB 45374\n",
"# writing publaynet-train-000047.tar 976 0.3 GB 46350\n",
"# writing publaynet-train-000048.tar 985 0.3 GB 47335\n",
"# writing publaynet-train-000049.tar 991 0.3 GB 48326\n",
"# writing publaynet-train-000050.tar 1001 0.3 GB 49327\n",
"# writing publaynet-train-000051.tar 978 0.3 GB 50305\n",
"# writing publaynet-train-000052.tar 962 0.3 GB 51267\n",
"# writing publaynet-train-000053.tar 998 0.3 GB 52265\n",
"# writing publaynet-train-000054.tar 983 0.3 GB 53248\n",
"# writing publaynet-train-000055.tar 1000 0.3 GB 54248\n",
"# writing publaynet-train-000056.tar 979 0.3 GB 55227\n",
"# writing publaynet-train-000057.tar 987 0.3 GB 56214\n",
"# writing publaynet-train-000058.tar 974 0.3 GB 57188\n",
"# writing publaynet-train-000059.tar 978 0.3 GB 58166\n",
"# writing publaynet-train-000060.tar 980 0.3 GB 59146\n",
"# writing publaynet-train-000061.tar 1007 0.3 GB 60153\n",
"# writing publaynet-train-000062.tar 990 0.3 GB 61143\n",
"# writing publaynet-train-000063.tar 993 0.3 GB 62136\n",
"# writing publaynet-train-000064.tar 983 0.3 GB 63119\n",
"# writing publaynet-train-000065.tar 981 0.3 GB 64100\n",
"# writing publaynet-train-000066.tar 974 0.3 GB 65074\n",
"# writing publaynet-train-000067.tar 981 0.3 GB 66055\n",
"# writing publaynet-train-000068.tar 986 0.3 GB 67041\n",
"# writing publaynet-train-000069.tar 994 0.3 GB 68035\n",
"# writing publaynet-train-000070.tar 970 0.3 GB 69005\n",
"# writing publaynet-train-000071.tar 980 0.3 GB 69985\n",
"# writing publaynet-train-000072.tar 984 0.3 GB 70969\n",
"# writing publaynet-train-000073.tar 989 0.3 GB 71958\n",
"# writing publaynet-train-000074.tar 986 0.3 GB 72944\n",
"# writing publaynet-train-000075.tar 974 0.3 GB 73918\n",
"# writing publaynet-train-000076.tar 997 0.3 GB 74915\n",
"# writing publaynet-train-000077.tar 981 0.3 GB 75896\n",
"# writing publaynet-train-000078.tar 977 0.3 GB 76873\n",
"# writing publaynet-train-000079.tar 995 0.3 GB 77868\n",
"# writing publaynet-train-000080.tar 983 0.3 GB 78851\n",
"# writing publaynet-train-000081.tar 981 0.3 GB 79832\n",
"# writing publaynet-train-000082.tar 985 0.3 GB 80817\n",
"# writing publaynet-train-000083.tar 985 0.3 GB 81802\n",
"# writing publaynet-train-000084.tar 993 0.3 GB 82795\n",
"# writing publaynet-train-000085.tar 988 0.3 GB 83783\n",
"# writing publaynet-train-000086.tar 994 0.3 GB 84777\n",
"# writing publaynet-train-000087.tar 983 0.3 GB 85760\n",
"# writing publaynet-train-000088.tar 982 0.3 GB 86742\n",
"# writing publaynet-train-000089.tar 988 0.3 GB 87730\n",
"# writing publaynet-train-000090.tar 986 0.3 GB 88716\n",
"# writing publaynet-train-000091.tar 983 0.3 GB 89699\n",
"# writing publaynet-train-000092.tar 987 0.3 GB 90686\n",
"# writing publaynet-train-000093.tar 980 0.3 GB 91666\n",
"# writing publaynet-train-000094.tar 976 0.3 GB 92642\n",
"# writing publaynet-train-000095.tar 987 0.3 GB 93629\n",
"# writing publaynet-train-000096.tar 992 0.3 GB 94621\n",
"# writing publaynet-train-000097.tar 978 0.3 GB 95599\n",
"# writing publaynet-train-000098.tar 989 0.3 GB 96588\n",
"# writing publaynet-train-000099.tar 970 0.3 GB 97558\n",
"# writing publaynet-train-000100.tar 992 0.3 GB 98550\n",
"# writing publaynet-train-000101.tar 998 0.3 GB 99548\n",
"# writing publaynet-train-000102.tar 986 0.3 GB 100534\n",
"# writing publaynet-train-000103.tar 993 0.3 GB 101527\n",
"# writing publaynet-train-000104.tar 974 0.3 GB 102501\n",
"# writing publaynet-train-000105.tar 971 0.3 GB 103472\n",
"# writing publaynet-train-000106.tar 982 0.3 GB 104454\n",
"# writing publaynet-train-000107.tar 985 0.3 GB 105439\n",
"# writing publaynet-train-000108.tar 982 0.3 GB 106421\n",
"# writing publaynet-train-000109.tar 983 0.3 GB 107404\n",
"# writing publaynet-train-000110.tar 998 0.3 GB 108402\n",
"# writing publaynet-train-000111.tar 981 0.3 GB 109383\n",
"# writing publaynet-train-000112.tar 980 0.3 GB 110363\n",
"# writing publaynet-train-000113.tar 984 0.3 GB 111347\n",
"# writing publaynet-train-000114.tar 976 0.3 GB 112323\n",
"# writing publaynet-train-000115.tar 971 0.3 GB 113294\n",
"# writing publaynet-train-000116.tar 993 0.3 GB 114287\n",
"# writing publaynet-train-000117.tar 988 0.3 GB 115275\n",
"# writing publaynet-train-000118.tar 988 0.3 GB 116263\n",
"# writing publaynet-train-000119.tar 977 0.3 GB 117240\n",
"# writing publaynet-train-000120.tar 994 0.3 GB 118234\n",
"# writing publaynet-train-000121.tar 984 0.3 GB 119218\n",
"# writing publaynet-train-000122.tar 985 0.3 GB 120203\n",
"# writing publaynet-train-000123.tar 990 0.3 GB 121193\n",
"# writing publaynet-train-000124.tar 977 0.3 GB 122170\n",
"# writing publaynet-train-000125.tar 1003 0.3 GB 123173\n",
"# writing publaynet-train-000126.tar 992 0.3 GB 124165\n",
"# writing publaynet-train-000127.tar 1005 0.3 GB 125170\n",
"# writing publaynet-train-000128.tar 990 0.3 GB 126160\n",
"# writing publaynet-train-000129.tar 968 0.3 GB 127128\n",
"# writing publaynet-train-000130.tar 1002 0.3 GB 128130\n",
"# writing publaynet-train-000131.tar 985 0.3 GB 129115\n",
"# writing publaynet-train-000132.tar 983 0.3 GB 130098\n",
"# writing publaynet-train-000133.tar 994 0.3 GB 131092\n",
"# writing publaynet-train-000134.tar 989 0.3 GB 132081\n",
"# writing publaynet-train-000135.tar 996 0.3 GB 133077\n",
"# writing publaynet-train-000136.tar 990 0.3 GB 134067\n",
"# writing publaynet-train-000137.tar 982 0.3 GB 135049\n",
"# writing publaynet-train-000138.tar 975 0.3 GB 136024\n",
"# writing publaynet-train-000139.tar 993 0.3 GB 137017\n",
"# writing publaynet-train-000140.tar 986 0.3 GB 138003\n",
"# writing publaynet-train-000141.tar 990 0.3 GB 138993\n",
"# writing publaynet-train-000142.tar 995 0.3 GB 139988\n",
"# writing publaynet-train-000143.tar 976 0.3 GB 140964\n",
"# writing publaynet-train-000144.tar 1009 0.3 GB 141973\n",
"# writing publaynet-train-000145.tar 981 0.3 GB 142954\n",
"# writing publaynet-train-000146.tar 990 0.3 GB 143944\n",
"# writing publaynet-train-000147.tar 984 0.3 GB 144928\n",
"# writing publaynet-train-000148.tar 981 0.3 GB 145909\n",
"# writing publaynet-train-000149.tar 982 0.3 GB 146891\n",
"# writing publaynet-train-000150.tar 979 0.3 GB 147870\n",
"# writing publaynet-train-000151.tar 991 0.3 GB 148861\n",
"# writing publaynet-train-000152.tar 972 0.3 GB 149833\n",
"# writing publaynet-train-000153.tar 992 0.3 GB 150825\n",
"# writing publaynet-train-000154.tar 998 0.3 GB 151823\n",
"# writing publaynet-train-000155.tar 989 0.3 GB 152812\n",
"# writing publaynet-train-000156.tar 987 0.3 GB 153799\n",
"# writing publaynet-train-000157.tar 962 0.3 GB 154761\n",
"# writing publaynet-train-000158.tar 1009 0.3 GB 155770\n",
"# writing publaynet-train-000159.tar 977 0.3 GB 156747\n",
"# writing publaynet-train-000160.tar 993 0.3 GB 157740\n",
"# writing publaynet-train-000161.tar 993 0.3 GB 158733\n",
"# writing publaynet-train-000162.tar 988 0.3 GB 159721\n",
"# writing publaynet-train-000163.tar 982 0.3 GB 160703\n",
"# writing publaynet-train-000164.tar 985 0.3 GB 161688\n",
"# writing publaynet-train-000165.tar 978 0.3 GB 162666\n",
"# writing publaynet-train-000166.tar 992 0.3 GB 163658\n",
"# writing publaynet-train-000167.tar 985 0.3 GB 164643\n",
"# writing publaynet-train-000168.tar 970 0.3 GB 165613\n",
"# writing publaynet-train-000169.tar 998 0.3 GB 166611\n",
"# writing publaynet-train-000170.tar 977 0.3 GB 167588\n",
"# writing publaynet-train-000171.tar 992 0.3 GB 168580\n",
"# writing publaynet-train-000172.tar 991 0.3 GB 169571\n",
"# writing publaynet-train-000173.tar 981 0.3 GB 170552\n",
"# writing publaynet-train-000174.tar 990 0.3 GB 171542\n",
"# writing publaynet-train-000175.tar 988 0.3 GB 172530\n",
"# writing publaynet-train-000176.tar 977 0.3 GB 173507\n",
"# writing publaynet-train-000177.tar 981 0.3 GB 174488\n",
"# writing publaynet-train-000178.tar 991 0.3 GB 175479\n",
"# writing publaynet-train-000179.tar 990 0.3 GB 176469\n",
"# writing publaynet-train-000180.tar 987 0.3 GB 177456\n",
"# writing publaynet-train-000181.tar 998 0.3 GB 178454\n",
"# writing publaynet-train-000182.tar 991 0.3 GB 179445\n",
"# writing publaynet-train-000183.tar 985 0.3 GB 180430\n",
"# writing publaynet-train-000184.tar 977 0.3 GB 181407\n",
"# writing publaynet-train-000185.tar 992 0.3 GB 182399\n",
"# writing publaynet-train-000186.tar 988 0.3 GB 183387\n",
"# writing publaynet-train-000187.tar 983 0.3 GB 184370\n",
"# writing publaynet-train-000188.tar 976 0.3 GB 185346\n",
"# writing publaynet-train-000189.tar 975 0.3 GB 186321\n",
"# writing publaynet-train-000190.tar 977 0.3 GB 187298\n",
"# writing publaynet-train-000191.tar 982 0.3 GB 188280\n",
"# writing publaynet-train-000192.tar 991 0.3 GB 189271\n",
"# writing publaynet-train-000193.tar 987 0.3 GB 190258\n",
"# writing publaynet-train-000194.tar 980 0.3 GB 191238\n",
"# writing publaynet-train-000195.tar 973 0.3 GB 192211\n",
"# writing publaynet-train-000196.tar 991 0.3 GB 193202\n",
"# writing publaynet-train-000197.tar 976 0.3 GB 194178\n",
"# writing publaynet-train-000198.tar 989 0.3 GB 195167\n",
"# writing publaynet-train-000199.tar 984 0.3 GB 196151\n",
"# writing publaynet-train-000200.tar 993 0.3 GB 197144\n",
"# writing publaynet-train-000201.tar 983 0.3 GB 198127\n",
"# writing publaynet-train-000202.tar 972 0.3 GB 199099\n",
"# writing publaynet-train-000203.tar 990 0.3 GB 200089\n",
"# writing publaynet-train-000204.tar 989 0.3 GB 201078\n",
"# writing publaynet-train-000205.tar 999 0.3 GB 202077\n",
"# writing publaynet-train-000206.tar 981 0.3 GB 203058\n",
"# writing publaynet-train-000207.tar 994 0.3 GB 204052\n",
"# writing publaynet-train-000208.tar 983 0.3 GB 205035\n",
"# writing publaynet-train-000209.tar 981 0.3 GB 206016\n",
"# writing publaynet-train-000210.tar 963 0.3 GB 206979\n",
"# writing publaynet-train-000211.tar 984 0.3 GB 207963\n",
"# writing publaynet-train-000212.tar 974 0.3 GB 208937\n",
"# writing publaynet-train-000213.tar 993 0.3 GB 209930\n",
"# writing publaynet-train-000214.tar 989 0.3 GB 210919\n",
"# writing publaynet-train-000215.tar 983 0.3 GB 211902\n",
"# writing publaynet-train-000216.tar 969 0.3 GB 212871\n",
"# writing publaynet-train-000217.tar 974 0.3 GB 213845\n",
"# writing publaynet-train-000218.tar 976 0.3 GB 214821\n",
"# writing publaynet-train-000219.tar 982 0.3 GB 215803\n",
"# writing publaynet-train-000220.tar 975 0.3 GB 216778\n",
"# writing publaynet-train-000221.tar 990 0.3 GB 217768\n",
"# writing publaynet-train-000222.tar 993 0.3 GB 218761\n",
"# writing publaynet-train-000223.tar 981 0.3 GB 219742\n",
"# writing publaynet-train-000224.tar 977 0.3 GB 220719\n",
"# writing publaynet-train-000225.tar 986 0.3 GB 221705\n",
"# writing publaynet-train-000226.tar 1000 0.3 GB 222705\n",
"# writing publaynet-train-000227.tar 984 0.3 GB 223689\n",
"# writing publaynet-train-000228.tar 986 0.3 GB 224675\n",
"# writing publaynet-train-000229.tar 983 0.3 GB 225658\n",
"# writing publaynet-train-000230.tar 977 0.3 GB 226635\n",
"# writing publaynet-train-000231.tar 980 0.3 GB 227615\n",
"# writing publaynet-train-000232.tar 990 0.3 GB 228605\n",
"# writing publaynet-train-000233.tar 990 0.3 GB 229595\n",
"# writing publaynet-train-000234.tar 985 0.3 GB 230580\n",
"# writing publaynet-train-000235.tar 992 0.3 GB 231572\n",
"# writing publaynet-train-000236.tar 997 0.3 GB 232569\n",
"# writing publaynet-train-000237.tar 988 0.3 GB 233557\n",
"# writing publaynet-train-000238.tar 990 0.3 GB 234547\n",
"# writing publaynet-train-000239.tar 1001 0.3 GB 235548\n",
"# writing publaynet-train-000240.tar 960 0.3 GB 236508\n",
"# writing publaynet-train-000241.tar 987 0.3 GB 237495\n",
"# writing publaynet-train-000242.tar 982 0.3 GB 238477\n",
"# writing publaynet-train-000243.tar 985 0.3 GB 239462\n",
"# writing publaynet-train-000244.tar 977 0.3 GB 240439\n",
"# writing publaynet-train-000245.tar 996 0.3 GB 241435\n",
"# writing publaynet-train-000246.tar 980 0.3 GB 242415\n",
"# writing publaynet-train-000247.tar 1001 0.3 GB 243416\n",
"# writing publaynet-train-000248.tar 999 0.3 GB 244415\n",
"# writing publaynet-train-000249.tar 992 0.3 GB 245407\n",
"# writing publaynet-train-000250.tar 1007 0.3 GB 246414\n",
"# writing publaynet-train-000251.tar 997 0.3 GB 247411\n",
"# writing publaynet-train-000252.tar 1002 0.3 GB 248413\n",
"# writing publaynet-train-000253.tar 993 0.3 GB 249406\n",
"# writing publaynet-train-000254.tar 977 0.3 GB 250383\n",
"# writing publaynet-train-000255.tar 983 0.3 GB 251366\n",
"# writing publaynet-train-000256.tar 977 0.3 GB 252343\n",
"# writing publaynet-train-000257.tar 1000 0.3 GB 253343\n",
"# writing publaynet-train-000258.tar 1005 0.3 GB 254348\n",
"# writing publaynet-train-000259.tar 981 0.3 GB 255329\n",
"# writing publaynet-train-000260.tar 984 0.3 GB 256313\n",
"# writing publaynet-train-000261.tar 969 0.3 GB 257282\n",
"# writing publaynet-train-000262.tar 974 0.3 GB 258256\n",
"# writing publaynet-train-000263.tar 990 0.3 GB 259246\n",
"# writing publaynet-train-000264.tar 994 0.3 GB 260240\n",
"# writing publaynet-train-000265.tar 973 0.3 GB 261213\n",
"# writing publaynet-train-000266.tar 999 0.3 GB 262212\n",
"# writing publaynet-train-000267.tar 995 0.3 GB 263207\n",
"# writing publaynet-train-000268.tar 995 0.3 GB 264202\n",
"# writing publaynet-train-000269.tar 981 0.3 GB 265183\n",
"# writing publaynet-train-000270.tar 1002 0.3 GB 266185\n",
"# writing publaynet-train-000271.tar 977 0.3 GB 267162\n",
"# writing publaynet-train-000272.tar 968 0.3 GB 268130\n",
"# writing publaynet-train-000273.tar 991 0.3 GB 269121\n",
"# writing publaynet-train-000274.tar 988 0.3 GB 270109\n",
"# writing publaynet-train-000275.tar 984 0.3 GB 271093\n",
"# writing publaynet-train-000276.tar 995 0.3 GB 272088\n",
"# writing publaynet-train-000277.tar 989 0.3 GB 273077\n",
"# writing publaynet-train-000278.tar 997 0.3 GB 274074\n",
"# writing publaynet-train-000279.tar 986 0.3 GB 275060\n",
"# writing publaynet-train-000280.tar 983 0.3 GB 276043\n",
"# writing publaynet-train-000281.tar 986 0.3 GB 277029\n",
"# writing publaynet-train-000282.tar 978 0.3 GB 278007\n",
"# writing publaynet-train-000283.tar 982 0.3 GB 278989\n",
"# writing publaynet-train-000284.tar 979 0.3 GB 279968\n",
"# writing publaynet-train-000285.tar 984 0.3 GB 280952\n",
"# writing publaynet-train-000286.tar 979 0.3 GB 281931\n",
"# writing publaynet-train-000287.tar 997 0.3 GB 282928\n",
"# writing publaynet-train-000288.tar 992 0.3 GB 283920\n",
"# writing publaynet-train-000289.tar 1004 0.3 GB 284924\n",
"# writing publaynet-train-000290.tar 988 0.3 GB 285912\n",
"# writing publaynet-train-000291.tar 994 0.3 GB 286906\n",
"# writing publaynet-train-000292.tar 980 0.3 GB 287886\n",
"# writing publaynet-train-000293.tar 998 0.3 GB 288884\n",
"# writing publaynet-train-000294.tar 1002 0.3 GB 289886\n",
"# writing publaynet-train-000295.tar 992 0.3 GB 290878\n",
"# writing publaynet-train-000296.tar 989 0.3 GB 291867\n",
"# writing publaynet-train-000297.tar 993 0.3 GB 292860\n",
"# writing publaynet-train-000298.tar 981 0.3 GB 293841\n",
"# writing publaynet-train-000299.tar 999 0.3 GB 294840\n",
"# writing publaynet-train-000300.tar 990 0.3 GB 295830\n",
"# writing publaynet-train-000301.tar 974 0.3 GB 296804\n",
"# writing publaynet-train-000302.tar 989 0.3 GB 297793\n",
"# writing publaynet-train-000303.tar 978 0.3 GB 298771\n",
"# writing publaynet-train-000304.tar 992 0.3 GB 299763\n",
"# writing publaynet-train-000305.tar 989 0.3 GB 300752\n",
"# writing publaynet-train-000306.tar 990 0.3 GB 301742\n",
"# writing publaynet-train-000307.tar 981 0.3 GB 302723\n",
"# writing publaynet-train-000308.tar 998 0.3 GB 303721\n",
"# writing publaynet-train-000309.tar 989 0.3 GB 304710\n",
"# writing publaynet-train-000310.tar 985 0.3 GB 305695\n",
"# writing publaynet-train-000311.tar 974 0.3 GB 306669\n",
"# writing publaynet-train-000312.tar 997 0.3 GB 307666\n",
"# writing publaynet-train-000313.tar 974 0.3 GB 308640\n",
"# writing publaynet-train-000314.tar 968 0.3 GB 309608\n",
"# writing publaynet-train-000315.tar 1008 0.3 GB 310616\n",
"# writing publaynet-train-000316.tar 998 0.3 GB 311614\n",
"# writing publaynet-train-000317.tar 992 0.3 GB 312606\n",
"# writing publaynet-train-000318.tar 980 0.3 GB 313586\n",
"# writing publaynet-train-000319.tar 987 0.3 GB 314573\n",
"# writing publaynet-train-000320.tar 978 0.3 GB 315551\n",
"# writing publaynet-train-000321.tar 994 0.3 GB 316545\n",
"# writing publaynet-train-000322.tar 985 0.3 GB 317530\n",
"# writing publaynet-train-000323.tar 984 0.3 GB 318514\n",
"# writing publaynet-train-000324.tar 989 0.3 GB 319503\n",
"# writing publaynet-train-000325.tar 977 0.3 GB 320480\n",
"# writing publaynet-train-000326.tar 1000 0.3 GB 321480\n",
"# writing publaynet-train-000327.tar 988 0.3 GB 322468\n",
"# writing publaynet-train-000328.tar 993 0.3 GB 323461\n",
"# writing publaynet-train-000329.tar 994 0.3 GB 324455\n",
"# writing publaynet-train-000330.tar 979 0.3 GB 325434\n",
"# writing publaynet-train-000331.tar 994 0.3 GB 326428\n",
"# writing publaynet-train-000332.tar 1004 0.3 GB 327432\n",
"# writing publaynet-train-000333.tar 988 0.3 GB 328420\n",
"# writing publaynet-train-000334.tar 980 0.3 GB 329400\n",
"# writing publaynet-train-000335.tar 990 0.3 GB 330390\n",
"# writing publaynet-train-000336.tar 988 0.3 GB 331378\n",
"# writing publaynet-train-000337.tar 968 0.3 GB 332346\n",
"# writing publaynet-train-000338.tar 987 0.3 GB 333333\n",
"# writing publaynet-train-000339.tar 985 0.3 GB 334318\n",
"# writing publaynet-train-000340.tar 977 0.3 GB 335295\n"
]
}
],
"source": [
"sink = wds.ShardWriter(\"publaynet-train-%06d.tar\", maxsize=3e8)\n",
"images = glob.glob(\"train/*\")\n",
"random.shuffle(images)\n",
"for path in images:\n",
" base = os.path.basename(path)\n",
" root = os.path.splitext(base)[0]\n",
" with open(path, \"rb\") as stream:\n",
" png = stream.read()\n",
" json = meta[base]\n",
" sample = dict(__key__=root, png=png, json=json)\n",
" sink.write(sample)\n",
"sink.close()"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "respective-reality",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"# writing publaynet-val-000000.tar 0 0.0 GB 0\n",
"# writing publaynet-val-000001.tar 967 0.3 GB 967\n",
"# writing publaynet-val-000002.tar 966 0.3 GB 1933\n",
"# writing publaynet-val-000003.tar 973 0.3 GB 2906\n",
"# writing publaynet-val-000004.tar 974 0.3 GB 3880\n",
"# writing publaynet-val-000005.tar 966 0.3 GB 4846\n",
"# writing publaynet-val-000006.tar 971 0.3 GB 5817\n",
"# writing publaynet-val-000007.tar 970 0.3 GB 6787\n",
"# writing publaynet-val-000008.tar 960 0.3 GB 7747\n",
"# writing publaynet-val-000009.tar 964 0.3 GB 8711\n",
"# writing publaynet-val-000010.tar 972 0.3 GB 9683\n",
"# writing publaynet-val-000011.tar 953 0.3 GB 10636\n"
]
}
],
"source": [
"import json\n",
"\n",
"meta = load_meta(\"val.json\")\n",
"\n",
"sink = wds.ShardWriter(\"publaynet-val-%06d.tar\", maxsize=3e8)\n",
"images = glob.glob(\"val/*\")\n",
"random.shuffle(images)\n",
"for path in images:\n",
" base = os.path.basename(path)\n",
" root = os.path.splitext(base)[0]\n",
" with open(path, \"rb\") as stream:\n",
" png = stream.read()\n",
" info = meta[base]\n",
" sample = dict(__key__=root, png=png, json=info)\n",
" sink.write(sample)\n",
"sink.close()"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "thermal-produce",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"# writing publaynet-test-000000.tar 0 0.0 GB 0\n",
"# writing publaynet-test-000001.tar 1014 0.3 GB 1014\n",
"# writing publaynet-test-000002.tar 1013 0.3 GB 2027\n",
"# writing publaynet-test-000003.tar 1009 0.3 GB 3036\n",
"# writing publaynet-test-000004.tar 1008 0.3 GB 4044\n",
"# writing publaynet-test-000005.tar 1023 0.3 GB 5067\n",
"# writing publaynet-test-000006.tar 1006 0.3 GB 6073\n",
"# writing publaynet-test-000007.tar 1010 0.3 GB 7083\n",
"# writing publaynet-test-000008.tar 1008 0.3 GB 8091\n",
"# writing publaynet-test-000009.tar 992 0.3 GB 9083\n",
"# writing publaynet-test-000010.tar 1000 0.3 GB 10083\n",
"# writing publaynet-test-000011.tar 1006 0.3 GB 11089\n"
]
}
],
"source": [
"sink = wds.ShardWriter(\"publaynet-test-%06d.tar\", maxsize=3e8)\n",
"images = glob.glob(\"test/*\")\n",
"random.shuffle(images)\n",
"for path in images:\n",
" base = os.path.basename(path)\n",
" root = os.path.splitext(base)[0]\n",
" with open(path, \"rb\") as stream:\n",
" png = stream.read()\n",
" #json = db[base]\n",
" sample = dict(__key__=root, png=png)\n",
" sink.write(sample)\n",
"sink.close()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "binary-creation",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.5"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment