Created
March 23, 2020 18:14
-
-
Save tmbdev/7bd2e0f92d659f44d68925c2d14a40d4 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"import subprocess\n", | |
"import tarfile\n", | |
"import braceexpand" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"NB: Change these URLs to a copy of the dataset for training; please don't train against our storage bucket." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
# A single shard of the dataset; used below to demonstrate reading one video.
url = "http://storage.googleapis.com/lpr-yt8m-hi-sharded/yt8m-hi-000000.tar"
# Brace pattern covering the shards numbered 000000 through 000999; it is
# expanded into individual shard URLs with braceexpand in the last cell.
shards = "http://storage.googleapis.com/lpr-yt8m-hi-sharded/yt8m-hi-{000000..000999}.tar"
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
def iterate_over_video_streams(url):
    """Stream a tar shard with curl and yield its ``.mp4`` members in order.

    The archive is never stored locally: curl writes it to a pipe and
    ``tarfile`` reads that pipe in non-seekable stream mode (``"r|"``).

    Args:
        url: Location of the tar archive; passed to curl unchanged.

    Yields:
        ``(info, stream)`` pairs, where ``info`` is the member's
        ``tarfile.TarInfo`` and ``stream`` is the object returned by
        ``extractfile`` for it. Because the archive is read as a forward-only
        stream, consume ``stream`` before asking for the next pair.

    Raises:
        tarfile.ReadError: The data could not be parsed as a tar archive. If
            curl itself failed, the message names curl's exit status.
        subprocess.CalledProcessError: The archive parsed to its end but curl
            still reported a failure, so members may be missing.
    """
    # --fail: HTTP errors produce a non-zero exit status instead of an error
    # page on stdout; --show-error: keep the error message despite -s.
    command = ["curl", "-s", "-S", "-f", "-L", url]
    with subprocess.Popen(command, stdout=subprocess.PIPE) as proc:
        try:
            with tarfile.open(fileobj=proc.stdout, mode="r|") as source:
                for info in source:
                    if not info.name.endswith(".mp4"):
                        continue
                    yield info, source.extractfile(info)
        except tarfile.ReadError as exc:
            # An empty read means curl has closed its end of the pipe, so
            # wait() returns promptly and its status tells us whether the
            # download (rather than the archive) was the real problem.
            if proc.stdout.read(1) == b"":
                status = proc.wait()
                if status != 0:
                    raise tarfile.ReadError(
                        f"curl exited with status {status} while fetching {url}"
                    ) from exc
            raise
        # tarfile stops at the end-of-archive marker; swallow whatever curl
        # still has to send (normally padding) so it can exit on its own and
        # report a meaningful status instead of dying on a closed pipe.
        while proc.stdout.read(1 << 16):
            pass
        status = proc.wait()
        if status != 0:
            raise subprocess.CalledProcessError(status, command)
" \n", | |
def write_stream(stream, fname, chunk_size=1000000):
    """Copy everything readable from ``stream`` into the file ``fname``.

    Args:
        stream: Binary file-like object; ``read(n)`` is called repeatedly
            until it returns an empty result.
        fname: Path of the output file. It is opened in ``"wb"`` mode, so an
            existing file is overwritten.
        chunk_size: Maximum number of bytes requested per ``read`` call.
            Defaults to 1000000, the value previously hard-coded here.

    Raises:
        ValueError: If ``chunk_size`` is not positive. ``read(0)`` returns
            nothing, which would otherwise leave an empty output file.
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size!r}")
    with open(fname, "wb") as ostream:
        while True:
            data = stream.read(chunk_size)
            if len(data) == 0:
                break
            ostream.write(data)
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"31521996 -3MxAiAnknY.mp4\n" | |
] | |
} | |
], | |
"source": [ | |
# Demo: fetch the first .mp4 member of the sample shard and save it locally.
for info, stream in iterate_over_video_streams(url):
    print(info.size, info.name)
    # Copy the member to disk inside the loop body: the shard is read as a
    # forward-only stream, so the data has to be consumed before moving on.
    write_stream(stream, "sample.mp4")
    # at this point, you can open and use "sample.mp4" just the way you usually process videos
    break  # one video is enough for the demo
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 5, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"-rw-r--r-- 1 tmb tmb 31521996 Mar 23 18:09 sample.mp4\n" | |
] | |
} | |
], | |
"source": [ | |
"!ls -l sample.mp4\n", | |
"# mplayer sample.mp4\n", | |
"# etc." | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"To iterate over the entire dataset, use something like this (the `break` statements only keep the example short; remove them and add your own processing):" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
# Template for walking the whole dataset: expand the brace pattern into one
# URL per shard, then stream the videos of each shard in turn.
# Note that this loop rebinds the notebook-level name `url`.
for url in braceexpand.braceexpand(shards):
    for info, stream in iterate_over_video_streams(url):
        # Process `stream` here. As written, this `break` stops after the
        # first video of the shard; remove it to visit every video.
        break
    # As written, this `break` stops after the first shard; remove it to
    # visit every shard.
    break
] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.7.5" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 4 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment