Skip to content

Instantly share code, notes, and snippets.

@km4sh
Last active August 14, 2021 00:39
Show Gist options
  • Save km4sh/3dd9d45ae163e9966ac295c56661a4df to your computer and use it in GitHub Desktop.
Save km4sh/3dd9d45ae163e9966ac295c56661a4df to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Generate AGE Label for Vox-Celeb Dataset (Out-of-date)"
]
},
{
"cell_type": "markdown",
"metadata": {
"cellView": "form",
"id": "SYB8rOXt_WRf"
},
"source": [
"## Install dependencies via `pip`"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"cellView": "form",
"id": "SYB8rOXt_WRf"
},
"outputs": [],
"source": [
"# %pip (rather than !pip) installs into the environment of the running kernel.\n",
"# `datetime` is part of the Python standard library, so it is not installed from PyPI.\n",
"%pip install wptools\n",
"%pip install pafy\n",
"%pip install --upgrade youtube_dl"
]
},
{
"cell_type": "markdown",
"metadata": {
"cellView": "form",
"id": "4sPO8Ao_LuCG"
},
"source": [
"## Download `vox-celeb` meta file"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"cellView": "form",
"id": "4sPO8Ao_LuCG"
},
"outputs": [],
"source": [
"# Fetch the VoxCeleb2 transcript archives. -nc (no-clobber) skips the download when the\n",
"# zip is already present, so re-running the cell does not leave *.zip.1 copies behind.\n",
"!wget -nc http://www.robots.ox.ac.uk/~vgg/data/voxceleb/data/vox2_test_txt.zip\n",
"!wget -nc http://www.robots.ox.ac.uk/~vgg/data/voxceleb/data/vox2_dev_txt.zip\n",
"# List, one per line, the archive entries whose names end in '/'.\n",
"!zipinfo -1 vox2_dev_txt.zip '*/' > dev.txt\n",
"!zipinfo -1 vox2_test_txt.zip '*/' > test.txt"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "bWcVgXzMCr_F"
},
"source": [
"## Parse birth dates from Wikipedia"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"cellView": "form",
"id": "0iHwet85dxjU"
},
"outputs": [],
"source": [
"#@title Parse the celebs' age from wikipedia\n",
"import csv\n",
"import wptools as wp\n",
"from datetime import datetime as dt\n",
"\n",
"# Number of leading rows of data.csv to skip; presumably rows handled in an\n",
"# earlier session -- TODO confirm.\n",
"# NOTE(review): the output file is opened with 'w', so skipped rows are not\n",
"# copied into it.\n",
"ROWS_TO_SKIP = 5660\n",
"# Timestamp layout of the Wikidata 'date of birth (P569)' value parsed below.\n",
"WIKIDATA_TIME_FORMAT = '+%Y-%m-%dT%H:%M:%SZ'\n",
"\n",
"\n",
"def age_in_years(birth, today):\n",
"    \"\"\"Return the number of whole calendar years between `birth` and `today`.\n",
"\n",
"    Dividing the day difference by 365 ignores leap days and can report a\n",
"    birthday several days early, so the calendar fields are compared instead.\n",
"    \"\"\"\n",
"    had_birthday = (today.month, today.day) >= (birth.month, birth.day)\n",
"    return today.year - birth.year - (0 if had_birthday else 1)\n",
"\n",
"\n",
"# newline='' is what the csv module asks for; the with-blocks close (and so\n",
"# flush) both files even when the loop is interrupted.\n",
"with open('data.csv', 'r', newline='') as meta:\n",
"    with open('/content/drive/My Drive/VoxCeleb_Age_Wiki.csv', 'w', newline='') as results:\n",
"        reader = csv.reader(meta)\n",
"        writer = csv.writer(results)\n",
"        for row_index, row in enumerate(reader):\n",
"            if row_index < ROWS_TO_SKIP:\n",
"                continue\n",
"            print('success!')\n",
"            print(row[4])\n",
"            # row[4] is used as the page title; row[5] receives the computed age.\n",
"            page = wp.page(row[4])\n",
"            page.wanted_labels(['P569', 'P21', 'P31'])\n",
"            try:\n",
"                page.get_wikidata()\n",
"                orig = page.data['wikidata']['date of birth (P569)']\n",
"                # The claim may come back as a list of values; use the first one.\n",
"                if isinstance(orig, list):\n",
"                    orig = orig[0]\n",
"                birth = dt.strptime(orig, WIKIDATA_TIME_FORMAT)\n",
"                row[5] = str(age_in_years(birth, dt.today()))\n",
"            except Exception:\n",
"                # Best effort: when the lookup or the date parsing fails the row is\n",
"                # written unchanged. Catching Exception (not a bare except) keeps\n",
"                # Ctrl-C / kernel interrupt working.\n",
"                writer.writerow(row)\n",
"                continue\n",
"            writer.writerow(row)\n",
"            print(row)"
]
},
{
"cell_type": "markdown",
"metadata": {
"cellView": "form",
"id": "qIdQMsF6_s-I"
},
"source": [
"## Download free proxy list"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"cellView": "form",
"id": "qIdQMsF6_s-I"
},
"outputs": [],
"source": [
"# Save the raw proxy list next to the notebook; --fail makes curl exit non-zero on an\n",
"# HTTP error and --show-error still prints the reason despite --silent.\n",
"!curl --silent --show-error --fail \"https://raw.githubusercontent.com/clarketm/proxy-list/master/proxy-list-raw.txt\" > proxy-list.txt"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "nQMzxDwuFdLq"
},
"outputs": [],
"source": [
"import os\n",
"\n",
"DEV_META_PATH = '/content/drive/My Drive/datasets/dev.txt'\n",
"SPLIT_DIR = '/content/drive/My Drive/datasets/splitedDevMeta'\n",
"# Number of entries written to each dev_<batch>.txt file.\n",
"BATCH_SIZE = 500\n",
"# Only lines of exactly this length (newline included) are kept; presumably these\n",
"# are the 'txt/<speaker id>/<video id>/' directory entries -- TODO confirm.\n",
"ENTRY_LINE_LENGTH = 25\n",
"\n",
"\n",
"def batch_path(batch):\n",
"    \"\"\"Return the path of the output file for batch number `batch`.\"\"\"\n",
"    return os.path.join(SPLIT_DIR, 'dev_'+str(batch)+'.txt')\n",
"\n",
"\n",
"os.makedirs(SPLIT_DIR, exist_ok=True)\n",
"count = 0\n",
"batch = 1\n",
"metaOut = open(batch_path(batch), 'w')\n",
"try:\n",
"    with open(DEV_META_PATH, 'r') as meta:\n",
"        # Iterating the file visits every line, including the first one, which the\n",
"        # readline-before-loop form discarded without looking at it.\n",
"        for line in meta:\n",
"            if len(line) != ENTRY_LINE_LENGTH:\n",
"                continue\n",
"            metaOut.write(line)\n",
"            count = count + 1\n",
"            if count >= BATCH_SIZE:\n",
"                print('五百个啦!还有好多!', batch)\n",
"                count = 0\n",
"                batch = batch + 1\n",
"                metaOut.close()\n",
"                metaOut = open(batch_path(batch), 'w')\n",
"finally:\n",
"    # Close the batch file that is still open, also when reading fails midway.\n",
"    metaOut.close()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Parsing: look up each video's upload date on YouTube"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 368
},
"id": "vFySMm3HRZNJ",
"outputId": "56880f3d-be74-41ac-e56c-daf074d39809"
},
"outputs": [],
"source": [
"import os\n",
"import csv\n",
"import youtube_dl\n",
"import time\n",
"import datetime\n",
"import logging\n",
"\n",
"PROXY_LIST_PATH = 'proxy-list.txt'\n",
"SPLIT_DIR = '/content/drive/My Drive/datasets/splitedDevMeta'\n",
"DEV_CSV_PATH = '/content/drive/My Drive/datasets/100p_upload_date.csv'\n",
"WRONG_OUT_PATH = '/content/drive/My Drive/datasets/wrongOut.txt'\n",
"TEST_META_PATH = '/content/drive/My Drive/datasets/test.txt'\n",
"TEST_CSV_PATH = '/content/drive/My Drive/datasets/testOut.csv'\n",
"WATCH_URL = 'https://www.youtube.com/watch?v='\n",
"# Only lines of exactly this length (newline included) are treated as entries.\n",
"ENTRY_LINE_LENGTH = 25\n",
"# Consecutive failed lookups after which the next proxy is tried.\n",
"MAX_CONSECUTIVE_ERRORS = 20\n",
"\n",
"\n",
"def next_proxy(proxy_file):\n",
"    \"\"\"Return the next non-blank line of `proxy_file`, stripped, or None when exhausted.\"\"\"\n",
"    for raw in proxy_file:\n",
"        address = raw.strip()\n",
"        if address:\n",
"            return address\n",
"    return None\n",
"\n",
"\n",
"def split_entry(line):\n",
"    \"\"\"Return (uuid, ytid): the last two directory names of a metadata line.\"\"\"\n",
"    head, _ = os.path.split(line)\n",
"    head, ytid = os.path.split(head)\n",
"    _, uuid = os.path.split(head)\n",
"    return uuid, ytid\n",
"\n",
"\n",
"logger = logging.getLogger(__name__)\n",
"# Attach the file handler only once so re-running the cell does not duplicate log lines.\n",
"# NOTE(review): the logger itself has no level set, so it inherits the root logger's\n",
"# level; the handler's DEBUG level only matters for records that pass that filter.\n",
"if not logger.handlers:\n",
"    f_handler = logging.FileHandler('/content/drive/My Drive/outputnov12.txt')\n",
"    f_handler.setLevel(logging.DEBUG)\n",
"    logger.addHandler(f_handler)\n",
"\n",
"with open(PROXY_LIST_PATH, 'r') as proxyList, open(DEV_CSV_PATH, 'a', newline='') as csvOut, open(WRONG_OUT_PATH, 'a') as wrongOut:\n",
"    # readline() keeps the trailing newline, which would end up inside the proxy URL;\n",
"    # next_proxy() strips it and reports an exhausted list as None.\n",
"    proxy = next_proxy(proxyList)\n",
"    if proxy is None:\n",
"        raise RuntimeError('proxy-list.txt contains no proxy addresses')\n",
"    print('NEW PROXY:', 'socks5://'+proxy)\n",
"    ydl_opts = {\n",
"        'outtmpl': '%(upload_date)s',\n",
"        'skip_download': True,\n",
"        'simulate': True,\n",
"        # youtube_dl's option is spelled 'quiet'; the misspelt 'slient' key was ignored.\n",
"        'quiet': True,\n",
"        'logger': logger,\n",
"        'proxy': 'socks5://'+proxy,\n",
"        'socket_timeout': 5\n",
"    }\n",
"    ydl = youtube_dl.YoutubeDL(ydl_opts)\n",
"    writer = csv.writer(csvOut)\n",
"    errBuf = 0\n",
"    count = 0\n",
"    for metaBatch in range(100, 291):\n",
"        with open(os.path.join(SPLIT_DIR, 'dev_'+str(metaBatch)+'.txt'), 'r') as fp:\n",
"            print('metaBatch:', metaBatch, 'opened.')\n",
"            for line in fp:\n",
"                if len(line) != ENTRY_LINE_LENGTH:\n",
"                    continue\n",
"                uuid, ytid = split_entry(line)\n",
"                try:\n",
"                    info = ydl.extract_info(WATCH_URL+ytid, download=False)\n",
"                    upload_date = info['upload_date']\n",
"                except Exception:\n",
"                    # Failed lookups are recorded verbatim for a later retry.\n",
"                    wrongOut.write(line)\n",
"                    wrongOut.flush()\n",
"                    errBuf = errBuf + 1\n",
"                    print(datetime.datetime.now(), uuid, ytid, '#REFUSED', 'dev', metaBatch, count)\n",
"                else:\n",
"                    writer.writerow([uuid, ytid, upload_date, 'dev'])\n",
"                    # Flush after every row so progress survives an interrupted session.\n",
"                    csvOut.flush()\n",
"                    errBuf = 0\n",
"                    print(datetime.datetime.now(), uuid, ytid, upload_date, 'dev', metaBatch, count)\n",
"                count = count + 1\n",
"\n",
"                if errBuf >= MAX_CONSECUTIVE_ERRORS:\n",
"                    count = 0\n",
"                    errBuf = 0\n",
"                    replacement = next_proxy(proxyList)\n",
"                    if replacement is None:\n",
"                        print('Proxy list exhausted; keeping', 'socks5://'+proxy)\n",
"                    else:\n",
"                        proxy = replacement\n",
"                        ydl_opts['proxy'] = 'socks5://'+proxy\n",
"                        print('NEW PROXY:', 'socks5://'+proxy)\n",
"                        ydl = youtube_dl.YoutubeDL(ydl_opts)\n",
"        print('metaBatch:', metaBatch, 'Finished.')\n",
"\n",
"# Test split: reuses the downloader (and proxy) left over from the dev pass.\n",
"with open(TEST_CSV_PATH, 'w', newline='') as csvOut, open(TEST_META_PATH, 'r') as fp:\n",
"    writer = csv.writer(csvOut)\n",
"    for line in fp:\n",
"        if len(line) != ENTRY_LINE_LENGTH:\n",
"            continue\n",
"        uuid, ytid = split_entry(line)\n",
"        try:\n",
"            info = ydl.extract_info(WATCH_URL+ytid, download=False)\n",
"            # These rows come from test.txt, so they are labelled 'test' (was 'dev').\n",
"            current = [uuid, ytid, info['upload_date'], 'test']\n",
"            writer.writerow(current)\n",
"        except Exception:\n",
"            # Failures are only printed, not written to the CSV.\n",
"            current = [uuid, ytid, 'TIMEDOUT', 'test']\n",
"        print(current)"
]
}
],
"metadata": {
"accelerator": "GPU",
"colab": {
"collapsed_sections": [],
"name": "Generate AGE label for Vox-Celeb dataset.ipynb",
"provenance": []
},
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.8"
}
},
"nbformat": 4,
"nbformat_minor": 4
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment