Last active
August 14, 2021 00:39
-
-
Save km4sh/3dd9d45ae163e9966ac295c56661a4df to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# Generate AGE Label for Vox-Celeb Dataset (Out-of-date)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"cellView": "form", | |
"id": "SYB8rOXt_WRf" | |
}, | |
"source": [ | |
"## Install dependencies via `pip`" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"cellView": "form", | |
"id": "SYB8rOXt_WRf" | |
}, | |
"outputs": [], | |
"source": [ | |
"# Install the third-party packages used by this notebook.\n", | |
"# `datetime` is part of the Python standard library, so it is not installed from PyPI.\n", | |
"# `%pip` (rather than `!pip`) installs into the environment of the running kernel.\n", | |
"%pip install wptools\n", | |
"%pip install pafy\n", | |
"%pip install --upgrade youtube_dl" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"cellView": "form", | |
"id": "4sPO8Ao_LuCG" | |
}, | |
"source": [ | |
"## Download `vox-celeb` meta file" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"cellView": "form", | |
"id": "4sPO8Ao_LuCG" | |
}, | |
"outputs": [], | |
"source": [ | |
"# Fetch the VoxCeleb2 test and dev text-metadata archives into the working directory.\n", | |
"!wget http://www.robots.ox.ac.uk/~vgg/data/voxceleb/data/vox2_test_txt.zip\n", | |
"!wget http://www.robots.ox.ac.uk/~vgg/data/voxceleb/data/vox2_dev_txt.zip\n", | |
"# `zipinfo -1` prints one archive member name per line; the '*/' pattern keeps only\n", | |
"# names ending in '/', i.e. the directory entries. The archives are not extracted.\n", | |
"# NOTE(review): later cells read dev.txt/test.txt from Google Drive, while these\n", | |
"# listings are written to the working directory - confirm how they get there.\n", | |
"!zipinfo -1 vox2_dev_txt.zip '*/' > dev.txt\n", | |
"!zipinfo -1 vox2_test_txt.zip '*/' > test.txt" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "bWcVgXzMCr_F" | |
}, | |
"source": [ | |
"## Parse birth dates from Wikipedia" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"cellView": "form", | |
"id": "0iHwet85dxjU" | |
}, | |
"outputs": [], | |
"source": [ | |
"#@title Parse the celebs' age from wikipedia\n", | |
"import csv\n", | |
"import wptools as wp\n", | |
"from datetime import datetime as dt\n", | |
"\n", | |
"# Number of leading rows of data.csv to skip\n", | |
"# (presumably already handled by an earlier run - TODO confirm).\n", | |
"SKIP_ROWS = 5660\n", | |
"# Timestamp layout of the Wikidata 'date of birth (P569)' values parsed below.\n", | |
"BIRTH_FORMAT = '+%Y-%m-%dT%H:%M:%SZ'\n", | |
"\n", | |
"\n", | |
"def age_in_years(birth, today):\n", | |
"    \"\"\"Return the number of whole years elapsed between `birth` and `today`.\n", | |
"\n", | |
"    Compares calendar fields instead of dividing a day count by 365, so leap\n", | |
"    days cannot push the result one year too high.\n", | |
"    \"\"\"\n", | |
"    had_birthday = (today.month, today.day) >= (birth.month, birth.day)\n", | |
"    return today.year - birth.year - (0 if had_birthday else 1)\n", | |
"\n", | |
"\n", | |
"# NOTE(review): the output is opened with 'w', so the skipped rows are not carried\n", | |
"# over into it - confirm that this is intended.\n", | |
"# `with` closes (and therefore flushes) both files even if the run is interrupted.\n", | |
"with open('data.csv', 'r', newline='') as meta:\n", | |
"    with open('/content/drive/My Drive/VoxCeleb_Age_Wiki.csv', 'w', newline='') as results:\n", | |
"        writer = csv.writer(results)\n", | |
"        for index, row in enumerate(csv.reader(meta)):\n", | |
"            if index < SKIP_ROWS:\n", | |
"                continue\n", | |
"            print('success!')\n", | |
"            try:\n", | |
"                # Column 4 is used as the page title; column 5 receives the age.\n", | |
"                print(row[4])\n", | |
"                page = wp.page(row[4])\n", | |
"                page.wanted_labels(['P569', 'P21', 'P31'])\n", | |
"                page.get_wikidata()\n", | |
"                orig = page.data['wikidata']['date of birth (P569)']\n", | |
"                # The claim may be a single value or a list; use the first entry of a list.\n", | |
"                if isinstance(orig, list):\n", | |
"                    orig = orig[0]\n", | |
"                birth = dt.strptime(orig, BIRTH_FORMAT)\n", | |
"                row[5] = str(age_in_years(birth, dt.today()))\n", | |
"                writer.writerow(row)\n", | |
"                print(row)\n", | |
"            except Exception as err:\n", | |
"                # Best effort: keep the row unchanged when the lookup or parsing fails,\n", | |
"                # but report why. KeyboardInterrupt is no longer swallowed.\n", | |
"                print('lookup failed:', row, repr(err))\n", | |
"                writer.writerow(row)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"cellView": "form", | |
"id": "qIdQMsF6_s-I" | |
}, | |
"source": [ | |
"## Download free proxy list" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"cellView": "form", | |
"id": "qIdQMsF6_s-I" | |
}, | |
"outputs": [], | |
"source": [ | |
"# Save a public raw proxy list to proxy-list.txt in the working directory.\n", | |
"# -s hides the progress meter, -S still prints errors, -f makes curl fail on HTTP errors.\n", | |
"# NOTE(review): the entries come from an untrusted third-party list - confirm this is acceptable.\n", | |
"!curl -sSf \"https://raw.githubusercontent.com/clarketm/proxy-list/master/proxy-list-raw.txt\" > proxy-list.txt" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"id": "nQMzxDwuFdLq" | |
}, | |
"outputs": [], | |
"source": [ | |
"# Split the dev directory listing into files of BATCH_SIZE entries each\n", | |
"# (dev_1.txt, dev_2.txt, ...), keeping only lines of exactly ENTRY_LENGTH characters.\n", | |
"# Presumably those are the per-video directory lines - verify against dev.txt.\n", | |
"ENTRY_LENGTH = 25\n", | |
"BATCH_SIZE = 500\n", | |
"SPLIT_TEMPLATE = '/content/drive/My Drive/datasets/splitedDevMeta/dev_{}.txt'\n", | |
"\n", | |
"count = 0\n", | |
"batch = 1\n", | |
"metaOut = open(SPLIT_TEMPLATE.format(batch), 'w')\n", | |
"try:\n", | |
"    with open('/content/drive/My Drive/datasets/dev.txt', 'r') as meta:\n", | |
"        # Iterating the file examines every line, including the first one, which the\n", | |
"        # previous read-before-loop / read-at-top-of-loop pattern skipped.\n", | |
"        for line in meta:\n", | |
"            if len(line) != ENTRY_LENGTH:\n", | |
"                continue\n", | |
"            metaOut.write(line)\n", | |
"            count = count + 1\n", | |
"            if count >= BATCH_SIZE:\n", | |
"                # Progress message, roughly: 'Five hundred done! Still plenty to go!'\n", | |
"                print('五百个啦!还有好多!', batch)\n", | |
"                count = 0\n", | |
"                batch = batch + 1\n", | |
"                metaOut.close()\n", | |
"                metaOut = open(SPLIT_TEMPLATE.format(batch), 'w')\n", | |
"finally:\n", | |
"    # Always close the batch file that is currently open, even after an error.\n", | |
"    metaOut.close()" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Parse video upload dates from YouTube" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 368 | |
}, | |
"id": "vFySMm3HRZNJ", | |
"outputId": "56880f3d-be74-41ac-e56c-daf074d39809" | |
}, | |
"outputs": [], | |
"source": [ | |
"import os\n", | |
"import csv\n", | |
"import youtube_dl\n", | |
"import time\n", | |
"import datetime\n", | |
"import logging\n", | |
"\n", | |
"DATASET_DIR = '/content/drive/My Drive/datasets'\n", | |
"WATCH_URL = 'https://www.youtube.com/watch?v='\n", | |
"# Only lines of exactly this length are treated as video entries\n", | |
"# (presumably '<root>/<speaker id>/<video id>/' plus newline - verify).\n", | |
"ENTRY_LENGTH = 25\n", | |
"# Switch to the next proxy after this many consecutive failed lookups.\n", | |
"MAX_CONSECUTIVE_ERRORS = 20\n", | |
"\n", | |
"logger = logging.getLogger(__name__)\n", | |
"# Attach the file handler only once so re-running the cell does not duplicate log lines.\n", | |
"if not logger.handlers:\n", | |
"    f_handler = logging.FileHandler('/content/drive/My Drive/outputnov12.txt')\n", | |
"    f_handler.setLevel(logging.DEBUG)\n", | |
"    logger.addHandler(f_handler)\n", | |
"# NOTE(review): the logger's own level is left unset, so it inherits the root level;\n", | |
"# confirm whether DEBUG records are meant to reach the file.\n", | |
"\n", | |
"\n", | |
"def split_entry(line):\n", | |
"    \"\"\"Return (uuid, ytid): the last two directory names of a listing line.\n", | |
"\n", | |
"    The first split drops whatever follows the final '/', i.e. the newline.\n", | |
"    \"\"\"\n", | |
"    video_dir, _ = os.path.split(line)\n", | |
"    speaker_dir, ytid = os.path.split(video_dir)\n", | |
"    _, uuid = os.path.split(speaker_dir)\n", | |
"    return uuid, ytid\n", | |
"\n", | |
"\n", | |
"# Read every proxy up front, stripped of its line ending (a trailing newline used\n", | |
"# to end up inside the proxy URL), and close the list file immediately.\n", | |
"with open('proxy-list.txt', 'r') as proxyList:\n", | |
"    proxies = iter([entry.strip() for entry in proxyList if entry.strip()])\n", | |
"\n", | |
"proxy = next(proxies, None)\n", | |
"if proxy is None:\n", | |
"    raise RuntimeError('proxy-list.txt contains no proxies')\n", | |
"# NOTE(review): every entry is used as a SOCKS5 proxy - confirm the list really\n", | |
"# contains SOCKS5 endpoints.\n", | |
"print('NEW PROXY:', 'socks5://'+proxy)\n", | |
"ydl_opts = {\n", | |
"    'outtmpl': '%(upload_date)s',\n", | |
"    'skip_download': True,\n", | |
"    'simulate': True,\n", | |
"    # Was misspelled 'slient', which youtube_dl does not recognise.\n", | |
"    'quiet': True,\n", | |
"    'logger': logger,\n", | |
"    'proxy': 'socks5://'+proxy,\n", | |
"    'socket_timeout': 5\n", | |
"}\n", | |
"ydl = youtube_dl.YoutubeDL(ydl_opts)\n", | |
"\n", | |
"# --- dev split: append upload dates; failed entries go to wrongOut.txt ---\n", | |
"errBuf = 0\n", | |
"count = 0\n", | |
"with open(DATASET_DIR + '/100p_upload_date.csv', 'a', newline='') as csvOut, open(DATASET_DIR + '/wrongOut.txt', 'a') as wrongOut:\n", | |
"    writer = csv.writer(csvOut)\n", | |
"    # Only batches 100..290 are processed (presumably the rest were handled\n", | |
"    # separately - TODO confirm).\n", | |
"    for metaBatch in range(100, 291):\n", | |
"        with open(DATASET_DIR + '/splitedDevMeta/dev_'+str(metaBatch)+'.txt', 'r') as fp:\n", | |
"            print('metaBatch:', metaBatch, 'opened.')\n", | |
"            for line in fp:\n", | |
"                if len(line) == ENTRY_LENGTH:\n", | |
"                    uuid, ytid = split_entry(line)\n", | |
"                    try:\n", | |
"                        info = ydl.extract_info(WATCH_URL+ytid, download=False)\n", | |
"                        writer.writerow([uuid, ytid, info['upload_date'], 'dev'])\n", | |
"                        errBuf = 0\n", | |
"                        print(datetime.datetime.now(), uuid, ytid, info['upload_date'], 'dev', metaBatch, count)\n", | |
"                    except Exception:\n", | |
"                        # Best effort: remember the failed entry and carry on.\n", | |
"                        # KeyboardInterrupt is no longer swallowed.\n", | |
"                        wrongOut.writelines(line)\n", | |
"                        errBuf = errBuf + 1\n", | |
"                        print(datetime.datetime.now(), uuid, ytid, '#REFUSED', 'dev', metaBatch, count)\n", | |
"                    # Persist progress after every lookup so an interruption loses nothing.\n", | |
"                    csvOut.flush()\n", | |
"                    wrongOut.flush()\n", | |
"                count = count + 1\n", | |
"\n", | |
"                if errBuf >= MAX_CONSECUTIVE_ERRORS:\n", | |
"                    count = 0\n", | |
"                    errBuf = 0\n", | |
"                    candidate = next(proxies, None)\n", | |
"                    if candidate is None:\n", | |
"                        # Do not fall back to the invalid bare URL 'socks5://'.\n", | |
"                        print('Proxy list exhausted; keeping', 'socks5://'+proxy)\n", | |
"                    else:\n", | |
"                        proxy = candidate\n", | |
"                        ydl_opts['proxy'] = 'socks5://'+proxy\n", | |
"                        print('NEW PROXY:', 'socks5://'+proxy)\n", | |
"                        ydl = youtube_dl.YoutubeDL(ydl_opts)\n", | |
"        print('metaBatch:', metaBatch, 'Finished.')\n", | |
"\n", | |
"# --- test split: same lookup, written to a fresh CSV ---\n", | |
"with open(DATASET_DIR + '/testOut.csv', 'w', newline='') as csvOut, open(DATASET_DIR + '/test.txt', 'r') as fp:\n", | |
"    writer = csv.writer(csvOut)\n", | |
"    for line in fp:\n", | |
"        if len(line) == ENTRY_LENGTH:\n", | |
"            uuid, ytid = split_entry(line)\n", | |
"            try:\n", | |
"                info = ydl.extract_info(WATCH_URL+ytid, download=False)\n", | |
"                # These rows belong to the test split (they were labelled 'dev' before).\n", | |
"                current = [uuid, ytid, info['upload_date'], 'test']\n", | |
"                writer.writerow(current)\n", | |
"            except Exception:\n", | |
"                # Failed lookups are only printed, not written to the CSV.\n", | |
"                current = [uuid, ytid, 'TIMEDOUT', 'test']\n", | |
"            print(current)" | |
] | |
} | |
], | |
"metadata": { | |
"accelerator": "GPU", | |
"colab": { | |
"collapsed_sections": [], | |
"name": "Generate AGE label for Vox-Celeb dataset.ipynb", | |
"provenance": [] | |
}, | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.8.8" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 4 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment