km4sh · August 14, 2021 00:39
diff --git a/Generate AGE labels for Vox-Celeb dataset.ipynb b/Generate AGE labels for Vox-Celeb dataset.ipynb
 {
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Generate AGE Label for Vox-Celeb Dataset (Out-of-date)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "cellView": "form",
    "id": "SYB8rOXt_WRf"
   },
   "source": [
    "## Install dependencies via `pip`"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "cellView": "form",
    "id": "SYB8rOXt_WRf"
   },
   "outputs": [],
   "source": [
    "# %pip (rather than !pip) installs into the environment of the running kernel.\n",
    "# `datetime` is part of the Python standard library, so it is not installed from PyPI.\n",
    "%pip install wptools\n",
    "%pip install pafy\n",
    "%pip install --upgrade youtube_dl"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "cellView": "form",
    "id": "4sPO8Ao_LuCG"
   },
   "source": [
    "## Download `vox-celeb` meta file"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "cellView": "form",
    "id": "4sPO8Ao_LuCG"
   },
   "outputs": [],
   "source": [
    "# -nc: skip archives that are already present, so a re-run does not save `.zip.1` copies\n",
    "# next to the files that zipinfo reads below.\n",
    "!wget -nc http://www.robots.ox.ac.uk/~vgg/data/voxceleb/data/vox2_test_txt.zip\n",
    "!wget -nc http://www.robots.ox.ac.uk/~vgg/data/voxceleb/data/vox2_dev_txt.zip\n",
    "# List, one per line, the archive members whose name matches '*/'.\n",
    "# NOTE(review): later cells read dev.txt/test.txt from '/content/drive/My Drive/datasets/';\n",
    "# presumably these listings are copied there by hand -- confirm.\n",
    "!zipinfo -1 vox2_dev_txt.zip '*/' > dev.txt\n",
    "!zipinfo -1 vox2_test_txt.zip '*/' > test.txt"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "bWcVgXzMCr_F"
   },
   "source": [
    "## Parse birth date from Wikipedia"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "cellView": "form",
    "id": "0iHwet85dxjU"
   },
   "outputs": [],
   "source": [
    "#@title Parse the celebs' age from wikipedia\n",
    "import csv\n",
    "import wptools as wp\n",
    "from datetime import datetime as dt\n",
    "\n",
    "# Number of leading rows of data.csv that are skipped (hard-coded resume point).\n",
    "ROWS_TO_SKIP = 5660\n",
    "# Timestamp layout this cell expects for the 'date of birth (P569)' value.\n",
    "BIRTH_FORMAT = '+%Y-%m-%dT%H:%M:%SZ'\n",
    "\n",
    "\n",
    "def full_years(birth, today):\n",
    "  \"\"\"Return the number of whole calendar years between `birth` and `today`.\"\"\"\n",
    "  # Dividing a day count by 365 drifts by one day per leap year and mis-ages people\n",
    "  # close to their birthday; comparing (month, day) is exact.\n",
    "  had_birthday = (today.month, today.day) >= (birth.month, birth.day)\n",
    "  return today.year - birth.year - (0 if had_birthday else 1)\n",
    "\n",
    "\n",
    "today = dt.today()\n",
    "# `with` flushes and closes both files even when the loop is interrupted;\n",
    "# newline='' is what the csv module asks for on files it reads or writes.\n",
    "with open('data.csv', 'r', newline='') as meta:\n",
    "  with open('/content/drive/My Drive/VoxCeleb_Age_Wiki.csv', 'w', newline='') as results:\n",
    "    reader = csv.reader(meta)\n",
    "    writer = csv.writer(results)\n",
    "    for a, row in enumerate(reader):\n",
    "      if a < ROWS_TO_SKIP:\n",
    "        continue\n",
    "      try:\n",
    "        print(row[4])\n",
    "        page = wp.page(row[4])\n",
    "        page.wanted_labels(['P569', 'P21', 'P31'])\n",
    "        page.get_wikidata()\n",
    "        orig = page.data['wikidata']['date of birth (P569)']\n",
    "        # The value may be a list; only its first entry is used.\n",
    "        if isinstance(orig, list):\n",
    "          orig = orig[0]\n",
    "        birth = dt.strptime(orig, BIRTH_FORMAT)\n",
    "        row[5] = str(full_years(birth, today))\n",
    "        print('success!')\n",
    "      except Exception as err:\n",
    "        # Best effort: the row is still written below, with column 5 left as it was.\n",
    "        # Catching Exception (not a bare except) lets an interrupt stop the loop.\n",
    "        print('lookup failed:', repr(err))\n",
    "      writer.writerow(row)\n",
    "      print(row)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "cellView": "form",
    "id": "qIdQMsF6_s-I"
   },
   "source": [
    "## Download free proxy list"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "cellView": "form",
    "id": "qIdQMsF6_s-I"
   },
   "outputs": [],
   "source": [
    "# -o instead of a shell redirect: the shell would truncate proxy-list.txt before curl runs,\n",
    "# so a failed download (-f) would leave an empty proxy list behind.\n",
    "!curl -sSf -o proxy-list.txt \"https://raw.githubusercontent.com/clarketm/proxy-list/master/proxy-list-raw.txt\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "nQMzxDwuFdLq"
   },
   "outputs": [],
   "source": [
    "# Split the dev listing into numbered files of BATCH_SIZE lines each (dev_1.txt, dev_2.txt, ...).\n",
    "DEV_META = '/content/drive/My Drive/datasets/dev.txt'\n",
    "SPLIT_PREFIX = '/content/drive/My Drive/datasets/splitedDevMeta/dev_'\n",
    "# Only lines of exactly this length (newline included) are kept; presumably these are the\n",
    "# 'txt/<speaker id>/<video id>/' directory entries -- confirm against dev.txt.\n",
    "VIDEO_LINE_LEN = 25\n",
    "BATCH_SIZE = 500\n",
    "\n",
    "count = 0\n",
    "batch = 1\n",
    "metaOut = open(SPLIT_PREFIX+str(batch)+'.txt', 'w')\n",
    "try:\n",
    "  with open(DEV_META, 'r') as meta:\n",
    "    # Iterating the file checks every line, including the first one.\n",
    "    for line in meta:\n",
    "      if len(line) != VIDEO_LINE_LEN:\n",
    "        continue\n",
    "      metaOut.write(line)\n",
    "      count = count + 1\n",
    "      if count >= BATCH_SIZE:\n",
    "        # Progress message, roughly: 'Five hundred done! Plenty more to go!'\n",
    "        print('五百个啦！还有好多！', batch)\n",
    "        count = 0\n",
    "        batch = batch + 1\n",
    "        metaOut.close()\n",
    "        metaOut = open(SPLIT_PREFIX+str(batch)+'.txt', 'w')\n",
    "finally:\n",
    "  # Close the batch file in progress even if reading fails part-way.\n",
    "  metaOut.close()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Parse video upload dates from YouTube"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 368
    },
    "id": "vFySMm3HRZNJ",
    "outputId": "56880f3d-be74-41ac-e56c-daf074d39809"
   },
   "outputs": [],
   "source": [
    "import os\n",
    "import csv\n",
    "import youtube_dl\n",
    "import time\n",
    "import datetime\n",
    "import logging\n",
    "\n",
    "DATASET_DIR = '/content/drive/My Drive/datasets/'\n",
    "# Only lines of exactly this length (newline included) are processed; presumably these are\n",
    "# the 'txt/<speaker id>/<video id>/' directory entries -- confirm against the listings.\n",
    "VIDEO_LINE_LEN = 25\n",
    "# Consecutive failed look-ups tolerated before switching to the next proxy.\n",
    "MAX_ERRORS_PER_PROXY = 20\n",
    "# Hard-coded window of dev_<n>.txt split files to process (both ends inclusive).\n",
    "FIRST_BATCH, LAST_BATCH = 100, 290\n",
    "\n",
    "logger = logging.getLogger(__name__)\n",
    "# Re-running the cell must not attach a second handler, or every log line is duplicated.\n",
    "if not logger.handlers:\n",
    "  f_handler = logging.FileHandler('/content/drive/My Drive/outputnov12.txt')\n",
    "  f_handler.setLevel(logging.DEBUG)\n",
    "  logger.addHandler(f_handler)\n",
    "\n",
    "ydl_opts = {\n",
    "    'outtmpl': '%(upload_date)s',\n",
    "    'skip_download': True,\n",
    "    'simulate': True,\n",
    "    'quiet': True,  # was misspelled 'slient'; the documented option name is 'quiet'\n",
    "    'logger': logger,\n",
    "    'socket_timeout': 5\n",
    "}\n",
    "\n",
    "\n",
    "def next_proxy(proxy_file):\n",
    "  \"\"\"Return the next non-blank line of `proxy_file` as a 'socks5://' URL, or None at end of file.\"\"\"\n",
    "  for raw in proxy_file:\n",
    "    # strip() removes the trailing newline, which must not end up inside the proxy URL.\n",
    "    address = raw.strip()\n",
    "    if address:\n",
    "      return 'socks5://'+address\n",
    "  return None\n",
    "\n",
    "\n",
    "def make_ydl(proxy):\n",
    "  \"\"\"Return a YoutubeDL built from `ydl_opts`, using `proxy` (or no proxy when it is None).\"\"\"\n",
    "  if proxy is None:\n",
    "    ydl_opts.pop('proxy', None)\n",
    "    print('NEW PROXY:', 'none left, connecting directly')\n",
    "  else:\n",
    "    ydl_opts['proxy'] = proxy\n",
    "    print('NEW PROXY:', proxy)\n",
    "  return youtube_dl.YoutubeDL(ydl_opts)\n",
    "\n",
    "\n",
    "def split_meta_line(line):\n",
    "  \"\"\"Return (uuid, ytid): the second-to-last and last directory names of a listing line.\"\"\"\n",
    "  head, tail = os.path.split(line)\n",
    "  head, ytid = os.path.split(head)\n",
    "  head, uuid = os.path.split(head)\n",
    "  return uuid, ytid\n",
    "\n",
    "\n",
    "# ---- dev split: successes are appended to the CSV, failed listing lines to wrongOut.txt ----\n",
    "proxyList = open('proxy-list.txt', 'r')\n",
    "proxy = next_proxy(proxyList)\n",
    "ydl = make_ydl(proxy)\n",
    "errBuf = 0\n",
    "count = 0\n",
    "csvOut = open(DATASET_DIR+'100p_upload_date.csv', 'a', newline='')\n",
    "wrongOut = open(DATASET_DIR+'wrongOut.txt', 'a')\n",
    "writer = csv.writer(csvOut)\n",
    "try:\n",
    "  for metaBatch in range(FIRST_BATCH, LAST_BATCH+1):\n",
    "    with open(DATASET_DIR+'splitedDevMeta/dev_'+str(metaBatch)+'.txt', 'r') as fp:\n",
    "      print('metaBatch:', metaBatch, 'opened.')\n",
    "      for line in fp:\n",
    "        if len(line) == VIDEO_LINE_LEN:\n",
    "          uuid, ytid = split_meta_line(line)\n",
    "          try:\n",
    "            info = ydl.extract_info('https://www.youtube.com/watch?v='+ytid, download=False)\n",
    "            writer.writerow([uuid, ytid, info['upload_date'], 'dev'])\n",
    "            errBuf = 0\n",
    "            print(datetime.datetime.now(), uuid, ytid, info['upload_date'], 'dev', metaBatch, count)\n",
    "          except Exception:\n",
    "            # Exception (not a bare except) keeps an interrupt able to stop the run.\n",
    "            wrongOut.write(line)\n",
    "            errBuf = errBuf + 1\n",
    "            print(datetime.datetime.now(), uuid, ytid, '#REFUSED', 'dev', metaBatch, count)\n",
    "          # Flush after every video so an interrupted session keeps what was fetched.\n",
    "          csvOut.flush()\n",
    "          wrongOut.flush()\n",
    "        count = count + 1\n",
    "\n",
    "        if errBuf >= MAX_ERRORS_PER_PROXY:\n",
    "          # Too many failures in a row: move on to the next proxy in the list.\n",
    "          count = 0\n",
    "          errBuf = 0\n",
    "          proxy = next_proxy(proxyList)\n",
    "          ydl = make_ydl(proxy)\n",
    "    print('metaBatch:', metaBatch, 'Finished.')\n",
    "finally:\n",
    "  # Close (and thereby flush) everything even if the run is interrupted.\n",
    "  csvOut.close()\n",
    "  wrongOut.close()\n",
    "  proxyList.close()\n",
    "\n",
    "# ---- test split: uses whichever `ydl` the dev loop ended with ----\n",
    "with open(DATASET_DIR+'testOut.csv', 'w', newline='') as csvOut:\n",
    "  writer = csv.writer(csvOut)\n",
    "  with open(DATASET_DIR+'test.txt', 'r') as fp:\n",
    "    for line in fp:\n",
    "      if len(line) != VIDEO_LINE_LEN:\n",
    "        continue\n",
    "      uuid, ytid = split_meta_line(line)\n",
    "      try:\n",
    "        info = ydl.extract_info('https://www.youtube.com/watch?v='+ytid, download=False)\n",
    "        # Rows from test.txt are labelled 'test' (they used to be written as 'dev').\n",
    "        current = [uuid, ytid, info['upload_date'], 'test']\n",
    "        writer.writerow(current)\n",
    "      except Exception:\n",
    "        # Failed look-ups are only printed, not written to testOut.csv.\n",
    "        current = [uuid, ytid, 'TIMEDOUT', 'test']\n",
    "      print(current)"
   ]
  }
 ],
 "metadata": {
  "accelerator": "GPU",
  "colab": {
   "collapsed_sections": [],
   "name": "Generate AGE label for Vox-Celeb dataset.ipynb",
   "provenance": []
  },
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
 }
	{
	"cells": [
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"# Generate AGE Label for Vox-Celeb Dataset (Out-of-date)"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {
	"cellView": "form",
	"id": "SYB8rOXt_WRf"
	},
	"source": [
	"## Install dependencies via `pip`"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"cellView": "form",
	"id": "SYB8rOXt_WRf"
	},
	"outputs": [],
	"source": [
	"# %pip (rather than !pip) installs into the environment of the running kernel.\n",
	"# `datetime` is part of the Python standard library, so it is not installed from PyPI.\n",
	"%pip install wptools\n",
	"%pip install pafy\n",
	"%pip install --upgrade youtube_dl"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {
	"cellView": "form",
	"id": "4sPO8Ao_LuCG"
	},
	"source": [
	"## Download `vox-celeb` meta file"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"cellView": "form",
	"id": "4sPO8Ao_LuCG"
	},
	"outputs": [],
	"source": [
	"# -nc: skip archives that are already present, so a re-run does not save `.zip.1` copies\n",
	"# next to the files that zipinfo reads below.\n",
	"!wget -nc http://www.robots.ox.ac.uk/~vgg/data/voxceleb/data/vox2_test_txt.zip\n",
	"!wget -nc http://www.robots.ox.ac.uk/~vgg/data/voxceleb/data/vox2_dev_txt.zip\n",
	"# List, one per line, the archive members whose name matches '*/'.\n",
	"# NOTE(review): later cells read dev.txt/test.txt from '/content/drive/My Drive/datasets/';\n",
	"# presumably these listings are copied there by hand -- confirm.\n",
	"!zipinfo -1 vox2_dev_txt.zip '*/' > dev.txt\n",
	"!zipinfo -1 vox2_test_txt.zip '*/' > test.txt"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {
	"id": "bWcVgXzMCr_F"
	},
	"source": [
	"## Parse birth date from Wikipedia"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"cellView": "form",
	"id": "0iHwet85dxjU"
	},
	"outputs": [],
	"source": [
	"#@title Parse the celebs' age from wikipedia\n",
	"import csv\n",
	"import wptools as wp\n",
	"from datetime import datetime as dt\n",
	"\n",
	"# Number of leading rows of data.csv that are skipped (hard-coded resume point).\n",
	"ROWS_TO_SKIP = 5660\n",
	"# Timestamp layout this cell expects for the 'date of birth (P569)' value.\n",
	"BIRTH_FORMAT = '+%Y-%m-%dT%H:%M:%SZ'\n",
	"\n",
	"\n",
	"def full_years(birth, today):\n",
	"  \"\"\"Return the number of whole calendar years between `birth` and `today`.\"\"\"\n",
	"  # Dividing a day count by 365 drifts by one day per leap year and mis-ages people\n",
	"  # close to their birthday; comparing (month, day) is exact.\n",
	"  had_birthday = (today.month, today.day) >= (birth.month, birth.day)\n",
	"  return today.year - birth.year - (0 if had_birthday else 1)\n",
	"\n",
	"\n",
	"today = dt.today()\n",
	"# `with` flushes and closes both files even when the loop is interrupted;\n",
	"# newline='' is what the csv module asks for on files it reads or writes.\n",
	"with open('data.csv', 'r', newline='') as meta:\n",
	"  with open('/content/drive/My Drive/VoxCeleb_Age_Wiki.csv', 'w', newline='') as results:\n",
	"    reader = csv.reader(meta)\n",
	"    writer = csv.writer(results)\n",
	"    for a, row in enumerate(reader):\n",
	"      if a < ROWS_TO_SKIP:\n",
	"        continue\n",
	"      try:\n",
	"        print(row[4])\n",
	"        page = wp.page(row[4])\n",
	"        page.wanted_labels(['P569', 'P21', 'P31'])\n",
	"        page.get_wikidata()\n",
	"        orig = page.data['wikidata']['date of birth (P569)']\n",
	"        # The value may be a list; only its first entry is used.\n",
	"        if isinstance(orig, list):\n",
	"          orig = orig[0]\n",
	"        birth = dt.strptime(orig, BIRTH_FORMAT)\n",
	"        row[5] = str(full_years(birth, today))\n",
	"        print('success!')\n",
	"      except Exception as err:\n",
	"        # Best effort: the row is still written below, with column 5 left as it was.\n",
	"        # Catching Exception (not a bare except) lets an interrupt stop the loop.\n",
	"        print('lookup failed:', repr(err))\n",
	"      writer.writerow(row)\n",
	"      print(row)"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {
	"cellView": "form",
	"id": "qIdQMsF6_s-I"
	},
	"source": [
	"## Download free proxy list"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"cellView": "form",
	"id": "qIdQMsF6_s-I"
	},
	"outputs": [],
	"source": [
	"# -o instead of a shell redirect: the shell would truncate proxy-list.txt before curl runs,\n",
	"# so a failed download (-f) would leave an empty proxy list behind.\n",
	"!curl -sSf -o proxy-list.txt \"https://raw.githubusercontent.com/clarketm/proxy-list/master/proxy-list-raw.txt\""
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"id": "nQMzxDwuFdLq"
	},
	"outputs": [],
	"source": [
	"# Split the dev listing into numbered files of BATCH_SIZE lines each (dev_1.txt, dev_2.txt, ...).\n",
	"DEV_META = '/content/drive/My Drive/datasets/dev.txt'\n",
	"SPLIT_PREFIX = '/content/drive/My Drive/datasets/splitedDevMeta/dev_'\n",
	"# Only lines of exactly this length (newline included) are kept; presumably these are the\n",
	"# 'txt/<speaker id>/<video id>/' directory entries -- confirm against dev.txt.\n",
	"VIDEO_LINE_LEN = 25\n",
	"BATCH_SIZE = 500\n",
	"\n",
	"count = 0\n",
	"batch = 1\n",
	"metaOut = open(SPLIT_PREFIX+str(batch)+'.txt', 'w')\n",
	"try:\n",
	"  with open(DEV_META, 'r') as meta:\n",
	"    # Iterating the file checks every line, including the first one.\n",
	"    for line in meta:\n",
	"      if len(line) != VIDEO_LINE_LEN:\n",
	"        continue\n",
	"      metaOut.write(line)\n",
	"      count = count + 1\n",
	"      if count >= BATCH_SIZE:\n",
	"        # Progress message, roughly: 'Five hundred done! Plenty more to go!'\n",
	"        print('五百个啦！还有好多！', batch)\n",
	"        count = 0\n",
	"        batch = batch + 1\n",
	"        metaOut.close()\n",
	"        metaOut = open(SPLIT_PREFIX+str(batch)+'.txt', 'w')\n",
	"finally:\n",
	"  # Close the batch file in progress even if reading fails part-way.\n",
	"  metaOut.close()"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"## Parse video upload dates from YouTube"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"colab": {
	"base_uri": "https://localhost:8080/",
	"height": 368
	},
	"id": "vFySMm3HRZNJ",
	"outputId": "56880f3d-be74-41ac-e56c-daf074d39809"
	},
	"outputs": [],
	"source": [
	"import os\n",
	"import csv\n",
	"import youtube_dl\n",
	"import time\n",
	"import datetime\n",
	"import logging\n",
	"\n",
	"DATASET_DIR = '/content/drive/My Drive/datasets/'\n",
	"# Only lines of exactly this length (newline included) are processed; presumably these are\n",
	"# the 'txt/<speaker id>/<video id>/' directory entries -- confirm against the listings.\n",
	"VIDEO_LINE_LEN = 25\n",
	"# Consecutive failed look-ups tolerated before switching to the next proxy.\n",
	"MAX_ERRORS_PER_PROXY = 20\n",
	"# Hard-coded window of dev_<n>.txt split files to process (both ends inclusive).\n",
	"FIRST_BATCH, LAST_BATCH = 100, 290\n",
	"\n",
	"logger = logging.getLogger(__name__)\n",
	"# Re-running the cell must not attach a second handler, or every log line is duplicated.\n",
	"if not logger.handlers:\n",
	"  f_handler = logging.FileHandler('/content/drive/My Drive/outputnov12.txt')\n",
	"  f_handler.setLevel(logging.DEBUG)\n",
	"  logger.addHandler(f_handler)\n",
	"\n",
	"ydl_opts = {\n",
	"    'outtmpl': '%(upload_date)s',\n",
	"    'skip_download': True,\n",
	"    'simulate': True,\n",
	"    'quiet': True,  # was misspelled 'slient'; the documented option name is 'quiet'\n",
	"    'logger': logger,\n",
	"    'socket_timeout': 5\n",
	"}\n",
	"\n",
	"\n",
	"def next_proxy(proxy_file):\n",
	"  \"\"\"Return the next non-blank line of `proxy_file` as a 'socks5://' URL, or None at end of file.\"\"\"\n",
	"  for raw in proxy_file:\n",
	"    # strip() removes the trailing newline, which must not end up inside the proxy URL.\n",
	"    address = raw.strip()\n",
	"    if address:\n",
	"      return 'socks5://'+address\n",
	"  return None\n",
	"\n",
	"\n",
	"def make_ydl(proxy):\n",
	"  \"\"\"Return a YoutubeDL built from `ydl_opts`, using `proxy` (or no proxy when it is None).\"\"\"\n",
	"  if proxy is None:\n",
	"    ydl_opts.pop('proxy', None)\n",
	"    print('NEW PROXY:', 'none left, connecting directly')\n",
	"  else:\n",
	"    ydl_opts['proxy'] = proxy\n",
	"    print('NEW PROXY:', proxy)\n",
	"  return youtube_dl.YoutubeDL(ydl_opts)\n",
	"\n",
	"\n",
	"def split_meta_line(line):\n",
	"  \"\"\"Return (uuid, ytid): the second-to-last and last directory names of a listing line.\"\"\"\n",
	"  head, tail = os.path.split(line)\n",
	"  head, ytid = os.path.split(head)\n",
	"  head, uuid = os.path.split(head)\n",
	"  return uuid, ytid\n",
	"\n",
	"\n",
	"# ---- dev split: successes are appended to the CSV, failed listing lines to wrongOut.txt ----\n",
	"proxyList = open('proxy-list.txt', 'r')\n",
	"proxy = next_proxy(proxyList)\n",
	"ydl = make_ydl(proxy)\n",
	"errBuf = 0\n",
	"count = 0\n",
	"csvOut = open(DATASET_DIR+'100p_upload_date.csv', 'a', newline='')\n",
	"wrongOut = open(DATASET_DIR+'wrongOut.txt', 'a')\n",
	"writer = csv.writer(csvOut)\n",
	"try:\n",
	"  for metaBatch in range(FIRST_BATCH, LAST_BATCH+1):\n",
	"    with open(DATASET_DIR+'splitedDevMeta/dev_'+str(metaBatch)+'.txt', 'r') as fp:\n",
	"      print('metaBatch:', metaBatch, 'opened.')\n",
	"      for line in fp:\n",
	"        if len(line) == VIDEO_LINE_LEN:\n",
	"          uuid, ytid = split_meta_line(line)\n",
	"          try:\n",
	"            info = ydl.extract_info('https://www.youtube.com/watch?v='+ytid, download=False)\n",
	"            writer.writerow([uuid, ytid, info['upload_date'], 'dev'])\n",
	"            errBuf = 0\n",
	"            print(datetime.datetime.now(), uuid, ytid, info['upload_date'], 'dev', metaBatch, count)\n",
	"          except Exception:\n",
	"            # Exception (not a bare except) keeps an interrupt able to stop the run.\n",
	"            wrongOut.write(line)\n",
	"            errBuf = errBuf + 1\n",
	"            print(datetime.datetime.now(), uuid, ytid, '#REFUSED', 'dev', metaBatch, count)\n",
	"          # Flush after every video so an interrupted session keeps what was fetched.\n",
	"          csvOut.flush()\n",
	"          wrongOut.flush()\n",
	"        count = count + 1\n",
	"\n",
	"        if errBuf >= MAX_ERRORS_PER_PROXY:\n",
	"          # Too many failures in a row: move on to the next proxy in the list.\n",
	"          count = 0\n",
	"          errBuf = 0\n",
	"          proxy = next_proxy(proxyList)\n",
	"          ydl = make_ydl(proxy)\n",
	"    print('metaBatch:', metaBatch, 'Finished.')\n",
	"finally:\n",
	"  # Close (and thereby flush) everything even if the run is interrupted.\n",
	"  csvOut.close()\n",
	"  wrongOut.close()\n",
	"  proxyList.close()\n",
	"\n",
	"# ---- test split: uses whichever `ydl` the dev loop ended with ----\n",
	"with open(DATASET_DIR+'testOut.csv', 'w', newline='') as csvOut:\n",
	"  writer = csv.writer(csvOut)\n",
	"  with open(DATASET_DIR+'test.txt', 'r') as fp:\n",
	"    for line in fp:\n",
	"      if len(line) != VIDEO_LINE_LEN:\n",
	"        continue\n",
	"      uuid, ytid = split_meta_line(line)\n",
	"      try:\n",
	"        info = ydl.extract_info('https://www.youtube.com/watch?v='+ytid, download=False)\n",
	"        # Rows from test.txt are labelled 'test' (they used to be written as 'dev').\n",
	"        current = [uuid, ytid, info['upload_date'], 'test']\n",
	"        writer.writerow(current)\n",
	"      except Exception:\n",
	"        # Failed look-ups are only printed, not written to testOut.csv.\n",
	"        current = [uuid, ytid, 'TIMEDOUT', 'test']\n",
	"      print(current)"
	]
	}
	],
	"metadata": {
	"accelerator": "GPU",
	"colab": {
	"collapsed_sections": [],
	"name": "Generate AGE label for Vox-Celeb dataset.ipynb",
	"provenance": []
	},
	"kernelspec": {
	"display_name": "Python 3",
	"language": "python",
	"name": "python3"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 3
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython3",
	"version": "3.8.8"
	}
	},
	"nbformat": 4,
	"nbformat_minor": 4
	}