dlmrr · October 22, 2021 18:01
diff --git a/facebook_extraction.ipynb b/facebook_extraction.ipynb
 {
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ab34207b-cd0c-4750-b7a5-cc67e2515077",
   "metadata": {},
   "outputs": [],
   "source": [
    "import zipfile\n",
    "import rarfile\n",
    "import pandas as pd\n",
    "from glob import glob\n",
    "import io\n",
    "import re\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "52f67be4-db9c-4674-9a66-8bd103d29db7",
   "metadata": {},
   "outputs": [],
   "source": [
    "\"\"\"\n",
    "This code didn't work for some countries, because of unusual formats, errors, or files too big to fit in memory.\n",
    "For those countries I took a custom approach that I won't detail here, but in general I manually unzipped the file and split it into chunks with the command line\n",
    "before processing them.\n",
    "Those countries were: Italy, Algeria, Morocco, Turkey, Sudan, Jordan, Palestine.\n",
    "Countries I didn't process, because I couldn't find a way or because they were too big: Saudi Arabia and Egypt.\n",
    "\"\"\"\n",
    "\n",
    "_NAME_COLUMNS = [\"first_name\", \"surname\", \"gender\"]\n",
    "\n",
    "\n",
    "def get_sep(file):\n",
    "    \"\"\"Guess the field separator of a file from its beginning.\n",
    "\n",
    "    The separator is sometimes a comma and sometimes a colon. The first 5000\n",
    "    items of `file` are converted with `str()`; a comma is returned only when\n",
    "    commas strictly outnumber colons, otherwise (ties included) a colon.\n",
    "    \"\"\"\n",
    "    sample = str(file[:5000])\n",
    "    n_commas, n_colons = (sample.count(mark) for mark in (\",\", \":\"))\n",
    "    return \",\" if n_commas > n_colons else \":\"\n",
    "\n",
    "\n",
    "def _is_directory(info):\n",
    "    \"\"\"Return True when an archive entry is a directory rather than a file.\"\"\"\n",
    "    # ZipInfo offers is_dir(); for rarfile the method name depends on the\n",
    "    # release (is_dir or isdir), so both spellings are looked up.\n",
    "    check = getattr(info, \"is_dir\", None) or getattr(info, \"isdir\", None)\n",
    "    if callable(check):\n",
    "        return bool(check())\n",
    "    return info.filename.endswith(\"/\")\n",
    "\n",
    "\n",
    "def _parse_member(raw):\n",
    "    \"\"\"Turn the raw bytes of one archive member into a names DataFrame.\n",
    "\n",
    "    Columns 2, 3 and 4 of the headerless file are kept and renamed to\n",
    "    first_name, surname and gender. The c engine is tried first; if it\n",
    "    raises, the same text is parsed again with the python engine.\n",
    "    \"\"\"\n",
    "    sep = get_sep(raw)\n",
    "    text = raw.decode(\"utf-8\")  # decoded once, reused by the fallback parse\n",
    "\n",
    "    def read_with(engine):\n",
    "        # Read one million rows at a time and stitch the chunks together.\n",
    "        reader = pd.read_table(\n",
    "            io.StringIO(text),\n",
    "            sep=sep,\n",
    "            header=None,\n",
    "            usecols=[2, 3, 4],\n",
    "            quoting=3,  # csv.QUOTE_NONE: quote characters are plain data\n",
    "            engine=engine,\n",
    "            dtype=pd.StringDtype(),\n",
    "            chunksize=1000000,\n",
    "        )\n",
    "        return pd.concat(list(reader))\n",
    "\n",
    "    try:\n",
    "        df = read_with(\"c\")\n",
    "    except Exception:  # not a bare except, so Ctrl-C still stops the run\n",
    "        # Sometimes the c engine fails; the python engine can sometimes do the trick.\n",
    "        print(\"switching to python engine\")\n",
    "        df = read_with(\"python\")\n",
    "\n",
    "    df.columns = list(_NAME_COLUMNS)\n",
    "    return df\n",
    "\n",
    "\n",
    "def _archive_to_df(archive, archive_name):\n",
    "    \"\"\"Parse every file member of an opened archive and concatenate the results.\n",
    "\n",
    "    Raises ValueError when the archive holds no file member at all.\n",
    "    \"\"\"\n",
    "    frames = []\n",
    "    for info in archive.infolist():\n",
    "        if _is_directory(info):\n",
    "            continue  # directory entries carry no data to parse\n",
    "        print(info.filename)\n",
    "        with archive.open(info.filename) as member:\n",
    "            frames.append(_parse_member(member.read()))\n",
    "    if not frames:\n",
    "        raise ValueError(f\"no files to read in archive {archive_name!r}\")\n",
    "    return pd.concat(frames)\n",
    "\n",
    "\n",
    "# The files are too big to unzip before loading for my hard drive, so they are read directly from the archive.\n",
    "def get_zip_df(zip_file):\n",
    "    \"\"\"Load first_name / surname / gender from every file inside a zip archive.\"\"\"\n",
    "    with zipfile.ZipFile(zip_file, 'r') as archive:\n",
    "        return _archive_to_df(archive, zip_file)\n",
    "\n",
    "\n",
    "def get_rar_df(zip_file):\n",
    "    \"\"\"Load first_name / surname / gender from every file inside a rar archive.\"\"\"\n",
    "    with rarfile.RarFile(zip_file, 'r') as archive:\n",
    "        return _archive_to_df(archive, zip_file)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1bc71c2a-a70d-430f-a604-ef714f4a86ee",
   "metadata": {},
   "outputs": [],
   "source": [
    "def export_name_counts(archives, loader):\n",
    "    \"\"\"Write surname and first-name frequency tables for each archive.\n",
    "\n",
    "    archives -- archive file names; the name without its extension is used\n",
    "                as the country label in messages and output file names.\n",
    "    loader   -- function that turns one archive name into a DataFrame with\n",
    "                \"first_name\", \"surname\" and \"gender\" columns.\n",
    "\n",
    "    For every archive, surnames/<country>_surnames.csv and\n",
    "    first_names/<country>_first_names.csv are written.\n",
    "    \"\"\"\n",
    "    import os  # imported here because the notebook's import cell has no os\n",
    "\n",
    "    # to_csv does not create missing folders, so make sure both exist.\n",
    "    os.makedirs(\"surnames\", exist_ok=True)\n",
    "    os.makedirs(\"first_names\", exist_ok=True)\n",
    "\n",
    "    for n, file in enumerate(archives):\n",
    "        print(\"Country n°\", n)\n",
    "        # Strip only the trailing extension; str.replace would also remove\n",
    "        # an extension-like fragment from the middle of the name.\n",
    "        country = os.path.splitext(file)[0]\n",
    "        print(country)\n",
    "\n",
    "        df = loader(file)\n",
    "\n",
    "        surnames = df[\"surname\"].value_counts().rename(\"count\")\n",
    "        print(\"len surnames : \", len(surnames))\n",
    "        surnames.to_csv(f'surnames/{country}_surnames.csv')\n",
    "\n",
    "        first_names = df[[\"first_name\", \"gender\"]].value_counts().rename(\"count\")\n",
    "        print(\"len first names : \", len(first_names))\n",
    "        first_names.to_csv(f'first_names/{country}_first_names.csv')\n",
    "\n",
    "        print(country, \" done\")\n",
    "        print(\"\")\n",
    "\n",
    "\n",
    "zip_files = glob(\"*.zip\")\n",
    "print(len(zip_files))\n",
    "export_name_counts(zip_files, get_zip_df)\n",
    "\n",
    "rar_files = glob(\"*.rar\")\n",
    "export_name_counts(rar_files, get_rar_df)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "base_data",
   "language": "python",
   "name": "base_data"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.1"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
 }
	{
	"cells": [
	{
	"cell_type": "code",
	"execution_count": null,
	"id": "ab34207b-cd0c-4750-b7a5-cc67e2515077",
	"metadata": {},
	"outputs": [],
	"source": [
	"import zipfile\n",
	"import rarfile\n",
	"import pandas as pd\n",
	"from glob import glob\n",
	"import io\n",
	"import re\n"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"id": "52f67be4-db9c-4674-9a66-8bd103d29db7",
	"metadata": {},
	"outputs": [],
	"source": [
	"\"\"\"\n",
	"This code didn't work for some countries, because of unusual formats, errors, or files too big to fit in memory.\n",
	"For those countries I took a custom approach that I won't detail here, but in general I manually unzipped the file and split it into chunks with the command line\n",
	"before processing them.\n",
	"Those countries were: Italy, Algeria, Morocco, Turkey, Sudan, Jordan, Palestine.\n",
	"Countries I didn't process, because I couldn't find a way or because they were too big: Saudi Arabia and Egypt.\n",
	"\"\"\"\n",
	"\n",
	"_NAME_COLUMNS = [\"first_name\", \"surname\", \"gender\"]\n",
	"\n",
	"\n",
	"def get_sep(file):\n",
	"    \"\"\"Guess the field separator of a file from its beginning.\n",
	"\n",
	"    The separator is sometimes a comma and sometimes a colon. The first 5000\n",
	"    items of `file` are converted with `str()`; a comma is returned only when\n",
	"    commas strictly outnumber colons, otherwise (ties included) a colon.\n",
	"    \"\"\"\n",
	"    sample = str(file[:5000])\n",
	"    n_commas, n_colons = (sample.count(mark) for mark in (\",\", \":\"))\n",
	"    return \",\" if n_commas > n_colons else \":\"\n",
	"\n",
	"\n",
	"def _is_directory(info):\n",
	"    \"\"\"Return True when an archive entry is a directory rather than a file.\"\"\"\n",
	"    # ZipInfo offers is_dir(); for rarfile the method name depends on the\n",
	"    # release (is_dir or isdir), so both spellings are looked up.\n",
	"    check = getattr(info, \"is_dir\", None) or getattr(info, \"isdir\", None)\n",
	"    if callable(check):\n",
	"        return bool(check())\n",
	"    return info.filename.endswith(\"/\")\n",
	"\n",
	"\n",
	"def _parse_member(raw):\n",
	"    \"\"\"Turn the raw bytes of one archive member into a names DataFrame.\n",
	"\n",
	"    Columns 2, 3 and 4 of the headerless file are kept and renamed to\n",
	"    first_name, surname and gender. The c engine is tried first; if it\n",
	"    raises, the same text is parsed again with the python engine.\n",
	"    \"\"\"\n",
	"    sep = get_sep(raw)\n",
	"    text = raw.decode(\"utf-8\")  # decoded once, reused by the fallback parse\n",
	"\n",
	"    def read_with(engine):\n",
	"        # Read one million rows at a time and stitch the chunks together.\n",
	"        reader = pd.read_table(\n",
	"            io.StringIO(text),\n",
	"            sep=sep,\n",
	"            header=None,\n",
	"            usecols=[2, 3, 4],\n",
	"            quoting=3,  # csv.QUOTE_NONE: quote characters are plain data\n",
	"            engine=engine,\n",
	"            dtype=pd.StringDtype(),\n",
	"            chunksize=1000000,\n",
	"        )\n",
	"        return pd.concat(list(reader))\n",
	"\n",
	"    try:\n",
	"        df = read_with(\"c\")\n",
	"    except Exception:  # not a bare except, so Ctrl-C still stops the run\n",
	"        # Sometimes the c engine fails; the python engine can sometimes do the trick.\n",
	"        print(\"switching to python engine\")\n",
	"        df = read_with(\"python\")\n",
	"\n",
	"    df.columns = list(_NAME_COLUMNS)\n",
	"    return df\n",
	"\n",
	"\n",
	"def _archive_to_df(archive, archive_name):\n",
	"    \"\"\"Parse every file member of an opened archive and concatenate the results.\n",
	"\n",
	"    Raises ValueError when the archive holds no file member at all.\n",
	"    \"\"\"\n",
	"    frames = []\n",
	"    for info in archive.infolist():\n",
	"        if _is_directory(info):\n",
	"            continue  # directory entries carry no data to parse\n",
	"        print(info.filename)\n",
	"        with archive.open(info.filename) as member:\n",
	"            frames.append(_parse_member(member.read()))\n",
	"    if not frames:\n",
	"        raise ValueError(f\"no files to read in archive {archive_name!r}\")\n",
	"    return pd.concat(frames)\n",
	"\n",
	"\n",
	"# The files are too big to unzip before loading for my hard drive, so they are read directly from the archive.\n",
	"def get_zip_df(zip_file):\n",
	"    \"\"\"Load first_name / surname / gender from every file inside a zip archive.\"\"\"\n",
	"    with zipfile.ZipFile(zip_file, 'r') as archive:\n",
	"        return _archive_to_df(archive, zip_file)\n",
	"\n",
	"\n",
	"def get_rar_df(zip_file):\n",
	"    \"\"\"Load first_name / surname / gender from every file inside a rar archive.\"\"\"\n",
	"    with rarfile.RarFile(zip_file, 'r') as archive:\n",
	"        return _archive_to_df(archive, zip_file)\n"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"id": "1bc71c2a-a70d-430f-a604-ef714f4a86ee",
	"metadata": {},
	"outputs": [],
	"source": [
	"def export_name_counts(archives, loader):\n",
	"    \"\"\"Write surname and first-name frequency tables for each archive.\n",
	"\n",
	"    archives -- archive file names; the name without its extension is used\n",
	"                as the country label in messages and output file names.\n",
	"    loader   -- function that turns one archive name into a DataFrame with\n",
	"                \"first_name\", \"surname\" and \"gender\" columns.\n",
	"\n",
	"    For every archive, surnames/<country>_surnames.csv and\n",
	"    first_names/<country>_first_names.csv are written.\n",
	"    \"\"\"\n",
	"    import os  # imported here because the notebook's import cell has no os\n",
	"\n",
	"    # to_csv does not create missing folders, so make sure both exist.\n",
	"    os.makedirs(\"surnames\", exist_ok=True)\n",
	"    os.makedirs(\"first_names\", exist_ok=True)\n",
	"\n",
	"    for n, file in enumerate(archives):\n",
	"        print(\"Country n°\", n)\n",
	"        # Strip only the trailing extension; str.replace would also remove\n",
	"        # an extension-like fragment from the middle of the name.\n",
	"        country = os.path.splitext(file)[0]\n",
	"        print(country)\n",
	"\n",
	"        df = loader(file)\n",
	"\n",
	"        surnames = df[\"surname\"].value_counts().rename(\"count\")\n",
	"        print(\"len surnames : \", len(surnames))\n",
	"        surnames.to_csv(f'surnames/{country}_surnames.csv')\n",
	"\n",
	"        first_names = df[[\"first_name\", \"gender\"]].value_counts().rename(\"count\")\n",
	"        print(\"len first names : \", len(first_names))\n",
	"        first_names.to_csv(f'first_names/{country}_first_names.csv')\n",
	"\n",
	"        print(country, \" done\")\n",
	"        print(\"\")\n",
	"\n",
	"\n",
	"zip_files = glob(\"*.zip\")\n",
	"print(len(zip_files))\n",
	"export_name_counts(zip_files, get_zip_df)\n",
	"\n",
	"rar_files = glob(\"*.rar\")\n",
	"export_name_counts(rar_files, get_rar_df)"
	]
	}
	],
	"metadata": {
	"kernelspec": {
	"display_name": "base_data",
	"language": "python",
	"name": "base_data"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 3
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython3",
	"version": "3.9.1"
	}
	},
	"nbformat": 4,
	"nbformat_minor": 5
	}