Skip to content

Instantly share code, notes, and snippets.

@dlmrr
Created October 22, 2021 18:01
Show Gist options
  • Save dlmrr/f445c5237aac70c46151d5fa46f58c9c to your computer and use it in GitHub Desktop.
Save dlmrr/f445c5237aac70c46151d5fa46f58c9c to your computer and use it in GitHub Desktop.
extracting first names and last names for the 2021 facebook leak
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"id": "ab34207b-cd0c-4750-b7a5-cc67e2515077",
"metadata": {},
"outputs": [],
"source": [
"import zipfile\n",
"import rarfile\n",
"import pandas as pd\n",
"from glob import glob\n",
"import io\n",
"import re\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "52f67be4-db9c-4674-9a66-8bd103d29db7",
"metadata": {},
"outputs": [],
"source": [
"\"\"\"\n",
"This code didn't work for some countries because of unusual formats, errors, or files too big to fit in memory.\n",
"For those countries I took a custom approach that I won't detail here, but in general I manually unzipped the file and split it into chunks with the command line\n",
"before processing them.\n",
"Those countries were: Italy, Algeria, Morocco, Turkey, Sudan, Jordan, Palestine.\n",
"Countries I didn't process, because I couldn't find a way or they were too big: Saudi Arabia and Egypt.\n",
"\"\"\"\n",
"\n",
"def get_sep(file): #The separator is sometimes a comma and sometimes a colon. This function finds the kind of separator of the file.\n",
"\n",
" head_a = str(file[:5000])\n",
" comma_count = head_a.count(\",\")\n",
" colon_count = head_a.count(\":\")\n",
" if comma_count > colon_count :\n",
" sep = \",\"\n",
" else:\n",
" sep=\":\"\n",
" return sep\n",
"\n",
"# The files are too big for my hard drive to unzip before loading, so I load them directly from the zip file.\n",
"def get_zip_df(zip_file):\n",
" \n",
" df_list = []\n",
" \n",
" with zipfile.ZipFile(zip_file, 'r') as archive: \n",
" \n",
" files = archive.namelist() #listing all the files in the archive\n",
" \n",
" for file in files : \n",
" \n",
" print(file)\n",
" \n",
" with archive.open(file) as myfile:\n",
" a = myfile.read()\n",
" sep = get_sep(a)\n",
" \n",
" try:\n",
" chunks_list = [] #loading files in chunks to avoid crashing\n",
" \n",
" for chunk in pd.read_table(io.StringIO(a.decode(\"utf-8\")),sep=sep,header=None,usecols=[2,3,4], quoting=3,engine=\"c\",dtype=pd.StringDtype(),chunksize=1000000):\n",
" chunks_list.append(chunk)\n",
" \n",
" df = pd.concat(chunks_list)\n",
" \n",
" except:\n",
" chunks_list = []\n",
" print(\"switching to python engine\") #sometimes the c engine will crash, in that case the python engine can sometimes do the trick\n",
" \n",
" for chunk in pd.read_table(io.StringIO(a.decode(\"utf-8\")),sep=sep,header=None,usecols=[2,3,4], quoting=3,engine=\"python\",dtype=pd.StringDtype(),chunksize=1000000):\n",
" chunks_list.append(chunk)\n",
" \n",
" df = pd.concat(chunks_list)\n",
"\n",
" df.columns = [\"first_name\",\"surname\",\"gender\"]\n",
" df_list.append(df)\n",
" \n",
" df = pd.concat(df_list)\n",
" return df\n",
"\n",
"def get_rar_df(zip_file):\n",
" df_list = []\n",
" with rarfile.RarFile(zip_file, 'r') as archive: \n",
" files = archive.namelist()\n",
" print(files)\n",
" for file in files : \n",
" print(file)\n",
" with archive.open(file) as myfile:\n",
" a = myfile.read()\n",
" sep = get_sep(a)\n",
" try:\n",
" chunks_list = []\n",
" for chunk in pd.read_table(io.StringIO(a.decode(\"utf-8\")),sep=sep,header=None,usecols=[2,3,4], quoting=3,engine=\"c\",dtype=pd.StringDtype(),chunksize=1000000):\n",
" chunks_list.append(chunk)\n",
" df = pd.concat(chunks_list)\n",
" except:\n",
" chunks_list = []\n",
" print(\"switching to python engine\")\n",
" for chunk in pd.read_table(io.StringIO(a.decode(\"utf-8\")),sep=sep,header=None,usecols=[2,3,4], quoting=3,engine=\"python\",dtype=pd.StringDtype(),chunksize=1000000):\n",
" chunks_list.append(chunk)\n",
" df = pd.concat(chunks_list)\n",
"\n",
" df.columns = [\"first_name\",\"surname\",\"gender\"]\n",
" df_list.append(df)\n",
" df = pd.concat(df_list)\n",
" return df\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "1bc71c2a-a70d-430f-a604-ef714f4a86ee",
"metadata": {},
"outputs": [],
"source": [
"zip_files = glob(\"*.zip\")\n",
"print(len(zip_files))\n",
"\n",
"for n,file in enumerate(zip_files):\n",
" \n",
" print(\"Country n°\", n)\n",
" country = file.replace(\".zip\",\"\")\n",
" print(country)\n",
" \n",
" df = get_zip_df(file)\n",
" \n",
" surnames = df[\"surname\"].value_counts().rename(\"count\")\n",
" print(\"len surnames : \", len(surnames))\n",
" surnames.to_csv(f'surnames/{country}_surnames.csv')\n",
" \n",
" first_names = df[[\"first_name\",\"gender\"]].value_counts().rename(\"count\")\n",
" print(\"len first names : \", len(first_names))\n",
" first_names.to_csv(f'first_names/{country}_first_names.csv')\n",
" \n",
" \n",
" print(country, \" done\")\n",
" print(\"\")\n",
"\n",
"rar_files = glob(\"*.rar\")\n",
"\n",
"for file in rar_files:\n",
" \n",
" country = file.replace(\".rar\",\"\")\n",
" print(country)\n",
" \n",
" df = get_rar_df(file)\n",
" \n",
" surnames = df[\"surname\"].value_counts().rename(\"count\")\n",
" print(\"len surnames : \", len(surnames))\n",
" surnames.to_csv(f'surnames/{country}_surnames.csv')\n",
" \n",
" first_names = df[[\"first_name\",\"gender\"]].value_counts().rename(\"count\")\n",
" print(\"len first names : \", len(first_names))\n",
" first_names.to_csv(f'first_names/{country}_first_names.csv')\n",
" \n",
" \n",
" print(country, \" done\")\n",
" print(\"\")"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "base_data",
"language": "python",
"name": "base_data"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.1"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment