Created October 22, 2021 18:01
Extracting first names and last names from the 2021 Facebook leak
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"id": "ab34207b-cd0c-4750-b7a5-cc67e2515077",
"metadata": {},
"outputs": [],
"source": [
"import zipfile\n",
"import rarfile\n",
"import pandas as pd\n",
"from glob import glob\n",
"import io\n",
"import re\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "52f67be4-db9c-4674-9a66-8bd103d29db7",
"metadata": {},
"outputs": [],
"source": [
"\"\"\"\n", | |
"This code didn't work for some countries, because of unusual format, errors or too big to fit in memory.\n", | |
"For those countries I took a custom approach that I won't detail here but in general I manually unziped and split the file in chunks with the command line \n", | |
"before processing them.\n", | |
"Those countries were : Italy, Algeria, Morocco, Turkey, Sudan, Jordan, Palestine.\n", | |
"Countries I didn't process because I could't find a way or were too big : Saudi Arabia and Egypt.\n", | |
"\"\"\"\n", | |
"\n", | |
"def get_sep(file): #The separator is sometimes a comma and sometimes a colon. This function finds the kind of separator of the file.\n", | |
"\n", | |
" head_a = str(file[:5000])\n", | |
" comma_count = head_a.count(\",\")\n", | |
" colon_count = head_a.count(\":\")\n", | |
" if comma_count > colon_count :\n", | |
" sep = \",\"\n", | |
" else:\n", | |
" sep=\":\"\n", | |
" return sep\n", | |
"\n", | |
"#The files are too big to unzip before loading for my hard drive so I load them directly from the zip file. \n", | |
"def get_zip_df(zip_file):\n", | |
" \n", | |
" df_list = []\n", | |
" \n", | |
" with zipfile.ZipFile(zip_file, 'r') as archive: \n", | |
" \n", | |
" files = archive.namelist() #listing all the files in the archive\n", | |
" \n", | |
" for file in files : \n", | |
" \n", | |
" print(file)\n", | |
" \n", | |
" with archive.open(file) as myfile:\n", | |
" a = myfile.read()\n", | |
" sep = get_sep(a)\n", | |
" \n", | |
" try:\n", | |
" chunks_list = [] #loading files in chunks to avoid crashing\n", | |
" \n", | |
" for chunk in pd.read_table(io.StringIO(a.decode(\"utf-8\")),sep=sep,header=None,usecols=[2,3,4], quoting=3,engine=\"c\",dtype=pd.StringDtype(),chunksize=1000000):\n", | |
" chunks_list.append(chunk)\n", | |
" \n", | |
" df = pd.concat(chunks_list)\n", | |
" \n", | |
" except:\n", | |
" chunks_list = []\n", | |
" print(\"switching to python engine\") #sometimes the c engine will crash, in that case the python engine can sometimes do the trick\n", | |
" \n", | |
" for chunk in pd.read_table(io.StringIO(a.decode(\"utf-8\")),sep=sep,header=None,usecols=[2,3,4], quoting=3,engine=\"python\",dtype=pd.StringDtype(),chunksize=1000000):\n", | |
" chunks_list.append(chunk)\n", | |
" \n", | |
" df = pd.concat(chunks_list)\n", | |
"\n", | |
" df.columns = [\"first_name\",\"surname\",\"gender\"]\n", | |
" df_list.append(df)\n", | |
" \n", | |
" df = pd.concat(df_list)\n", | |
" return df\n", | |
"\n", | |
"def get_rar_df(zip_file):\n", | |
" df_list = []\n", | |
" with rarfile.RarFile(zip_file, 'r') as archive: \n", | |
" files = archive.namelist()\n", | |
" print(files)\n", | |
" for file in files : \n", | |
" print(file)\n", | |
" with archive.open(file) as myfile:\n", | |
" a = myfile.read()\n", | |
" sep = get_sep(a)\n", | |
" try:\n", | |
" chunks_list = []\n", | |
" for chunk in pd.read_table(io.StringIO(a.decode(\"utf-8\")),sep=sep,header=None,usecols=[2,3,4], quoting=3,engine=\"c\",dtype=pd.StringDtype(),chunksize=1000000):\n", | |
" chunks_list.append(chunk)\n", | |
" df = pd.concat(chunks_list)\n", | |
" except:\n", | |
" chunks_list = []\n", | |
" print(\"switching to python engine\")\n", | |
" for chunk in pd.read_table(io.StringIO(a.decode(\"utf-8\")),sep=sep,header=None,usecols=[2,3,4], quoting=3,engine=\"python\",dtype=pd.StringDtype(),chunksize=1000000):\n", | |
" chunks_list.append(chunk)\n", | |
" df = pd.concat(chunks_list)\n", | |
"\n", | |
" df.columns = [\"first_name\",\"surname\",\"gender\"]\n", | |
" df_list.append(df)\n", | |
" df = pd.concat(df_list)\n", | |
" return df\n" | |
] | |
}, | |
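{
"cell_type": "markdown",
"id": "manual-split-note",
"metadata": {},
"source": [
"A minimal sketch of the manual fallback mentioned in the docstring above, assuming the oversized text file has already been extracted and split on the command line, e.g. `unzip Italy.zip && split -l 5000000 Italy.txt italy_chunk_`. The file names, the chunk size and the column layout are illustrative assumptions, and `count_surnames_in_chunks` is a hypothetical helper rather than part of the original workflow; it keeps a running `value_counts` total so only one chunk sits in memory at a time."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "manual-split-sketch",
"metadata": {},
"outputs": [],
"source": [
"def count_surnames_in_chunks(chunk_paths):\n",
"    # Running total of surname counts across chunk files (hypothetical helper, see the note above).\n",
"    surname_counts = pd.Series(dtype=\"int64\")\n",
"    for path in chunk_paths:\n",
"        # Reuse the separator detection on the first few KB of each chunk.\n",
"        with open(path, encoding=\"utf-8\", errors=\"replace\") as f:\n",
"            sep = get_sep(f.read(5000))\n",
"        chunk = pd.read_table(path, sep=sep, header=None, usecols=[2,3,4], quoting=3, dtype=pd.StringDtype())\n",
"        chunk.columns = [\"first_name\", \"surname\", \"gender\"]\n",
"        # Add this chunk's counts to the running total, then let the chunk be freed.\n",
"        surname_counts = surname_counts.add(chunk[\"surname\"].value_counts(), fill_value=0)\n",
"    return surname_counts.sort_values(ascending=False)\n",
"\n",
"# Example usage with hypothetical chunk names:\n",
"# count_surnames_in_chunks(sorted(glob(\"italy_chunk_*\"))).rename(\"count\").to_csv(\"surnames/Italy_surnames.csv\")"
]
},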
{
"cell_type": "code",
"execution_count": null,
"id": "1bc71c2a-a70d-430f-a604-ef714f4a86ee",
"metadata": {},
"outputs": [],
"source": [
"zip_files = glob(\"*.zip\")\n",
"print(len(zip_files))\n",
"\n",
"for n, file in enumerate(zip_files):\n",
"\n",
"    print(\"Country n°\", n)\n",
"    country = file.replace(\".zip\", \"\")\n",
"    print(country)\n",
"\n",
"    df = get_zip_df(file)\n",
"\n",
"    surnames = df[\"surname\"].value_counts().rename(\"count\")\n",
"    print(\"len surnames : \", len(surnames))\n",
"    surnames.to_csv(f'surnames/{country}_surnames.csv')\n",
"\n",
"    first_names = df[[\"first_name\", \"gender\"]].value_counts().rename(\"count\")\n",
"    print(\"len first names : \", len(first_names))\n",
"    first_names.to_csv(f'first_names/{country}_first_names.csv')\n",
"\n",
"    print(country, \" done\")\n",
"    print(\"\")\n",
"\n",
"rar_files = glob(\"*.rar\")\n",
"\n",
"for file in rar_files:\n",
"\n",
"    country = file.replace(\".rar\", \"\")\n",
"    print(country)\n",
"\n",
"    df = get_rar_df(file)\n",
"\n",
"    surnames = df[\"surname\"].value_counts().rename(\"count\")\n",
"    print(\"len surnames : \", len(surnames))\n",
"    surnames.to_csv(f'surnames/{country}_surnames.csv')\n",
"\n",
"    first_names = df[[\"first_name\", \"gender\"]].value_counts().rename(\"count\")\n",
"    print(\"len first names : \", len(first_names))\n",
"    first_names.to_csv(f'first_names/{country}_first_names.csv')\n",
"\n",
"    print(country, \" done\")\n",
"    print(\"\")"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "base_data",
"language": "python",
"name": "base_data"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.1"
}
},
"nbformat": 4,
"nbformat_minor": 5
}