Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Save natematias/5fedbc060c75d2825e7af7a31adb83ff to your computer and use it in GitHub Desktop.
Save natematias/5fedbc060c75d2825e7af7a31adb83ff to your computer and use it in GitHub Desktop.
Wikipedia newcomer validation
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Merging Newcomer Dataframes with ORES scores for Newcomer Contributions\n",
"June 8, 2018 J. Nathan Matias\n",
"\n",
"Using data sources from http://paws-public.wmflabs.org/paws-public/User:Juliakamin/Querying%20new%20editors%20via%20sql.ipynb"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"%matplotlib inline \n",
"import os, time, datetime, csv, glob, math, datetime, pprint\n",
"from collections import defaultdict\n",
"import matplotlib.pyplot as plt # Matplotlib for plotting\n",
"import pandas as pd\n",
"from dateutil import parser\n",
"# Wiki language code; it prefixes every input and output file name below.\n",
"lang = 'pt'\n",
"newcomer_file = os.path.join(\"data\", lang+\"_newcomer_list.csv\")\n",
"# glob.glob() does not guarantee any ordering of its matches. Sorting makes the\n",
"# load order reproducible, which matters because duplicate revision ids are\n",
"# resolved by keeping whichever copy is read first.\n",
"newcomer_revisions_files = sorted(glob.glob(\n",
"    os.path.join('data', lang + \"_newcomer_revisions*.csv\")))"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"# Load the newcomer list into a dict keyed by the 'user id' column.\n",
"# Each value is the CSV row as a dict of strings, plus a 'wiki' field.\n",
"newcomers = {}\n",
"counter = 0  # rows read, including rows that repeat a user id\n",
"# newline=\"\" is what the csv module asks for on files it parses, and the\n",
"# explicit encoding avoids depending on the platform default.\n",
"# NOTE(review): assumes the CSV is UTF-8 (the pandas default) - confirm.\n",
"with open(newcomer_file, \"r\", newline=\"\", encoding=\"utf-8\") as f:\n",
"    # Stream rows from the file object rather than loading every line first.\n",
"    for newcomer in csv.DictReader(f):\n",
"        # Drop the unnamed column holding the pandas sequential index,\n",
"        # if the file was written with one.\n",
"        newcomer.pop('', None)\n",
"        newcomer['wiki'] = lang\n",
"        # A later row with the same user id replaces the earlier one.\n",
"        newcomers[newcomer['user id']] = newcomer\n",
"        counter += 1"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"data/pt_newcomer_revisions.csv\n"
]
}
],
"source": [
"# Collect revisions per user id from every revisions file, keeping only the\n",
"# first occurrence of each revision id.\n",
"newcomer_revisions = defaultdict(list)\n",
"all_ids = set()  # revision ids already stored, across all files\n",
"for filename in newcomer_revisions_files:\n",
"    # newline=\"\" is what the csv module asks for on files it parses, and the\n",
"    # explicit encoding avoids depending on the platform default.\n",
"    # NOTE(review): assumes the CSVs are UTF-8 (the pandas default) - confirm.\n",
"    with open(filename, \"r\", newline=\"\", encoding=\"utf-8\") as f:\n",
"        print(filename)\n",
"        # Stream rows from the file object rather than loading every line first.\n",
"        for revision in csv.DictReader(f):\n",
"            revision['wiki'] = lang\n",
"            revision_id = revision['revision id']\n",
"            # Drop the unnamed pandas index column, if present.\n",
"            revision.pop('', None)\n",
"            if revision_id in all_ids:\n",
"                continue  # duplicate of a revision that was already stored\n",
"            newcomer_revisions[revision['user id']].append(revision)\n",
"            all_ids.add(revision_id)\n",
"\n",
"# Order each user's revisions by their parsed 'revision time'.\n",
"for key, revisions in newcomer_revisions.items():\n",
"    newcomer_revisions[key] = sorted(\n",
"        revisions, key=lambda x: parser.parse(x['revision time']))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Data Validation\n",
"Here, we check whether every newcomer has at least one revision\n",
"and whether every revision belongs to a user in the newcomer list."
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"12337 total unique newcomers\n",
"1716 total newcomers with edit records in the dataset\n"
]
}
],
"source": [
"# Report how many distinct user ids each of the two loaded datasets contains.\n",
"unique_newcomer_total = len(set(newcomers))\n",
"print(\"{0} total unique newcomers\".format(unique_newcomer_total))\n",
"\n",
"newcomers_with_edits_total = len(newcomer_revisions)\n",
"print(\"{0} total newcomers with edit records in the dataset\".format(\n",
"    newcomers_with_edits_total))"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"#list(newcomers.values())[-1]['registration']"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"1716 newcomers in revision set\n",
"0 revisions not in newcomer set\n",
"1 newcomers not in revision set\n"
]
}
],
"source": [
"# Cross-check the two datasets by user id.\n",
"newcomer_ids = set(newcomers)\n",
"revision_user_ids = set(newcomer_revisions)\n",
"\n",
"# Users present in both the newcomer list and the revision records.\n",
"newcomers_in_revision_set = revision_user_ids & newcomer_ids\n",
"# Users that have revisions but are missing from the newcomer list.\n",
"revisions_not_in_newcomer_set = revision_user_ids - newcomer_ids\n",
"# Newcomers without any revision record. Each missing user id is recorded\n",
"# individually; adding a leftover loop variable here instead would collapse\n",
"# all of them into a single entry and under-report the count.\n",
"newcomers_not_in_revision_set = newcomer_ids - revision_user_ids\n",
"\n",
"print(\"{0} newcomers in revision set\".format(len(newcomers_in_revision_set)))\n",
"print(\"{0} revisions not in newcomer set\".format(len(revisions_not_in_newcomer_set)))\n",
"print(\"{0} newcomers not in revision set\".format(len(newcomers_not_in_revision_set)))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Create Revision Dataframe with Information on users"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"# Flatten the per-user revision lists into one list of rows, copying two\n",
"# fields from the matching newcomer record onto every revision.\n",
"all_revisions = []\n",
"for user_id, revisions in newcomer_revisions.items():\n",
"    newcomer = newcomers[user_id]\n",
"    # Remove the b' prefix and any quote characters before parsing the timestamp\n",
"    # (values presumably look like b'20171101000000' - confirm against the CSV).\n",
"    registration_text = newcomer['registration'].replace(\"b'\",\"\").replace(\"'\",\"\")\n",
"    registered_at = datetime.datetime.strptime(registration_text, \"%Y%m%d%H%M%S\")\n",
"    newcomer['registration.date'] = registered_at\n",
"    for revision in revisions:\n",
"        revision.update({\n",
"            'registration': registered_at,\n",
"            'edits.6.months': newcomer['edit count'],\n",
"        })\n",
"    all_revisions.extend(revisions)"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"# Write the merged revision rows to a per-language CSV in the data directory.\n",
"revisions_csv_path = os.path.join(\"data\", lang+\"_revisions_with_user_11.2017.csv\")\n",
"revisions_frame = pd.DataFrame(all_revisions)\n",
"revisions_frame.to_csv(revisions_csv_path)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Create User Dataframe with Summary Stats on Revisions"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"# NOTE(review): this loop only rebinds `revisions` for each newcomer that has\n",
"# revision records; no per-user summary statistics are computed yet.\n",
"for user_id, newcomer in newcomers.items():\n",
"    if user_id not in newcomer_revisions:\n",
"        continue\n",
"    revisions = newcomer_revisions[user_id]"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
"#revisions"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.4.2"
}
},
"nbformat": 4,
"nbformat_minor": 1
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment