Created
June 8, 2018 23:51
-
-
Save natematias/5fedbc060c75d2825e7af7a31adb83ff to your computer and use it in GitHub Desktop.
Wikipedia newcomer validation
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| { | |
| "cells": [ | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "# Merging Newcomer Dataframes with ORES scores for Newcomer Contributions\n", | |
| "June 8, 2018 J. Nathan Matias\n", | |
| "\n", | |
| "Using data sources from http://paws-public.wmflabs.org/paws-public/User:Juliakamin/Querying%20new%20editors%20via%20sql.ipynb" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 1, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "%matplotlib inline \n", | |
| "import os, time, datetime, csv, glob, math, datetime, pprint\n", | |
| "from collections import defaultdict\n", | |
| "import matplotlib.pyplot as plt # Matplotlib for plotting\n", | |
| "import pandas as pd\n", | |
| "from dateutil import parser\n", | |
| "lang = 'pt'\n", | |
| "newcomer_file = os.path.join(\"data\", lang+\"_newcomer_list.csv\")\n", | |
| "newcomer_revisions_files = glob.glob(\n", | |
| " os.path.join('data', lang + \"_newcomer_revisions*.csv\"))" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 2, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "newcomers = {}\n", | |
| "counter = 0\n", | |
| "with open(newcomer_file, \"r\") as f:\n", | |
| " for newcomer in csv.DictReader(f.readlines()):\n", | |
| " ## REMOVE OUT THE PANDAS SEQUENTIAL INDEX\n", | |
| " ## IF IT EXISTS\n", | |
| " if '' in newcomer.keys():\n", | |
| " del newcomer['']\n", | |
| " newcomer['wiki'] = lang\n", | |
| " newcomers[newcomer['user id']] = newcomer\n", | |
| " counter += 1" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 3, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "data/pt_newcomer_revisions.csv\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "newcomer_revisions = defaultdict(list)\n", | |
| "all_ids = set()\n", | |
| "for filename in newcomer_revisions_files:\n", | |
| " with open(filename, \"r\") as f:\n", | |
| " print(filename)\n", | |
| " for revision in csv.DictReader(f.readlines()):\n", | |
| " revision['wiki'] = lang\n", | |
| " revision_id = revision['revision id']\n", | |
| " if('' in revision.keys()):\n", | |
| " del revision['']\n", | |
| " if(revision_id not in all_ids):\n", | |
| " newcomer_revisions[revision['user id']].append(revision)\n", | |
| " all_ids.add(revision_id)\n", | |
| " \n", | |
| "for key, revisions in newcomer_revisions.items():\n", | |
| " newcomer_revisions[key] = sorted(revisions, \n", | |
| " key=lambda x:parser.parse(x['revision time']))" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "### Data Validation\n", | |
| "Here, we check whether every newcomer has at least one revision\n", | |
| "and whether any revisions belong to users who are not in the newcomer list." | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 4, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "12337 total unique newcomers\n", | |
| "1716 total newcomers with edit records in the dataset\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "print(\"{0} total unique newcomers\".format(len(set([x for x in newcomers.keys()]))))\n", | |
| "print(\"{0} total newcomers with edit records in the dataset\".format(\n", | |
| " len(newcomer_revisions.keys())))" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 5, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "#list(newcomers.values())[-1]['registration']" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 6, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "1716 newcomers in revision set\n", | |
| "0 revisions not in newcomer set\n", | |
| "1 newcomers not in revision set\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "newcomers_in_revision_set = set()\n", | |
| "revisions_not_in_newcomer_set = set()\n", | |
| "newcomers_not_in_revision_set = set()\n", | |
| "for user_id, revisions in newcomer_revisions.items():\n", | |
| " if user_id in newcomers.keys():\n", | |
| " newcomers_in_revision_set.add(user_id)\n", | |
| " else:\n", | |
| " revisions_not_in_newcomer_set.add(user_id)\n", | |
| "\n", | |
| "for user_id in newcomers.keys():\n", | |
| " if user_id not in newcomer_revisions.keys():\n", | |
| " newcomers_not_in_revision_set.add(key)\n", | |
| " \n", | |
| "print(\"{0} newcomers in revision set\".format(len(newcomers_in_revision_set)))\n", | |
| "print(\"{0} revisions not in newcomer set\".format(len(revisions_not_in_newcomer_set)))\n", | |
| "print(\"{0} newcomers not in revision set\".format(len(newcomers_not_in_revision_set)))" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "### Create Revision Dataframe with Information on users" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 7, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "all_revisions = []\n", | |
| "for user_id, revisions in newcomer_revisions.items():\n", | |
| " newcomer = newcomers[user_id]\n", | |
| " newcomer['registration.date'] = datetime.datetime.strptime(\n", | |
| " newcomer['registration'].replace(\"b'\",\"\").replace(\"'\",\"\"), \n", | |
| " \"%Y%m%d%H%M%S\")\n", | |
| " for revision in revisions:\n", | |
| " revision['registration'] = newcomer['registration.date']\n", | |
| " revision['edits.6.months'] = newcomer['edit count']\n", | |
| " all_revisions.append(revision)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 8, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "pd.DataFrame(all_revisions).to_csv(os.path.join(\n", | |
| " \"data\", lang+\"_revisions_with_user_11.2017.csv\"))" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "### Create User Dataframe with Summary Stats on Revisions" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 9, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "for user_id, newcomer in newcomers.items():\n", | |
| " if user_id in newcomer_revisions:\n", | |
| " revisions = newcomer_revisions[user_id]" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 11, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "#revisions" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": { | |
| "collapsed": true | |
| }, | |
| "outputs": [], | |
| "source": [] | |
| } | |
| ], | |
| "metadata": { | |
| "kernelspec": { | |
| "display_name": "Python 3", | |
| "language": "python", | |
| "name": "python3" | |
| }, | |
| "language_info": { | |
| "codemirror_mode": { | |
| "name": "ipython", | |
| "version": 3 | |
| }, | |
| "file_extension": ".py", | |
| "mimetype": "text/x-python", | |
| "name": "python", | |
| "nbconvert_exporter": "python", | |
| "pygments_lexer": "ipython3", | |
| "version": "3.4.2" | |
| } | |
| }, | |
| "nbformat": 4, | |
| "nbformat_minor": 1 | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment