# See also: https://phabricator.wikimedia.org/T184479#4168727
# $ pyspark2 --master yarn
# Raw JSON event data is imported into paths like this.
# This will select all data for June.
data_path = "/wmf/data/raw/event/eqiad_mediawiki_revision-score/hourly/2018/06/*/*"
# The raw form is a Hadoop sequence file. Read the JSON string out of the tuple.
json_lines = sc.sequenceFile(data_path).map(lambda x: x[1])
# Parse the JSON strings into a Spark DataFrame.
revision_score_events = spark.read.json(json_lines)
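# As a quick optional sanity check, print the inferred schema to see the fields,
# including the scores array, that we'll query below.
revision_score_events.printSchema()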
# Import the Spark SQL functions module (explode is used below) under the alias used later.
from pyspark.sql import functions as sql_functions
# Select the revision ids that are predicted to be goodfaith edits.
goodfaith_scores = revision_score_events.select(
    'rev_id',
    # We need pyspark.sql.functions.explode to turn the scores array into one
    # row per score struct, which makes it easier to query.
    # If this were a better (Hive compatible) schema, we'd be able to just do something like
    # WHERE scores.goodfaith IS NOT NULL AND scores.goodfaith.prediction = 'true'
    sql_functions.explode(revision_score_events.scores).alias("scores")
).where("scores.model_name = 'goodfaith' and scores.prediction = 'true'")
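# As a sketch, the same selection could also be written in Spark SQL using
# LATERAL VIEW explode over a temporary view (the view name and the
# goodfaith_scores_sql variable are just illustrative):
revision_score_events.createOrReplaceTempView("revision_score_events")
goodfaith_scores_sql = spark.sql("""
    SELECT rev_id, score AS scores
    FROM revision_score_events
    LATERAL VIEW explode(scores) exploded AS score
    WHERE score.model_name = 'goodfaith' AND score.prediction = 'true'
""")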
# Now that we've got all of the goodfaith rev_ids for June, let's join
# them with revision-create events and find the users who made the most
# goodfaith edits in June so far.
# We can use spark.table because revision-create events are Hive compatible,
# and we can import them into Hive. If revision-scores were Hive/SQL compatible,
# we'd be able to do all of this in just SQL, with no coding necessary.
revisions = spark.table("event.mediawiki_revision_create").where("year=2018 and month=6")
# Join the goodfaith rev_ids with the revisions.
goodfaith_revisions = revisions.join(goodfaith_scores, goodfaith_scores.rev_id == revisions.rev_id)
# How many good faith revisions were there in June?
goodfaith_revisions.count()
5776154
# ^ was run on June 21, so not a full month.
# Group the goodfaith revisions by user_text, count, and sort descending.
goodfaith_user_revision_count = goodfaith_revisions.groupBy("performer.user_text").count()
goodfaith_user_edit_counts = goodfaith_user_revision_count.orderBy("count", ascending=False)
# .show() prints the first few results.
goodfaith_user_edit_counts.show()
+------------------+------+
| user_text| count|
+------------------+------+
| Ghuron|191642|
| Benoît Prieur|142668|
| Harmonia Amanda|111326|
|InternetArchiveBot|108756|
| Beat Estermann|105101|
| &beer&love| 89816|
| Liridon| 77822|
| Renamerr| 71272|
| JarBot| 59597|
| Vojtěch Dostál| 59158|
| Jason.nlw| 52214|
| Thierry Caro| 48220|
| Nvrandow| 43955|
| Olaf Kosinsky| 43912|
| Edoderoo| 41092|
| Sic19| 34654|
| KolbertBot| 32324|
| Richard Nevell| 32080|
| Dexbot| 27787|
| BOT-Superzerocool| 27538|
+------------------+------+
only showing top 20 rows
# This is only joining revision-score events with revision-create events.
# If we keep historical revision-scores for a long time, and/or
# rescore and reimport all scores for all revision history, queries
# like this can be joined against the mediawiki_history table
# (see the sketch below), which will allow for complex historical analysis.
# We could even incorporate the ORES revision scores into the
# mediawiki_history table itself, storing every score alongside
# every revision.
#
# We eventually plan to have the revision text content in Hadoop as
# well, which will allow us to join the scores with the actual
# revision diffs.
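# A rough sketch of such a join against mediawiki_history; the snapshot value and
# field names (revision_id, event_user_text, event_entity, event_type) are
# assumptions about the Data Lake table, not verified here:
mediawiki_history = spark.table("wmf.mediawiki_history").where(
    "snapshot = '2018-05' AND event_entity = 'revision' AND event_type = 'create'"
)
goodfaith_history = mediawiki_history.join(
    goodfaith_scores, goodfaith_scores.rev_id == mediawiki_history.revision_id
)
goodfaith_history.groupBy("event_user_text").count().orderBy("count", ascending=False).show()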
# If revision-score were Hive/SQL compatible, we could do all of the above in a single
# SQL query, something like:
"""
SELECT
  revision_create.user_text, count(*) AS cnt
FROM
  revision_score, revision_create
WHERE
  revision_score.goodfaith.prediction = 'true' AND
  revision_score.rev_id = revision_create.rev_id
GROUP BY revision_create.user_text
ORDER BY cnt DESC
"""