Last active
June 21, 2018 16:52
-
-
Save ottomata/6b0af13856383acb55214c5ddaef80e3 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# See also: https://phabricator.wikimedia.org/T184479#4168727 | |
# $ pyspark2 --master yarn | |
# Raw JSON event data is imported into paths like this. | |
# This will select all data for June. | |
data_path = "/wmf/data/raw/event/eqiad_mediawiki_revision-score/hourly/2018/06/*/*" | |
# The raw form is a Hadoop sequence file. Read the JSON string out of the tuple. | |
json_lines = sc.sequenceFile(data_path).map(lambda x: x[1]) | |
# Parse the JSON strings into a Spark DataFrame. | |
revision_score_events = spark.read.json(json_lines) | |
import pyspark.sql.functions.explode as explode | |
# Select the revision ids that are predicted to be goodfaith edits. | |
goodfaith_scores = revision_score_events.select( | |
'rev_id', | |
# We need to use pyspark.sql.functions.explode in order to pull the | |
# scores array into a struct object to make it easier to query. | |
# If this were a better (Hive compatible) schema, we'd be able to just do something like | |
# WHERE scores.goodfaith IS NOT NULL and scores.goodfaith.prediction = 'true' | |
sql_functions.explode(revision_score_events.scores).alias("scores") | |
).where("scores.model_name = 'goodfaith' and scores.prediction = 'true'") | |
# Now that we've got all of the goodfaith rev_ids for June, let's join | |
# it with revision-score events and find the users who made the most | |
# goodfaith edits in June so far. | |
# We can use spark.table because revision-create events are Hive compatible, | |
# and we can import them into Hive. If revision-scores were Hive/SQL compatible, | |
# we'd be able to all of this in just SQL, with no coding necessary. | |
revisions = spark.table("event.mediawiki_revision_create").where("year=2018 and month=6") | |
# Join the goodfaith rev_ids with the revisions. | |
goodfaith_revisions = revisions.join(goodfaith_scores, goodfaith_scores.rev_id == revisions.rev_id) | |
# How many good faith revisions were there in July? | |
goodfaith_revisions.count() | |
5776154 | |
# ^ was run on June 21, so not a full month. | |
# Group the goodfaith revisions by user_text and count and sort descending. | |
goodfaith_user_revision_count = goodfaith_revisions.groupBy("performer.user_text").count().orderBy("count", ascending=False) | |
goodfaith_user_edit_counts = goodfaith_user_revision_count.orderBy("count", ascending=False) | |
# .show() prints the first few results. | |
goodfaith_user_edit_counts.show() | |
+------------------+------+ | |
| user_text| count| | |
+------------------+------+ | |
| Ghuron|191642| | |
| Benoît Prieur|142668| | |
| Harmonia Amanda|111326| | |
|InternetArchiveBot|108756| | |
| Beat Estermann|105101| | |
| &beer&love| 89816| | |
| Liridon| 77822| | |
| Renamerr| 71272| | |
| JarBot| 59597| | |
| Vojtěch Dostál| 59158| | |
| Jason.nlw| 52214| | |
| Thierry Caro| 48220| | |
| Nvrandow| 43955| | |
| Olaf Kosinsky| 43912| | |
| Edoderoo| 41092| | |
| Sic19| 34654| | |
| KolbertBot| 32324| | |
| Richard Nevell| 32080| | |
| Dexbot| 27787| | |
| BOT-Superzerocool| 27538| | |
+------------------+------+ | |
only showing top 20 rows | |
# This is only joining revision-score events with revision-create events. | |
# If we keep historical revision-scores for a long time, and/or | |
# rescore and reimport all scores for all revision history, queryies | |
# like this can be joined against the mediawiki_history table, | |
# which will allow for complex analysis historical analysis. | |
# We could even incorporate the ORES revision scores into the | |
# mediawiki_history table itself, storing every score alongside of | |
# evern revision. | |
# | |
# We eventually plan to have the revision text content in Hadoop as | |
# well, which will allow us to join the scores with the actual | |
# revision diffs. | |
# If revision-score was Hive/SQL compatible, we could do all of the above in a single | |
# SQL query, something like: | |
""" | |
SELECT | |
revision_create.user_text, count(*) as cnt | |
FROM | |
revision_score, revision_create | |
WHERE | |
revision_score.goodfaith.prediction = 'true' AND | |
revision_score.rev_id = revision_create.rev_id | |
GROUP BY revision_create.user_text | |
ORDER BY cnt DESC | |
""" | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment