Last active
June 2, 2016 02:58
-
-
Save arfon/53c570d157e5b1ce166d68a87912e0ab to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
require 'digest'
require 'time'
# Raised when a PushEvent payload cannot be interpreted, i.e. it carries
# neither a 'shas' nor a 'commits' entry to scrub emails from.
class EventParseError < StandardError; end
# Transforms a raw GitHub event (a Hash, typically parsed from JSON) into the
# flat structure described by the GitHub Archive BigQuery schema, scrubbing
# commit author emails from PushEvents along the way.
#
# Typical use:
#
#   transform = EventTransform.new(event_hash)
#   transform.process
#   transform.parsed_event
class EventTransform
  attr_accessor :actor, :created_at, :raw_event, :id, :org, :other, :payload, :is_public, :repo, :type

  # Entries kept for the 'repo' field.
  # https://github.com/igrigorik/githubarchive.org/blob/c9ae11426e5bcc30fe15617d009dfc602697ecde/bigquery/schema.js#L17-L38
  REPO_ENTRIES = %w[id url name].freeze

  # Entries kept for the 'actor' and 'org' fields.
  # https://github.com/igrigorik/githubarchive.org/blob/c9ae11426e5bcc30fe15617d009dfc602697ecde/bigquery/schema.js#L39-L102
  USER_ENTRIES = %w[id login gravatar_id avatar_url url].freeze

  # event_json - the raw event as a Hash with String keys.
  def initialize(event_json)
    @raw_event = event_json
    @other = {}
  end

  # Populate the schema fields from the raw event, then sanitize them.
  # Must be called before #parsed_event returns anything useful.
  def process
    extract_and_set_fields
    sanitize
  end

  # Return the event hash as described in
  # https://github.com/igrigorik/githubarchive.org/blob/master/bigquery/schema.js
  def parsed_event
    {
      'actor' => actor,
      'created_at' => created_at,
      'id' => id,
      'org' => org,
      'other' => other,
      'payload' => payload,
      'public' => is_public,
      'repo' => repo,
      'type' => type
    }
  end

  # Scrub emails from push events. Could include further logic in future.
  def sanitize
    scrub_payload_emails if type == 'PushEvent'
  end

  # Extract the top-level schema fields from the raw event body and do any
  # necessary processing of the fields.
  def extract_and_set_fields
    @type = raw_event['type']
    @is_public = raw_event['public']
    @payload = raw_event['payload']
    @id = raw_event['id']
    @repo = parse_field('repo', REPO_ENTRIES)
    @actor = parse_field('actor', USER_ENTRIES)
    @org = parse_field('org', USER_ENTRIES)
    @created_at = parse_created_at
  end

  # Read a nested field from the raw event body and pick out the expected
  # entries. Any entries beyond the expected ones are collected under the
  # field's name in #other.
  #
  # The raw event and its nested hashes are only read, never modified, so the
  # Hash the caller handed to #initialize keeps its 'repo'/'actor'/'org' data.
  #
  # field_name       - top-level key to read from the raw event.
  # expected_entries - Array of keys to keep; missing ones map to nil.
  #
  # Returns a Hash with exactly the expected entries, or nil when the field is
  # absent (e.g. for anonymous Gists actor is nil).
  def parse_field(field_name, expected_entries)
    event_field = raw_event[field_name]
    return nil if event_field.nil?

    parsed = expected_entries.each_with_object({}) do |entry, picked|
      picked[entry] = event_field[entry]
    end

    # Are there extra fields? If so, throw them into the other key.
    extras = event_field.reject { |key, _value| expected_entries.include?(key) }
    @other[field_name] = extras unless extras.empty?

    parsed
  end

  # Returns the event's 'created_at' converted to UTC and formatted as
  # 'YYYY-MM-DD HH:MM:SS'. Relies on Time.parse from the stdlib 'time' library.
  def parse_created_at
    Time.parse(raw_event['created_at']).utc.strftime('%Y-%m-%d %T')
  end

  # Replace every commit author email in the payload with its sanitized form.
  # The payload is modified in place.
  #
  # Raises EventParseError when the payload is missing or carries neither a
  # 'shas' nor a 'commits' entry.
  def scrub_payload_emails
    raise EventParseError, 'PushEvent has no payload hash' unless payload.respond_to?(:key?)

    if payload.key?('shas')
      # Older format PushEvents have commits described as 'shas'. These
      # have format ['git sha', 'author email', 'commit message', 'author name']
      Array(payload['shas']).each do |commit|
        commit[1] = sanitize_email(commit[1])
      end
    elsif payload.key?('commits')
      # Newer PushEvents have a 'commits' key with nested attributes:
      # "commits": [
      #   {
      #     "sha": "5636aa2f6f249f22e76b20e5caeb84096b7302ce",
      #     "author" : {
      #       "email": "author@example.com",
      #       "name": "commiter_login"
      #     },
      #     "message": "Commit message",
      #     "distinct": true,
      #     "url": "API commit URL"
      #   }
      # ]
      Array(payload['commits']).each do |commit|
        author = commit['author']
        # A commit without an author hash has no email to scrub.
        next unless author.is_a?(Hash)

        author['email'] = sanitize_email(author['email'])
      end
    else
      raise EventParseError, "PushEvent payload has neither 'shas' nor 'commits'"
    end
  end

  # Hash the identifying part of an email address.
  #
  # A well-formed address keeps its domain and has the local part replaced by
  # its SHA1: 'jane@example.com' => '<sha1 of "jane">@example.com'. The split
  # happens at the LAST '@' so nothing after an earlier '@' is dropped.
  #
  # If the email doesn't look to be valid, the SHA1 of the whole (unstripped)
  # value is returned instead. This happens when:
  # - The email is empty (nil)
  # - The email doesn't include an '@' symbol
  # - The part before or after the last '@' is empty
  #
  # email - the value to sanitize; anything that is not a String is converted
  #         with #to_s first.
  #
  # Returns a String.
  def sanitize_email(email)
    raw = email.to_s
    # rpartition yields ['', '', str] when there is no '@', so a missing '@'
    # shows up as an empty local part.
    local, _separator, domain = raw.strip.rpartition('@')

    if local.empty? || domain.empty?
      Digest::SHA1.hexdigest(raw)
    else
      "#{Digest::SHA1.hexdigest(local)}@#{domain}"
    end
  end
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment