Skip to content

Instantly share code, notes, and snippets.

@greeness
Created June 9, 2014 04:20
Show Gist options
  • Save greeness/610ec4e6de91a6939b24 to your computer and use it in GitHub Desktop.
Save greeness/610ec4e6de91a6939b24 to your computer and use it in GitHub Desktop.
step 2.
import json
from datetime import datetime
from copy import deepcopy
from random import random
def append_features_from_user_block(user_block, user_row):
    """Aggregate one user's turns into summary features stored on ``user_row``.

    ``user_block`` is a sequence of turn dicts, each providing the keys
    ``"balance"``, ``"level"``, ``"timestamp"`` (an object with a ``date()``
    method) and ``"machine"``.  ``user_row`` is updated in place with the
    keys ``num_turns``, ``min_balance``, ``max_balance``, ``min_level``,
    ``max_level``, ``delta_level``, ``num_days`` and ``num_machine_types``.
    Nothing is returned.

    An empty ``user_block`` raises ``ValueError`` from ``min()``.
    """
    user_row["num_turns"] = len(user_block)

    balances = [turn["balance"] for turn in user_block]
    user_row["min_balance"] = min(balances)
    user_row["max_balance"] = max(balances)

    levels = [turn["level"] for turn in user_block]
    user_row["min_level"] = min(levels)
    user_row["max_level"] = max(levels)
    # Number of levels gained across the observed turns.
    level_span = user_row["max_level"] - user_row["min_level"]
    user_row["delta_level"] = level_span

    # Distinct calendar dates on which at least one turn was recorded.
    played_dates = {turn["timestamp"].date() for turn in user_block}
    user_row["num_days"] = len(played_dates)

    machines_seen = {turn["machine"] for turn in user_block}
    user_row["num_machine_types"] = len(machines_seen)
def write_arff_header(users, arff):
    """Write the ARFF header (relation, attribute declarations, @DATA marker).

    ``arff`` is any object with a ``write(str)`` method.  ``users`` is not
    used by this function; it is kept so existing callers keep working.

    The attribute order written here must stay in step with the column
    order emitted by ``write_arff_data``; the commented-out declarations
    mirror the columns that are commented out there.
    """
    arff.write('@RELATION scopely_user_turns\n\n')
    arff.write('@ATTRIBUTE gender {M, W, ALL, UNKNOWN}\n')
    arff.write('@ATTRIBUTE device {Phone, Tablet}\n')
    arff.write('@ATTRIBUTE platform {i, a}\n')
    arff.write('@ATTRIBUTE age_range {13-24, 25-35, 36-52, 53-64}\n')
    #arff.write('@ATTRIBUTE install_recency NUMERIC\n')
    arff.write('@ATTRIBUTE delta_level NUMERIC\n')
    arff.write('@ATTRIBUTE min_level NUMERIC\n')
    arff.write('@ATTRIBUTE max_level NUMERIC\n')
    #arff.write('@ATTRIBUTE min_balance NUMERIC\n')
    #arff.write('@ATTRIBUTE max_balance NUMERIC\n')
    arff.write('@ATTRIBUTE num_played_days NUMERIC\n')
    arff.write('@ATTRIBUTE num_machine_types NUMERIC\n')
    arff.write('@ATTRIBUTE num_turns NUMERIC\n')
    # Nominal values are comma-separated, like the other nominal attributes
    # above; the previous "{0 1}" form is read by strict ARFF parsers as a
    # single value "0 1".
    arff.write('@ATTRIBUTE is_payer {0, 1}\n\n')
    arff.write('@DATA\n')
def write_arff_data(user_row, arff):
    """Write one user as a single comma-separated ARFF data line.

    ``user_row`` is a mapping holding the profile fields plus the
    aggregated turn features; ``arff`` is any object with a ``write(str)``
    method.  A missing key raises ``KeyError``.  Nothing is returned.
    """
    # Emitted exactly as stored (no str() conversion), so these values must
    # already be strings for the join below to succeed.
    raw_columns = ("gender", "device", "platform", "age")
    # Converted with str() before joining.  "install", "min_balance" and
    # "max_balance" are intentionally not emitted.
    converted_columns = (
        "delta_level",
        "min_level",
        "max_level",
        "num_days",
        "num_machine_types",
        "num_turns",
        "is_payer",
    )

    fields = [user_row[key] for key in raw_columns]
    fields.extend(str(user_row[key]) for key in converted_columns)
    arff.write(','.join(fields) + '\n')
def _emit_user_block(users, user_id, user_block, arff, down_sample):
    """Write one user's aggregated turns to ``arff`` if the user is selected.

    Users that are absent from ``users`` (or that have no turns) are
    skipped.  Users with ``is_payer == 1`` are always written; other users
    are always written when ``down_sample`` is false, and with probability
    0.05 when it is true.

    A user whose record cannot be turned into a row is reported and
    skipped, so one bad record does not stop the rest of the run.
    """
    if not user_block or user_id not in users:
        return
    try:
        # Work on a copy so derived features never leak back into ``users``.
        user_row = deepcopy(users[user_id])
        if user_row["is_payer"] != 1 and down_sample and random() >= 0.05:
            return
        append_features_from_user_block(user_block, user_row)
        write_arff_data(user_row, arff)
    except (KeyError, TypeError, ValueError) as err:
        print('skipping user %s: %r' % (user_id, err))


with open('users.json') as users_file:
    users = json.load(users_file)
print('number of unique users: ' + str(len(users)))

last_user_id = ''
user_block = []
# Only used as the denominator of the progress percentage printed below.
total_users = 83434
num_blocks = 0
down_sampling_of_negative = True

if down_sampling_of_negative:
    arff_path = 'users_downsample.arff'
else:
    arff_path = 'users_all.arff'

# ``with`` guarantees both files are closed even if processing fails.
with open(arff_path, 'w') as arff:
    write_arff_header(users, arff)

    with open('turns_uniq.csv') as turns_file:
        # Rows for one user are grouped by watching for a change of user_id,
        # which relies on each user's rows being contiguous in the file.
        for line in turns_file:
            try:
                user_id, timestamp, machine, level, balance = \
                    line.strip().split(',')
                timestamp = datetime.strptime(timestamp, "%Y-%m-%d %H:%M:%S")
                level = int(level)
                balance = int(balance)
            except ValueError:
                # Malformed row (wrong column count, bad timestamp or
                # non-integer field): show it and move on.
                print(line)
                continue

            if last_user_id != '' and user_id != last_user_id:
                # The previous user's rows are complete; flush them.
                _emit_user_block(users, last_user_id, user_block, arff,
                                 down_sampling_of_negative)
                user_block = []
                num_blocks += 1
                if num_blocks % 1000 == 0:
                    print(' '.join(["finished users ", str(num_blocks), " (",
                                    str(num_blocks * 100.0 / total_users),
                                    '%)']))

            user_block.append({"timestamp": timestamp,
                               "level": level,
                               "balance": balance,
                               "machine": machine,
                               "id": user_id})
            last_user_id = user_id

    # process the last block, using the same selection rule and the same
    # row-building path as every other user
    _emit_user_block(users, last_user_id, user_block, arff,
                     down_sampling_of_negative)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment