Created
June 9, 2014 04:20
-
-
Save greeness/610ec4e6de91a6939b24 to your computer and use it in GitHub Desktop.
step 2.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import json | |
from datetime import datetime | |
from copy import deepcopy | |
from random import random | |
def append_features_from_user_block(user_block, user_row):
    """Summarise one user's turns and store the aggregates in ``user_row``.

    ``user_block`` is a sequence of per-turn dicts.  Each turn must provide
    the keys ``"balance"``, ``"level"``, ``"machine"`` and ``"timestamp"``
    (an object exposing ``date()``).

    ``user_row`` is updated in place with the keys ``num_turns``,
    ``min_balance``, ``max_balance``, ``min_level``, ``max_level``,
    ``delta_level``, ``num_days`` and ``num_machine_types``.  Nothing is
    returned.

    Raises ``ValueError`` when ``user_block`` is empty, because ``min`` and
    ``max`` have nothing to reduce.
    """
    # Pull each column out once; min and max then share the same list.
    balances = [turn["balance"] for turn in user_block]
    levels = [turn["level"] for turn in user_block]

    user_row["num_turns"] = len(user_block)
    user_row["min_balance"] = min(balances)
    user_row["max_balance"] = max(balances)
    user_row["min_level"] = min(levels)
    user_row["max_level"] = max(levels)
    user_row["delta_level"] = user_row["max_level"] - user_row["min_level"]
    # Distinct calendar dates on which at least one turn was recorded.
    user_row["num_days"] = len({turn["timestamp"].date() for turn in user_block})
    user_row["num_machine_types"] = len({turn["machine"] for turn in user_block})
def write_arff_header(users, arff):
    """Write the ARFF relation name, attribute declarations and ``@DATA`` tag.

    ``users`` is accepted for call compatibility but is not read.
    ``arff`` is any object with a ``write(str)`` method.

    The attribute order here must match the column order produced by
    ``write_arff_data``.
    """
    declarations = [
        'gender {M, W, ALL, UNKNOWN}',
        'device {Phone, Tablet}',
        'platform {i, a}',
        'age_range {13-24, 25-35, 36-52, 53-64}',
        # 'install_recency NUMERIC',   (disabled feature)
        'delta_level NUMERIC',
        'min_level NUMERIC',
        'max_level NUMERIC',
        # 'min_balance NUMERIC',       (disabled feature)
        # 'max_balance NUMERIC',       (disabled feature)
        'num_played_days NUMERIC',
        'num_machine_types NUMERIC',
        'num_turns NUMERIC',
        # Nominal values are comma-separated in ARFF; the original "{0 1}"
        # lacked the comma and disagreed with the declarations above.
        'is_payer {0, 1}',
    ]

    header = '@RELATION scopely_user_turns\n\n'
    header += ''.join('@ATTRIBUTE %s\n' % decl for decl in declarations)
    # One blank line separates the attribute section from the data section.
    header += '\n@DATA\n'
    arff.write(header)
def write_arff_data(user_row, arff):
    """Write one comma-separated ARFF data line for ``user_row``.

    ``user_row`` is a mapping that must contain every key listed in
    ``columns`` below; a missing key raises ``KeyError``.  ``arff`` is any
    object with a ``write(str)`` method.

    The column order must match the attribute order declared by
    ``write_arff_header``.
    """
    columns = (
        "gender",
        "device",
        "platform",
        "age",
        # "install",       (disabled feature)
        "delta_level",
        "min_level",
        "max_level",
        # "min_balance",   (disabled feature)
        # "max_balance",   (disabled feature)
        "num_days",
        "num_machine_types",
        "num_turns",
        "is_payer",
    )
    # Every field goes through str(): the original joined the four nominal
    # fields raw, which raised TypeError for any non-string value.
    arff.write(','.join(str(user_row[name]) for name in columns) + '\n')
# ---------------------------------------------------------------------------
# Script body: stream turns_uniq.csv, aggregate consecutive rows that share a
# user id, and write one ARFF data line per selected user.  Rows for one user
# must be contiguous in the CSV, because a change of user id is what closes
# the current block.
# ---------------------------------------------------------------------------


def _write_user_block(user_id, user_block, users, arff, down_sample):
    """Emit the ARFF line for one finished user block, if it is selected.

    Does nothing when the block is empty or ``user_id`` is not in ``users``.
    Payers (``is_payer == 1``) are always written.  Non-payers are written
    unconditionally when ``down_sample`` is false, otherwise with
    probability 0.05.
    """
    if not user_block or user_id not in users:
        return
    # Work on a copy so the derived features never leak into ``users``.
    user_row = deepcopy(users[user_id])
    if user_row["is_payer"] != 1 and down_sample and random() >= 0.05:
        return
    try:
        append_features_from_user_block(user_block, user_row)
        write_arff_data(user_row, arff)
    except (KeyError, TypeError) as err:
        # Stay best-effort like the original catch-all, but skip only this
        # user instead of stalling every following line.
        print('skipping user %s: %r' % (user_id, err))


with open('users.json') as users_file:
    users = json.load(users_file)
print('number of unique users: %d' % len(users))

last_user_id = ''
user_block = []
total_users = 83434  # hard-coded denominator for the progress percentage
num_blocks = 0
down_sampling_of_negative = True

if down_sampling_of_negative:
    arff_path = 'users_downsample.arff'
else:
    arff_path = 'users_all.arff'

with open(arff_path, 'w') as arff:
    write_arff_header(users, arff)

    with open('turns_uniq.csv') as turns:
        for line in turns:
            # Only parsing is guarded: a malformed row is echoed and
            # skipped without disturbing the block being accumulated.
            try:
                user_id, timestamp, machine, level, balance = \
                    line.strip().split(',')
                timestamp = datetime.strptime(timestamp, "%Y-%m-%d %H:%M:%S")
                level = int(level)
                balance = int(balance)
            except ValueError:
                print(line)
                continue

            if last_user_id != '' and user_id != last_user_id:
                # The user id changed, so the previous block is complete.
                _write_user_block(last_user_id, user_block, users, arff,
                                  down_sampling_of_negative)
                user_block = []
                num_blocks += 1
                if num_blocks % 1000 == 0:
                    # Spacing matches the original comma-separated print.
                    print("finished users  %d  ( %s %%)"
                          % (num_blocks, num_blocks * 100.0 / total_users))

            user_block.append({"timestamp": timestamp,
                               "level": level,
                               "balance": balance,
                               "machine": machine,
                               "id": user_id})
            last_user_id = user_id

    # Process the last block with the same rules as every other block.  The
    # original wrote a stale ``user_row`` left over from an earlier user here.
    _write_user_block(last_user_id, user_block, users, arff,
                      down_sampling_of_negative)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment