Last active
September 6, 2018 11:54
-
-
Save chernyshev-alex/5968f074f06a641789d540c6fb2bb6dc to your computer and use it in GitHub Desktop.
generate Ad log entries for GU tasks
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import json
import random
import sys
from datetime import datetime, timedelta
#########################
# Ad logs JSON generator
#
# Help : python3 botgen.py -h
#
# Examples :
# Write log for 1 bot, 1000 users, 100 requests/sec, duration 300 seconds
#
# python3 botgen.py -b 1 -u 1000 -n 100 -d 300 -f data.json
#
# Notes :
# bots have ip 172.20.X.X and make a transition ~ 1 in sec
# users have ip 172.10.X.X and make a transition ~ 4 in sec
#
# == generate content ids for bots and users
# bot content ids: 1000 .. 1019 (range() excludes the upper bound 1020)
bot_categories = list(range(1000, 1020))
# bots range over twice as many distinct content ids as users do
# user content ids: 1000 .. 1009 followed by 1000 .. 1009 again; the
# repetition keeps the list as long as bot_categories but does not change
# the uniform odds of random.choice over the ten distinct ids.
user_categories = bot_categories[:len(bot_categories) // 2] * 2
# these functions return a random content id for users and for bots
def random_content_user():
    """Return a content id chosen uniformly at random from ``user_categories``."""
    return random.choice(user_categories)
def random_content_bot():
    """Return a content id chosen uniformly at random from ``bot_categories``."""
    return random.choice(bot_categories)
# generate a random action for users and for bots
# bots click more often than users
def random_action_user():
    """Return a random user action: 'click' 25% of the time, 'view' 75%."""
    # 'view' appears three times so random.choice yields click/view = 25/75.
    return random.choice(['click', 'view', 'view', 'view'])
def random_action_bot():
    """Return a random bot action: 'click' 75% of the time, 'view' 25%."""
    # 'click' appears three times so random.choice yields click/view = 75/25.
    return random.choice(['click', 'click', 'click', 'view'])
def user2ip(id):
    """Map a numeric user id to a dotted address of the form 172.10.X.X.

    The id is split in base 255: the quotient becomes the third octet and the
    remainder the fourth, e.g. ``290 -> "172.10.1.35"``.
    """
    # divmod stays in exact integer arithmetic; the former int(id / 255) went
    # through a float and could lose precision for very large ids.
    # NOTE: the parameter name shadows the builtin id(); kept for callers.
    third, fourth = divmod(id, 255)
    return "172.10.{}.{}".format(third, fourth)
def bot2ip(id):
    """Map a numeric bot id to a dotted address of the form 172.20.X.X.

    The id is split in base 255: the quotient becomes the third octet and the
    remainder the fourth, e.g. ``256 -> "172.20.1.1"``.
    """
    # divmod stays in exact integer arithmetic; the former int(id / 255) went
    # through a float and could lose precision for very large ids.
    # NOTE: the parameter name shadows the builtin id(); kept for callers.
    third, fourth = divmod(id, 255)
    return "172.20.{}.{}".format(third, fourth)
def asits(dt):
    """Return *dt* as whole seconds since the Unix epoch (fraction dropped)."""
    return int(dt.timestamp())


def asJson(entry):
    """Convert a log tuple into a JSON-serializable dict.

    *entry* is indexed positionally: ``entry[0]`` is a datetime, ``entry[1]``
    the category id, ``entry[2]`` the ip string and ``entry[3]`` the action.
    """
    return {
        'unix_time': asits(entry[0]),
        'category_id': entry[1],
        'ip': entry[2],
        'type': entry[3],
    }


def writeAsJson(entry, fd=None):
    """Emit one log entry as a JSON object.

    When *fd* is given the object is dumped to it with no separator or
    trailing newline (the caller writes those). Without *fd* the JSON text
    is printed to stdout, one object per line.
    """
    record = asJson(entry)
    if fd is not None:
        json.dump(record, fd)
    else:
        # The raw tuple (including a datetime repr) used to be printed here,
        # which is not JSON despite the function's name.
        print(json.dumps(record))
# Log generator for users & bots
def generate_log(args, start_time):
    """Yield log tuples ``(time, category_id, ip, action)`` one second at a time.

    For every second in ``[start_time, start_time + args.duration)``:
      * ``args.freq`` distinct user ids sampled from ``range(args.users)``
        each produce one entry;
      * on seconds whose integer Unix timestamp is even, every bot id in
        ``range(args.bots)`` produces one entry.

    Once the period is exhausted a summary line is printed to stdout.

    args: object exposing ``duration``, ``users``, ``freq`` and ``bots``.
    start_time: datetime of the first simulated second.
    """
    BOT_TRANSITION_EVERY_SEC = 2

    end_time = start_time + timedelta(seconds=args.duration)
    users = range(args.users)
    # random.sample raises ValueError when asked for more items than the
    # population holds (or for a negative count), so clamp the per-second
    # number of requests to what the user pool can actually supply.
    per_second = max(0, min(args.freq, len(users)))
    one_second = timedelta(seconds=1)

    now = start_time
    while now < end_time:
        for uid in random.sample(users, per_second):
            yield (now, random_content_user(), user2ip(uid), random_action_user())
        # Bots act only on every BOT_TRANSITION_EVERY_SEC-th second.
        if int(now.timestamp()) % BOT_TRANSITION_EVERY_SEC == 0:
            for bid in range(args.bots):
                yield (now, random_content_bot(), bot2ip(bid), random_action_bot())
        now += one_second
    print("generated for period :", start_time, end_time)
def do_generate(fd=None, options=None):
    """Generate the log starting at the current time and emit every entry.

    fd: optional writable text file. When given, consecutive entries are
        separated by ",\\n"; the caller writes the surrounding brackets.
    options: parsed arguments handed to generate_log. When omitted, the
        module-level ``args`` created by the script entry point is used,
        which was the only behaviour before this parameter existed.
    """
    if options is None:
        # Legacy fallback: only defined when the file runs as a script.
        options = args
    first = True
    for entry in generate_log(options, datetime.now()):
        if fd and not first:
            fd.write(",\n")
        first = False
        writeAsJson(entry, fd)


def main(args):
    """Run the generator, writing a JSON array to ``args.file`` if it is set.

    Without ``args.file`` the entries go to stdout via do_generate.
    """
    print("started with parameters :", args)
    if args.file:
        with open(args.file, 'w') as fd:
            fd.write("[")
            # Pass args explicitly so main() also works when imported and
            # no module-level ``args`` exists.
            do_generate(fd, args)
            fd.write("]")
    else:
        do_generate(options=args)
# Script entry point: parse the command line and run the generator.
if __name__ == '__main__':
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument('-b', '--bots', type=int, default=1, help="number of bots")
    parser.add_argument('-u', '--users', type=int, default=1000, help="number of users")
    parser.add_argument('-d', '--duration', type=int, default=300, help="log duration in sec")
    parser.add_argument('-n', '--freq', type=int, default=100, help="number of user's requests in sec")
    parser.add_argument('-f', '--file', type=str, default=None, help="write to file")
    # ``args`` is deliberately module-level: do_generate falls back to it
    # when it is not given the options explicitly.
    args = parser.parse_args()
    main(args)
Please update the example comment to use python3; I got an error when running it with python.
Task description states:
Data formats
All data is supplied in form of (multiple) files that got dumped on filesystem, each event is JSON, each JSON on its own line, with above mentioned fields [...] (emphasis mine)
But this script generates a single JSON array per file, with one JSON object per line inside it. I think the script should be modified to reflect the requirements.
Related: http://jsonlines.org/
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
[{"unix_time": 1528106331, "category_id": 1007, "ip": "172.10.1.35", "type": "view"},
{"unix_time": 1528106331, "category_id": 1009, "ip": "172.10.0.173", "type": "click"},
{"unix_time": 1528106331, "category_id": 1005, "ip": "172.10.3.43", "type": "click"},
{"unix_time": 1528106331, "category_id": 1001, "ip": "172.10.1.130", "type": "view"},
{"unix_time": 1528106331, "category_id": 1006, "ip": "172.10.1.22", "type": "click"},
{"unix_time": 1528106331, "category_id": 1003, "ip": "172.10.2.4", "type": "view"},
{"unix_time": 1528106331, "category_id": 1001, "ip": "172.10.1.125", "type": "view"},
{"unix_time": 1528106331, "category_id": 1009, "ip": "172.10.0.253", "type": "view"},
{"unix_time": 1528106331, "category_id": 1005, "ip": "172.10.1.59", "type": "view"},
{"unix_time": 1528106331, "category_id": 1006, "ip": "172.10.1.61", "type": "view"},
{"unix_time": 1528106331, "category_id": 1002, "ip": "172.10.1.14", "type": "view"},
{"unix_time": 1528106331, "category_id": 1001, "ip": "172.10.1.26", "type": "view"},
{"unix_time": 1528106331, "category_id": 1009, "ip": "172.10.0.52", "type": "view"},
{"unix_time": 1528106331, "category_id": 1005, "ip": "172.10.3.120", "type": "view"},
{"unix_time": 1528106331, "category_id": 1002, "ip": "172.10.3.135", "type": "view"},
{"unix_time": 1528106331, "category_id": 1002, "ip": "172.10.0.117", "type": "view"},
{"unix_time": 1528106331, "category_id": 1000, "ip": "172.10.0.209", "type": "view"},
{"unix_time": 1528106331, "category_id": 1004, "ip": "172.10.3.182", "type": "view"},
{"unix_time": 1528106331, "category_id": 1003, "ip": "172.10.2.223", "type": "click"},
{"unix_time": 1528106331, "category_id": 1004, "ip": "172.10.1.25", "type": "view"},
{"unix_time": 1528106331, "category_id": 1002, "ip": "172.10.0.112", "type": "view"},
{"unix_time": 1528106331, "category_id": 1008, "ip": "172.10.2.163", "type": "view"},
{"unix_time": 1528106331, "category_id": 1009, "ip": "172.10.0.177", "type": "view"},
{"unix_time": 1528106331, "category_id": 1002, "ip": "172.10.3.113", "type": "view"},
{"unix_time": 1528106331, "category_id": 1009, "ip": "172.10.1.210", "type": "click"},
{"unix_time": 1528106331, "category_id": 1003, "ip": "172.10.3.75", "type": "view"},
{"unix_time": 1528106331, "category_id": 1009, "ip": "172.10.0.61", "type": "click"},
{"unix_time": 1528106331, "category_id": 1009, "ip": "172.10.1.190", "type": "view"},
{"unix_time": 1528106331, "category_id": 1003, "ip": "172.10.0.182", "type": "view"},
{"unix_time": 1528106331, "category_id": 1002, "ip": "172.10.3.119", "type": "view"},
{"unix_time": 1528106331, "category_id": 1000, "ip": "172.10.0.108", "type": "view"},
{"unix_time": 1528106331, "category_id": 1006, "ip": "172.10.1.17", "type": "click"},
{"unix_time": 1528106331, "category_id": 1008, "ip": "172.10.0.11", "type": "click"}]