Skip to content

Instantly share code, notes, and snippets.

@chernyshev-alex
Last active September 6, 2018 11:54
Show Gist options
  • Save chernyshev-alex/5968f074f06a641789d540c6fb2bb6dc to your computer and use it in GitHub Desktop.
Save chernyshev-alex/5968f074f06a641789d540c6fb2bb6dc to your computer and use it in GitHub Desktop.
generate Ad log entries for GU tasks
import sys
import json
import random
from datetime import datetime, timedelta
#########################
# Ad logs json generator
#
# Help : python3 botgen.py -h
#
# Examples :
# Write log for 1 bot, 1000 users, 100 requests/sec, duration 300 seconds
#
# python3 botgen.py -b 1 -u 1000 -n 100 -d 300 -f data.json
#
# Notes :
# bots have ip 172.20.X.X and make a transition ~ 1 in sec
# users have ip 172.10.X.X and make a transition ~ 4 in sec
#
# == Content id pools that bots and users draw from
# Bots choose among 20 distinct content ids: 1000 .. 1019 (range end is exclusive).
bot_categories = list(range(1000, 1020))
# Users only see the first half of those ids (1000 .. 1009). The half is
# repeated twice, so the user pool has as many slots as the bot pool but
# half as many distinct ids.
user_categories = bot_categories[:len(bot_categories) // 2] * 2
# these functions return a random content id for users and bots
def random_content_user():
    """Return one content id picked at random from ``user_categories``."""
    pool = user_categories
    return random.choice(pool)
def random_content_bot():
    """Return one content id picked at random from ``bot_categories``."""
    pool = bot_categories
    return random.choice(pool)
# generate random action for users, bots
# bots click more often than users
def random_action_user():
    """Return a random action for a regular user.

    The choice is made among four equally likely slots, one 'click' and
    three 'view', i.e. click/view = 25/75.
    """
    weighted_actions = ('click',) + ('view',) * 3
    return random.choice(weighted_actions)
def random_action_bot():
    """Return a random action for a bot.

    The choice is made among four equally likely slots, three 'click' and
    one 'view', i.e. click/view = 75/25.
    """
    weighted_actions = ('click',) * 3 + ('view',)
    return random.choice(weighted_actions)
def user2ip(id):
    """Build the 172.10.X.X address string for a numeric user id.

    The third field is ``id / 255`` truncated to an int, the fourth is
    ``id % 255``.
    """
    high_part = int(id / 255)
    low_part = id % 255
    return ".".join(["172", "10", str(high_part), str(low_part)])
def bot2ip(id):
    """Build the 172.20.X.X address string for a numeric bot id.

    The third field is ``id / 255`` truncated to an int, the fourth is
    ``id % 255``.
    """
    high_part = int(id / 255)
    low_part = id % 255
    return ".".join(["172", "20", str(high_part), str(low_part)])
def asits(dt):
    """Return the unix time of datetime ``dt`` as an int (fraction dropped)."""
    seconds = dt.timestamp()
    return int(seconds)
def asJson(entry):
    """Turn a log entry into a JSON-serializable dict.

    entry -- indexable record: [0] datetime, [1] content id, [2] ip, [3] action

    The datetime is converted to an integer unix time via asits(); the other
    three fields are copied unchanged.
    """
    record = {}
    record['unix_time'] = asits(entry[0])
    record['category_id'] = entry[1]
    record['ip'] = entry[2]
    record['type'] = entry[3]
    return record
def writeAsJson(entry, fd = None):
    """Serialize one log entry as a JSON object.

    entry -- log record accepted by asJson(): (datetime, content id, ip, action)
    fd    -- writable text file object; when None the JSON text is printed
             to stdout, one object per line

    No separator is written; the caller is responsible for any delimiter
    between consecutive entries in a file.
    """
    record = asJson(entry)
    if fd is not None:
        json.dump(record, fd)
    else:
        # Emit real JSON on stdout as well; printing the raw entry would show
        # a Python tuple with a datetime repr, which no JSON parser accepts.
        print(json.dumps(record))
# Log generator for users & bots
def generate_log(args, start_time):
    """Yield synthetic ad-log entries, one simulated second at a time.

    args       -- object with integer attributes ``users``, ``bots``,
                  ``freq`` and ``duration``
    start_time -- datetime of the first simulated second

    Every yielded entry is a tuple (datetime, content id, ip, action).
    For each second in [start_time, start_time + duration):
      * ``freq`` distinct users are sampled and each emits one entry;
      * if the unix time of that second is a multiple of
        BOT_TRANSITION_EVERY_SEC, every bot emits one entry too.
    When the generator is exhausted it prints the covered period.
    """
    BOT_TRANSITION_EVERY_SEC = 2
    t1, t2 = start_time, start_time + timedelta(seconds=args.duration)
    users = range(0, args.users)
    # random.sample() raises ValueError when asked for more items than the
    # population holds (e.g. "-u 50" with the default "-n 100"), so cap the
    # per-second sample at the number of available users.
    requests_per_sec = min(args.freq, len(users))
    while t1 < t2:
        for uid in random.sample(users, requests_per_sec):
            yield (t1, random_content_user(), user2ip(uid), random_action_user())
        if int(t1.timestamp()) % BOT_TRANSITION_EVERY_SEC == 0:
            for bid in range(0, args.bots):
                yield (t1, random_content_bot(), bot2ip(bid), random_action_bot())
        t1 += timedelta(seconds=1)
    print("generated for period :", start_time, t2)
def do_generate(fd = None):
    """Generate the whole log and emit every entry through writeAsJson().

    fd -- writable text file object, or None

    When ``fd`` is given, consecutive entries are separated by ",\\n".
    The log starts at the current local time (datetime.now()).
    """
    # NOTE: ``args`` is not a parameter here; it is the module-level name
    # assigned in the ``__main__`` guard.
    entries = generate_log(args, datetime.now())
    for position, entry in enumerate(entries):
        # A separator goes in front of every entry except the first one.
        if position and fd:
            fd.write(",\n")
        writeAsJson(entry, fd)
def main(args):
    """Run the generator with the parsed command-line options.

    args -- options object; ``args.file`` is the output path or a falsy value

    With a file path the entries are wrapped in "[" ... "]" inside that
    file; without one do_generate() is called with no file object.
    """
    print("started with parameters :", args)
    if not args.file:
        do_generate()
        return
    with open(args.file, 'w') as fd:
        fd.write("[")
        do_generate(fd)
        fd.write("]")
if __name__ == '__main__':
    import argparse

    # (short flag, long flag, value type, default, help text) per CLI option
    cli_options = (
        ('-b', '--bots', int, 1, "number of bots"),
        ('-u', '--users', int, 1000, "number of users"),
        ('-d', '--duration', int, 300, "log duration in sec"),
        ('-n', '--freq', int, 100, "number of user's requests in sec"),
        ('-f', '--file', str, None, "write to file"),
    )
    parser = argparse.ArgumentParser()
    for short_flag, long_flag, value_type, fallback, description in cli_options:
        parser.add_argument(short_flag, long_flag, type=value_type,
                            default=fallback, help=description)
    # The parsed options must live in the module-level name ``args``:
    # do_generate() reads that name directly.
    args = parser.parse_args()
    main(args)
@chernyshev-alex
Copy link
Author

chernyshev-alex commented May 29, 2018

[{"unix_time": 1528106331, "category_id": 1007, "ip": "172.10.1.35", "type": "view"},
{"unix_time": 1528106331, "category_id": 1009, "ip": "172.10.0.173", "type": "click"},
{"unix_time": 1528106331, "category_id": 1005, "ip": "172.10.3.43", "type": "click"},
{"unix_time": 1528106331, "category_id": 1001, "ip": "172.10.1.130", "type": "view"},
{"unix_time": 1528106331, "category_id": 1006, "ip": "172.10.1.22", "type": "click"},
{"unix_time": 1528106331, "category_id": 1003, "ip": "172.10.2.4", "type": "view"},
{"unix_time": 1528106331, "category_id": 1001, "ip": "172.10.1.125", "type": "view"},
{"unix_time": 1528106331, "category_id": 1009, "ip": "172.10.0.253", "type": "view"},
{"unix_time": 1528106331, "category_id": 1005, "ip": "172.10.1.59", "type": "view"},
{"unix_time": 1528106331, "category_id": 1006, "ip": "172.10.1.61", "type": "view"},
{"unix_time": 1528106331, "category_id": 1002, "ip": "172.10.1.14", "type": "view"},
{"unix_time": 1528106331, "category_id": 1001, "ip": "172.10.1.26", "type": "view"},
{"unix_time": 1528106331, "category_id": 1009, "ip": "172.10.0.52", "type": "view"},
{"unix_time": 1528106331, "category_id": 1005, "ip": "172.10.3.120", "type": "view"},
{"unix_time": 1528106331, "category_id": 1002, "ip": "172.10.3.135", "type": "view"},
{"unix_time": 1528106331, "category_id": 1002, "ip": "172.10.0.117", "type": "view"},
{"unix_time": 1528106331, "category_id": 1000, "ip": "172.10.0.209", "type": "view"},
{"unix_time": 1528106331, "category_id": 1004, "ip": "172.10.3.182", "type": "view"},
{"unix_time": 1528106331, "category_id": 1003, "ip": "172.10.2.223", "type": "click"},
{"unix_time": 1528106331, "category_id": 1004, "ip": "172.10.1.25", "type": "view"},
{"unix_time": 1528106331, "category_id": 1002, "ip": "172.10.0.112", "type": "view"},
{"unix_time": 1528106331, "category_id": 1008, "ip": "172.10.2.163", "type": "view"},
{"unix_time": 1528106331, "category_id": 1009, "ip": "172.10.0.177", "type": "view"},
{"unix_time": 1528106331, "category_id": 1002, "ip": "172.10.3.113", "type": "view"},
{"unix_time": 1528106331, "category_id": 1009, "ip": "172.10.1.210", "type": "click"},
{"unix_time": 1528106331, "category_id": 1003, "ip": "172.10.3.75", "type": "view"},
{"unix_time": 1528106331, "category_id": 1009, "ip": "172.10.0.61", "type": "click"},
{"unix_time": 1528106331, "category_id": 1009, "ip": "172.10.1.190", "type": "view"},
{"unix_time": 1528106331, "category_id": 1003, "ip": "172.10.0.182", "type": "view"},
{"unix_time": 1528106331, "category_id": 1002, "ip": "172.10.3.119", "type": "view"},
{"unix_time": 1528106331, "category_id": 1000, "ip": "172.10.0.108", "type": "view"},
{"unix_time": 1528106331, "category_id": 1006, "ip": "172.10.1.17", "type": "click"},
{"unix_time": 1528106331, "category_id": 1008, "ip": "172.10.0.11", "type": "click"}]

@pborshchenko
Copy link

Please update example comment to use python3, I got error for python.

@zbstof
Copy link

zbstof commented Sep 6, 2018

Task description states:
Data formats

All data is supplied in form of (multiple) files that got dumped on filesystem, each event is JSON, each JSON on its own line, with above mentioned fields [...] (emphasis mine)

But this script generates a single JSON array per file, with one JSON object per line. I think the script should be modified to reflect the requirements.
Related: http://jsonlines.org/

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment