Created
November 28, 2018 06:37
-
-
Save shiumachi/34cb6745e446a0a38ebc703ea9a44d74 to your computer and use it in GitHub Desktop.
data generator for Hive / Impala demo
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import argparse | |
import random | |
# Usage template handed to argparse; "%(prog)s" is expanded by argparse to the
# program name when the usage/help text is rendered.
usage = "%(prog)s [options]\n"
def init_parser():
    """Build the command-line parser for the data generator.

    Options defined here:
        -f / --filenum    int, default 0. Selects the output file number.
        -l / --linenum    int, default 10. Number of lines to generate.
        -t / --type       str, default 'A'. Table type key (A, B or C).
        -r / --randomize  flag. Asks for a randomized line count.

    Returns:
        argparse.ArgumentParser: the configured parser; nothing is parsed yet.
    """
    parser = argparse.ArgumentParser(prog='datagen.py', usage=usage)
    parser.add_argument("-f", "--filenum", type=int, default=0,
                        help="No. of file. default:0. file will be written to /tmp/[filenum].txt")
    parser.add_argument("-l", "--linenum", type=int, default=10,
                        help="No. of line. default:10")
    # The generator dispatch table in this file has three keys (A, B, C);
    # the help text previously advertised only A and B.
    parser.add_argument("-t", "--type", default='A',
                        help="table type. A, B or C")
    parser.add_argument("-r", "--randomize", action='store_true',
                        help="randomize linenum")
    return parser
def get_site_id(num):
    """Return the site id for *num*: the cube of *num* reduced modulo 73."""
    cubed = num ** 3
    return cubed % 73
def get_domain(num, maxnum=2887):
    """Map *num* to a domain string.

    The cube of *num* is reduced modulo *maxnum* to pick a bucket:
      * bucket below 300        -> "" (empty string)
      * bucket from 300 to 499  -> "mypage.com"
      * any other bucket        -> "<bucket>.com", zero-padded to 3 digits
    """
    bucket = (num ** 3) % maxnum
    if bucket < 300:
        return ""
    if bucket < 500:
        return "mypage.com"
    return "%03d.com" % bucket
def generate_func_A(num):
    """Build one comma-separated row for table type A.

    Columns, in output order:
        ri_page_url      STRING  "/NNN.html", NNN = num ** 3 % 757 (zero-padded)
        ri_refferer_url  STRING  value of get_domain(num)
        ri_site_id       INT     value of get_site_id(num)
    """
    page_number = num ** 3 % 757
    fields = (
        '/%03d.html' % page_number,
        get_domain(num),
        get_site_id(num),
    )
    return "%s,%s,%d" % fields
def generate_func_B(num):
    """Build one comma-separated row for table type B.

    Columns, in output order:
        su_site_id  INT     *num* itself
        su_url      STRING  "mypage.com" when num is 1, otherwise
                            "<num - 1>.com" zero-padded to 3 digits
    """
    su_url = "mypage.com" if num == 1 else '%03d.com' % (num - 1)
    return "%d,%s" % (num, su_url)
def generate_func_C(num):
    """Build one comma-separated row for table type C.

    Columns, in output order:
        id      INT     *num* itself
        org_id  INT     num ** 3 % 757
        name    STRING  "user" followed by *num*
    """
    row = (num, num ** 3 % 757, "user{0}".format(num))
    return "{0},{1},{2}".format(*row)
# Dispatch table: table-type key -> function that renders one row for it.
generate_func_dict = dict(
    A=generate_func_A,
    B=generate_func_B,
    C=generate_func_C,
)
def main(argv=None):
    """Parse the command line and write generated rows to /tmp/<filenum>.txt.

    Args:
        argv: optional list of argument strings. When None (the default),
            argparse reads the arguments from sys.argv, as before.

    The output file name is the file number zero-padded to three digits.
    Row numbers run consecutively from ``filenum * 10**9``. An unknown
    table type falls back to 'A'.
    """
    args = init_parser().parse_args(argv)

    # First row number for this file: filenum scaled by 10**9.
    start = args.filenum * (10 ** 9)

    line_count = args.linenum
    if args.randomize:
        # random.random() is in [0.0, 1.0), so this scales the requested
        # count down by a random factor and truncates toward zero.
        line_count = int(line_count * random.random())

    # Unknown table types silently fall back to generator 'A'.
    generate_func = generate_func_dict.get(args.type, generate_func_dict['A'])

    with open('/tmp/%03d.txt' % args.filenum, 'w') as f:
        # range() is lazy on Python 3; xrange() no longer exists there.
        f.writelines(generate_func(i) + '\n'
                     for i in range(start, start + line_count))
# Script entry point: run main() only when this file is executed directly,
# not when it is imported as a module.
if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment