Created
November 28, 2018 06:47
-
-
Save shiumachi/e92737de20ec1d236becb38b9b9e9e36 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import argparse | |
import random | |
from datetime import date, timedelta | |
from random import shuffle | |
# Command-line interface. Every "--no-*" option is an integer count that
# sizes one part of the generated data set.
parser = argparse.ArgumentParser(description='retail data generator')

# Volume of the sales output.
parser.add_argument(
    '--no-file',
    type=int,
    default=1,
    help='number of files. default is 1.',
)
parser.add_argument(
    '--no-line',
    type=int,
    default=10000,
    help='number of lines. default is 10000.',
)

# Sizes of the dimension tables.
parser.add_argument(
    '--no-date',
    type=int,
    default=365,
    help='number of dates start from 2015-01-01 in YYYY-MM-DD format. default is 365.',
)
parser.add_argument(
    '--no-product',
    type=int,
    default=100,
    help='no-product: number of products. default is 100.',
)
parser.add_argument(
    '--no-product-category',
    type=int,
    default=3,
    help='number of product categories. default is 3.',
)
parser.add_argument(
    '--no-store',
    type=int,
    default=10,
    help='number of stores. default is 10.',
)
parser.add_argument(
    '--no-city',
    type=int,
    default=3,
    help='number of cities of stores. default is 3.',
)

# Value ranges for the sales measures. The unit-price default is None here;
# the script entry point replaces None with twice the product count.
parser.add_argument(
    '--no-sales_quantity',
    type=int,
    default=107,
    help='number of sales quantities. default is 107.',
)
parser.add_argument(
    '--no-unit_price',
    type=int,
    default=None,
    help='number of unit prices. default is (no_product) * 2.',
)

# Reproducibility switch.
parser.add_argument(
    '--always-same-result',
    action='store_true',
    help='if you need always same result, please set this option. random data is generated by default.',
)
def generate_rand_tables(p):
    """Build the shuffled lookup tables used to generate sales rows.

    Each table is a random permutation of the integers ``1..count``, where
    ``count`` is taken from the matching ``p.no_*`` setting.

    Args:
        p: settings object (e.g. the argparse namespace) providing
            ``always_same_result``, ``no_sales_quantity``, ``no_unit_price``,
            ``no_date``, ``no_product``, ``no_product_category``,
            ``no_store`` and ``no_city``.

    Returns:
        dict mapping the table names ``'sales_quantities'``,
        ``'unit_prices'``, ``'dates'``, ``'products'``,
        ``'product_categories'``, ``'stores'`` and ``'cities'`` to their
        shuffled lists.
    """
    # A fixed seed makes the output reproducible; seeding with None lets the
    # random module pick a fresh seed, exactly like a bare random.seed().
    random.seed(0 if p.always_same_result is True else None)

    # The unit-price count is optional; fall back to twice the product count
    # so the function also works when the caller has not filled it in.
    # ``p`` itself is left untouched.
    unit_price_count = p.no_unit_price
    if unit_price_count is None:
        unit_price_count = p.no_product * 2

    # The order matters: tables are shuffled one after another from the same
    # random stream, so reordering would change a seeded result.
    table_sizes = (
        # sales table
        ('sales_quantities', p.no_sales_quantity),
        ('unit_prices', unit_price_count),
        # date table
        ('dates', p.no_date),
        # product table
        ('products', p.no_product),
        ('product_categories', p.no_product_category),
        # store table
        ('stores', p.no_store),
        ('cities', p.no_city),
    )

    rand_tables = {}
    for table_name, size in table_sizes:
        values = list(range(1, size + 1))
        shuffle(values)
        rand_tables[table_name] = values
    return rand_tables
def gen_date_dim(p):
    """Write the date dimension to ``dates.csv`` in the current directory.

    One row per day, ``<id>,<YYYY-MM-DD>``, where ids start at 1 and the
    first day is 2015-01-01.

    Args:
        p: settings object providing ``no_date``, the number of rows to write.
    """
    d0 = date(2015, 1, 1)
    # The context manager closes the file even if a write fails part-way.
    with open("dates.csv", "w") as f:
        for i in range(p.no_date):
            d = d0 + timedelta(days=i)
            f.write("{0},{1}\n".format(str(i + 1), d.isoformat()))
def gen_product_dim(p, rand_tables):
    """Write the product dimension to ``products.csv`` in the current directory.

    One row per product, ``<id>,item<id>,category<n>``, where ids start at 1
    and ``n`` is ``id % p.no_product_category`` (so ``n`` ranges over
    ``0 .. no_product_category - 1``).

    Args:
        p: settings object providing ``no_product`` and
            ``no_product_category``.
        rand_tables: accepted but not used by this function.
    """
    # The context manager closes the file even if a write fails part-way.
    with open("products.csv", "w") as f:
        for i in range(p.no_product):
            product_no = i + 1
            product_id = str(product_no)
            name = "item" + str(product_no)
            category_no = product_no % p.no_product_category
            category = "category" + str(category_no)
            f.write("{0},{1},{2}\n".format(product_id, name, category))
def gen_store_dim(p, rand_tables):
    """Write the store dimension to ``stores.csv`` in the current directory.

    One row per store, ``<id>,store<id>,city<n>``, where ids start at 1 and
    ``n`` is ``id % p.no_city`` (so ``n`` ranges over ``0 .. no_city - 1``).

    Args:
        p: settings object providing ``no_store`` and ``no_city``.
        rand_tables: accepted but not used by this function.
    """
    # The context manager closes the file even if a write fails part-way.
    with open("stores.csv", "w") as f:
        for i in range(p.no_store):
            store_no = i + 1
            store_id = str(store_no)
            name = "store" + str(store_no)
            city_no = store_no % p.no_city
            city = "city" + str(city_no)
            f.write("{0},{1},{2}\n".format(store_id, name, city))
def gen_sales(p, rand_tables):
    """Generate every sales file, numbered 1 through ``p.no_file``.

    Args:
        p: settings object providing ``no_file``; passed on unchanged.
        rand_tables: lookup tables passed on unchanged to the per-file writer.
    """
    file_no = 1
    while file_no <= p.no_file:
        gen_sales_per_file(p, rand_tables, file_no)
        file_no += 1
def gen_sales_per_file(p, rand_tables, file_no):
    """Write ``p.no_line`` sales rows to ``sales<file_no>.csv``.

    Each row is
    ``date_id,product_id,store_id,sales_quantity,unit_price,sales_amount``.
    The first five values are picked from the lookup tables by a running line
    counter taken modulo the matching ``p.no_*`` size; ``sales_amount`` is
    ``sales_quantity * unit_price``.

    The counter runs from ``p.no_line * file_no + 1`` through
    ``p.no_line * (file_no + 1)``, so consecutive file numbers cover
    consecutive, non-overlapping counter ranges.

    Args:
        p: settings object providing ``no_line``, ``no_date``, ``no_product``,
            ``no_store``, ``no_sales_quantity`` and ``no_unit_price``.
        rand_tables: dict with the lists ``'dates'``, ``'products'``,
            ``'stores'``, ``'sales_quantities'`` and ``'unit_prices'``.
        file_no: number used in the output file name and to offset the
            line counter.
    """
    start_line = (p.no_line * file_no) + 1
    end_line = p.no_line * (file_no + 1)

    # Look the tables up once instead of on every row.
    dates = rand_tables['dates']
    products = rand_tables['products']
    stores = rand_tables['stores']
    sales_quantities = rand_tables['sales_quantities']
    unit_prices = rand_tables['unit_prices']

    # The context manager closes the file even if a write fails part-way.
    with open("sales{0}.csv".format(file_no), "w") as f:
        for line in range(start_line, end_line + 1):
            date_id = dates[line % p.no_date]
            product_id = products[line % p.no_product]
            store_id = stores[line % p.no_store]
            sales_quantity = sales_quantities[line % p.no_sales_quantity]
            unit_price = unit_prices[line % p.no_unit_price]
            sales_amount = sales_quantity * unit_price
            fields = (date_id, product_id, store_id,
                      sales_quantity, unit_price, sales_amount)
            f.write(','.join(map(str, fields)) + "\n")
if __name__ == '__main__':
    # Script entry point: parse the options, then write every CSV file.
    p = parser.parse_args()

    # When no unit-price count was given, use twice the product count.
    if p.no_unit_price is None:
        p.no_unit_price = 2 * p.no_product

    rand_tables = generate_rand_tables(p)

    # Dimension tables first, then the sales files.
    gen_date_dim(p)
    gen_product_dim(p, rand_tables)
    gen_store_dim(p, rand_tables)
    gen_sales(p, rand_tables)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment