Skip to content

Instantly share code, notes, and snippets.

@pawelmhm
Last active August 29, 2015 14:06
Show Gist options
  • Save pawelmhm/e17d80663a51c2c756f7 to your computer and use it in GitHub Desktop.
#!/usr/bin/python
# Summarise a discovery-job CSV: per-category item counts and unique
# product-url totals, printed as a table to stdout or to a file.
from __future__ import print_function
from optparse import OptionParser
import sys
from collections import OrderedDict
import warnings
# Suppress library (pandas) warnings so the report output stays clean.
warnings.filterwarnings("ignore")
import pandas as pd
from prettytable import PrettyTable
# NOTE(review): `sys` looks unused in the visible code — confirm before removing.
def get_discovery_stats():
    """Print summary statistics for a discovery-job CSV file.

    The CSV path is the first positional argument; the file must contain
    ``url_product_page``, ``category`` and ``url`` columns. Reports:

    1) item counts per category (largest first, capped by ``-l/--limit``)
    2) the number of unique product urls

    Output goes to stdout, or to the file named by ``-f/--file``.
    ``-t/--trim_urls`` truncates the example url column to 100 characters.
    """
    usage = "\n\n > python compare_category.py test_urls/items_diy.com_discovery_19.csv\n"
    parser = OptionParser(usage)
    parser.add_option("-f", "--file", action="store", dest="job_data_file",
                      help="write output of script to file")
    # default=10 replaces the post-hoc "if options.limit is None" dance.
    parser.add_option("-l", "--limit", dest="limit", type="int", default=10,
                      help="limit output of categories")
    parser.add_option("-t", "--trim_urls", action="store_true",
                      help="truncate category urls to 100 characters")
    options, args = parser.parse_args()

    if not args:
        parser.error("You should pass file to script")
    filename = args[0]

    try:
        job_data = pd.read_csv(filename)
    except IOError:
        parser.error("File does not exist!")

    limit = options.limit

    unique_urls = len(job_data.url_product_page.unique())
    message = "\n Filename {0}\n ".format(filename)
    message += "\n Unique product urls in file: {0}\n ".format(unique_urls)
    message += "\n Total urls: {0} \n".format(len(job_data.url_product_page))
    message += "\n Now showing {0} biggest categories\n".format(limit)

    # value_counts() already returns counts sorted descending, so the
    # OrderedDict(sorted(...)) re-sort from the original is unnecessary.
    category_counts = job_data.category.value_counts()

    pt = PrettyTable(field_names=["category", "items", "url"])
    pt.align["category"] = "l"
    pt.align["url"] = "r"
    for category, count in category_counts.head(limit).items():
        # Exact equality instead of str.contains(): category names may
        # contain regex metacharacters (which would raise or mismatch),
        # and substring matching could pick a url belonging to a
        # different, longer-named category.
        category_url = job_data.url[job_data.category == category].unique()[0]
        if options.trim_urls:
            # Slicing is safe for short strings; no length check needed.
            category_url = category_url[:100]
        pt.add_row([category, count, category_url])

    if options.job_data_file:
        with open(options.job_data_file, "w") as f:
            print(message, file=f)
            print(pt, file=f)
    else:
        print(message)
        print(pt)
# Script entry point: parse CLI args and print the discovery report.
if __name__ == "__main__":
    get_discovery_stats()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment