Skip to content

Instantly share code, notes, and snippets.

@pawelmhm
Last active August 29, 2015 14:06
Show Gist options
  • Save pawelmhm/e17d80663a51c2c756f7 to your computer and use it in GitHub Desktop.
#!/usr/bin/python
# Summarise a discovery-job CSV: per-category item counts and unique
# product-url totals, printed as a table to stdout or to a file.
from __future__ import print_function
from optparse import OptionParser
import sys
from collections import OrderedDict
import warnings
# Suppress library (pandas) warnings so the report output stays clean.
warnings.filterwarnings("ignore")
import pandas as pd
from prettytable import PrettyTable
# NOTE(review): `sys` looks unused in the visible code — confirm before removing.
def get_discovery_stats():
    """Print summary statistics for a discovery-job CSV file.

    The CSV path is the first positional argument; the file must contain
    ``url_product_page``, ``category`` and ``url`` columns. Reports:

    1) item counts per category (largest first, capped by ``-l/--limit``)
    2) the number of unique product urls

    Output goes to stdout, or to the file named by ``-f/--file``.
    ``-t/--trim_urls`` truncates the example url column to 100 characters.
    """
    usage = "\n\n > python compare_category.py test_urls/items_diy.com_discovery_19.csv\n"
    parser = OptionParser(usage)
    parser.add_option("-f", "--file", action="store", dest="job_data_file",
                      help="write output of script to file")
    # default=10 replaces the post-hoc "if options.limit is None" dance.
    parser.add_option("-l", "--limit", dest="limit", type="int", default=10,
                      help="limit output of categories")
    parser.add_option("-t", "--trim_urls", action="store_true",
                      help="truncate category urls to 100 characters")
    options, args = parser.parse_args()

    if not args:
        parser.error("You should pass file to script")
    filename = args[0]

    try:
        job_data = pd.read_csv(filename)
    except IOError:
        parser.error("File does not exist!")

    limit = options.limit

    unique_urls = len(job_data.url_product_page.unique())
    message = "\n Filename {0}\n ".format(filename)
    message += "\n Unique product urls in file: {0}\n ".format(unique_urls)
    message += "\n Total urls: {0} \n".format(len(job_data.url_product_page))
    message += "\n Now showing {0} biggest categories\n".format(limit)

    # value_counts() already returns counts sorted descending, so the
    # OrderedDict(sorted(...)) re-sort from the original is unnecessary.
    category_counts = job_data.category.value_counts()

    pt = PrettyTable(field_names=["category", "items", "url"])
    pt.align["category"] = "l"
    pt.align["url"] = "r"
    for category, count in category_counts.head(limit).items():
        # Exact equality instead of str.contains(): category names may
        # contain regex metacharacters (which would raise or mismatch),
        # and substring matching could pick a url belonging to a
        # different, longer-named category.
        category_url = job_data.url[job_data.category == category].unique()[0]
        if options.trim_urls:
            # Slicing is safe for short strings; no length check needed.
            category_url = category_url[:100]
        pt.add_row([category, count, category_url])

    if options.job_data_file:
        with open(options.job_data_file, "w") as f:
            print(message, file=f)
            print(pt, file=f)
    else:
        print(message)
        print(pt)
# Script entry point: parse CLI args and print the discovery report.
if __name__ == "__main__":
    get_discovery_stats()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment