Last active
August 29, 2015 14:06
-
-
Save pawelmhm/e17d80663a51c2c756f7 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python | |
from __future__ import print_function | |
from optparse import OptionParser | |
import sys | |
from collections import OrderedDict | |
import warnings | |
warnings.filterwarnings("ignore") | |
import pandas as pd | |
from prettytable import PrettyTable | |
def get_discovery_stats():
    """Print summary statistics for a discovery-job CSV file.

    The CSV path is read from the first positional command-line argument.
    The code accesses the columns ``url_product_page``, ``category`` and
    ``url`` of that file.

    The report consists of:

    1) the number of unique product urls and the total number of urls,
    2) a table of the biggest categories (shelves): how many items each
       one holds and the first url recorded for that category.

    Command-line options:

    -f, --file       write the report to this file instead of stdout
    -l, --limit      number of categories to show (10 when omitted;
                     zero or a negative value shows no categories)
    -t, --trim_urls  cut category urls down to 100 characters

    Exits through ``parser.error`` (``SystemExit``) when no file argument
    is given or when reading the file raises ``IOError``.
    """
    usage = "\n\n > python compare_category.py test_urls/items_diy.com_discovery_19.csv\n"
    parser = OptionParser(usage)
    parser.add_option("-f", "--file", action="store", dest="job_data_file",
                      help="write output of script to file")
    parser.add_option("-l", "--limit", dest="limit", type="int",
                      help="limit output of categories")
    parser.add_option("-t", "--trim_urls", action="store_true")
    options, args = parser.parse_args()

    # parser.error() prints the usage text and exits, so nothing below runs
    # without a filename.
    if not args:
        parser.error("You should pass file to script")
    filename = args[0]

    try:
        job_data = pd.read_csv(filename)
    except IOError:
        parser.error("File does not exist!")

    limit = 10 if options.limit is None else options.limit

    unique_urls = len(job_data.url_product_page.unique())
    message = "\n Filename {0}\n ".format(filename)
    message += "\n Unique product urls in file: {0}\n ".format(unique_urls)
    message += "\n Total urls: {0} \n".format(len(job_data.url_product_page))
    message += "\n Now showing {0} biggest categories\n".format(limit)

    # Biggest categories first. Slicing up front (instead of counting rows
    # inside the loop) makes a limit of 0 really produce an empty table.
    category_counts = job_data.category.value_counts().to_dict()
    biggest = sorted(category_counts.items(),
                     key=lambda item: item[1], reverse=True)[:max(limit, 0)]

    pt = PrettyTable(field_names=["category", "items", "url"])
    pt.align["category"] = "l"
    pt.align["url"] = "r"

    for category, count in biggest:
        # Exact comparison on purpose: str.contains() would interpret the
        # category name as a regular expression and match substrings, so
        # "Tools" could pick up the url of "Power Tools", and rows with a
        # missing category would make the mask unusable.
        matching_urls = job_data.url[job_data.category == category]
        category_url = matching_urls.iloc[0]
        if options.trim_urls:
            category_url = category_url[:100]
        pt.add_row([category, count, category_url])

    if options.job_data_file:
        with open(options.job_data_file, "w") as f:
            print(message, file=f)
            print(pt, file=f)
    else:
        print(message)
        print(pt)
# Run the report only when this file is executed as a script, so that
# importing the module does not parse command-line arguments or print.
if __name__ == "__main__":
    get_discovery_stats()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment