blisst · September 12, 2019 01:36
diff --git a/equalsWhenOneCharRemoved.py b/equalsWhenOneCharRemoved.py
 def equalsWhenOneCharRemoved(x, y):

    # Can't be different in length by more than 1
    if (abs(len(x) - len(y)) > 1):
        return False


    # Iterate through X and Y together
    diffFound = 0;
    lenX = len(x)
    lenY = len(y)

    for i in range(max(lenX, lenY)):
        # If we have reached the end of either string, break
        if (i >= lenX - 1 or i >= lenY -1):
            break

        # If they are different at an index, we add a difference
        if (x[i] != y[i]):
            diffFound += 1
        
        # If there is more than one difference, they are not similar enough
        if (diffFound > 1):
            return False

    # If we get here, check the number of differences is less than 1  
    return diffFound <= 1
diff --git a/mostPopularProduct.py b/mostPopularProduct.py
 import json

 def rankProducts(filePath):
    """
    Loop through each line of file at filePath, parsing as JSON for each line, and loading it into a dictionary.

    Structure of dictionary 'products' looks like:
    
    {
        product_id_1: {
            user_id_1: quantity_for_user_id_1
            user_id_2: quantity_for_user_id_2
            user_id_3: quantity_for_user_id_3
        }
         product_id_2: {
            user_id_1: quantity_for_user_id_1
            user_id_2: quantity_for_user_id_2
            user_id_3: quantity_for_user_id_3
        }
    }

    """

    products        = {}

    # Iterate through each line
    for line in open(filePath):
        lineJson    = json.loads(line.rstrip('\n'))
        productId   = lineJson["product_id"]
        userId      = lineJson["user_id"]
        quantity    = lineJson["quantity"]

        # Add product to dict if not in it already
        if productId not in products:
            products[productId] = {}

        # Add user_id to the product if not there already
        if userId not in products[productId]:
            products[productId][userId] = 0

        # Add the quantity of that product for that user
        products[productId][userId] += quantity


    # Go through each product_id and check which has sold the most quantity and which had the most unique purchasers, allowing for ties
    mostSoldMax = 0
    mostUsersMax = 0

    # Lists so we can have multiple entries for ties
    mostSold = []
    mostUniqueUsers = []

    for product_id in products.keys():
        # Number of unique users for a product_id is the number of user_ids for that product
        numUsers   = len(products[product_id].keys())

        # The total sold is the sum of the quantities over all the users for a given product
        productSum = sum(products[product_id].values())

        # Compute the maximum total quantity sold, filling a list with element product_ids that are tied
        if productSum > mostSoldMax:
            mostSold = [product_id]
            mostSoldMax = productSum
        elif productSum == mostSoldMax:
            mostSold.append(product_id)

        # Compute the product with the most unique users, filling a list with element product_ids that are tied
        if numUsers > mostUsersMax:
            mostUniqueUsers = [product_id]
            mostUsersMax    = numUsers

        elif numUsers == mostUsersMax:
            mostUniqueUsers.append(product_id)


    print("Most popular product(s) based on the number of purchasers: " + ", ".join(mostUniqueUsers))
    print("Most popular product(s) based on the quantity of goods sold: " + ", ".join(mostSold))

    # If we didn't have to support ties, we could use the following:
    # mostSold = max(products, key=lambda productId:sum(products[productId].values()))
    # mostUniqueUsers = max(products, key=lambda productId:len(products[productId].keys()))
    
diff --git a/siteAnalytics.sql b/siteAnalytics.sql
 -- LARGEST NUMBER OF UNIQUE USERS FOR COUNTRY BDV

 SELECT *, COUNT(user_id) as counts FROM timeseries
 WHERE country_id="BDV"
 GROUP BY site_id, user_id
 ORDER BY counts DESC;  


 -- 4 users who visited sites more than 10 times

 SELECT user_id, site_id, COUNT(user_id) as user_vists FROM timeseries
 WHERE ts > '2019-02-03 00:00:00' AND ts < '2019-02-04 23:59:59'
 GROUP BY site_id, user_id
 HAVING COUNT(user_id) > 10
 ORDER BY user_vists DESC;


 -- Which site was the most last-visited by users

 SELECT  a.*, COUNT(*)
 FROM    timeseries a
 INNER JOIN
 (
    SELECT user_id, MAX(ts) maxTime
    FROM    timeseries
    GROUP BY user_id
 ) b on a.user_id = b.user_id AND
        a.ts = maxTime
 GROUP BY site_id
 ORDER BY COUNT(*) DESC;

 -- HOW MANY user's first/last visits were to the same site

 SELECT COUNT(*), mnQ.user_id, mxQ.site_id last_site, mnQ.site_id first_site FROM 
 (
    SELECT  a.*
    FROM    timeseries a
        INNER JOIN
        (
            SELECT user_id, MAX(ts) maxTime
            FROM    timeseries
            GROUP BY user_id
        ) b on a.user_id = b.user_id AND
                a.ts = maxTime
 ) mxQ
 JOIN 
 (
    SELECT  c.*
    FROM    timeseries c
        INNER JOIN
        (
            SELECT user_id, MIN(ts) minTime
            FROM    timeseries
            GROUP BY user_id
        ) d on c.user_id = d.user_id AND
                c.ts = minTime
 ) mnQ
 WHERE mxQ.user_id = mnQ.user_id and mxQ.site_id = mnQ.site_id;
	def equalsWhenOneCharRemoved(x, y):

	# Can't be different in length by more than 1
	if (abs(len(x) - len(y)) > 1):
	return False


	# Iterate through X and Y together
	diffFound = 0;
	lenX = len(x)
	lenY = len(y)

	for i in range(max(lenX, lenY)):
	# If we have reached the end of either string, break
	if (i >= lenX - 1 or i >= lenY -1):
	break

	# If they are different at an index, we add a difference
	if (x[i] != y[i]):
	diffFound += 1

	# If there is more than one difference, they are not similar enough
	if (diffFound > 1):
	return False

	# If we get here, check the number of differences is less than 1
	return diffFound <= 1
	import json

	def rankProducts(filePath):
	"""
	Loop through each line of file at filePath, parsing as JSON for each line, and loading it into a dictionary.

	Structure of dictionary 'products' looks like:

	{
	product_id_1: {
	user_id_1: quantity_for_user_id_1
	user_id_2: quantity_for_user_id_2
	user_id_3: quantity_for_user_id_3
	}
	product_id_2: {
	user_id_1: quantity_for_user_id_1
	user_id_2: quantity_for_user_id_2
	user_id_3: quantity_for_user_id_3
	}
	}

	"""

	products = {}

	# Iterate through each line
	for line in open(filePath):
	lineJson = json.loads(line.rstrip('\n'))
	productId = lineJson["product_id"]
	userId = lineJson["user_id"]
	quantity = lineJson["quantity"]

	# Add product to dict if not in it already
	if productId not in products:
	products[productId] = {}

	# Add user_id to the product if not there already
	if userId not in products[productId]:
	products[productId][userId] = 0

	# Add the quantity of that product for that user
	products[productId][userId] += quantity


	# Go through each product_id and check which has sold the most quantity and which had the most unique purchasers, allowing for ties
	mostSoldMax = 0
	mostUsersMax = 0

	# Lists so we can have multiple entries for ties
	mostSold = []
	mostUniqueUsers = []

	for product_id in products.keys():
	# Number of unique users for a product_id is the number of user_ids for that product
	numUsers = len(products[product_id].keys())

	# The total sold is the sum of the quantities over all the users for a given product
	productSum = sum(products[product_id].values())

	# Compute the maximum total quantity sold, filling a list with element product_ids that are tied
	if productSum > mostSoldMax:
	mostSold = [product_id]
	mostSoldMax = productSum
	elif productSum == mostSoldMax:
	mostSold.append(product_id)

	# Compute the product with the most unique users, filling a list with element product_ids that are tied
	if numUsers > mostUsersMax:
	mostUniqueUsers = [product_id]
	mostUsersMax = numUsers

	elif numUsers == mostUsersMax:
	mostUniqueUsers.append(product_id)


	print("Most popular product(s) based on the number of purchasers: " + ", ".join(mostUniqueUsers))
	print("Most popular product(s) based on the quantity of goods sold: " + ", ".join(mostSold))

	# If we didn't have to support ties, we could use the following:
	# mostSold = max(products, key=lambda productId:sum(products[productId].values()))
	# mostUniqueUsers = max(products, key=lambda productId:len(products[productId].keys()))
	-- LARGEST NUMBER OF UNIQUE USERS FOR COUNTRY BDV

	SELECT *, COUNT(user_id) as counts FROM timeseries
	WHERE country_id="BDV"
	GROUP BY site_id, user_id
	ORDER BY counts DESC;


	-- 4 users who visited sites more than 10 times

	SELECT user_id, site_id, COUNT(user_id) as user_vists FROM timeseries
	WHERE ts > '2019-02-03 00:00:00' AND ts < '2019-02-04 23:59:59'
	GROUP BY site_id, user_id
	HAVING COUNT(user_id) > 10
	ORDER BY user_vists DESC;


	-- Which site was the most last-visited by users

	SELECT a., COUNT()
	FROM timeseries a
	INNER JOIN
	(
	SELECT user_id, MAX(ts) maxTime
	FROM timeseries
	GROUP BY user_id
	) b on a.user_id = b.user_id AND
	a.ts = maxTime
	GROUP BY site_id
	ORDER BY COUNT(*) DESC;

	-- HOW MANY user's first/last visits were to the same site

	SELECT COUNT(*), mnQ.user_id, mxQ.site_id last_site, mnQ.site_id first_site FROM
	(
	SELECT a.*
	FROM timeseries a
	INNER JOIN
	(
	SELECT user_id, MAX(ts) maxTime
	FROM timeseries
	GROUP BY user_id
	) b on a.user_id = b.user_id AND
	a.ts = maxTime
	) mxQ
	JOIN
	(
	SELECT c.*
	FROM timeseries c
	INNER JOIN
	(
	SELECT user_id, MIN(ts) minTime
	FROM timeseries
	GROUP BY user_id
	) d on c.user_id = d.user_id AND
	c.ts = minTime
	) mnQ
	WHERE mxQ.user_id = mnQ.user_id and mxQ.site_id = mnQ.site_id;