Skip to content

Instantly share code, notes, and snippets.

@blisst
Last active September 12, 2019 01:36
Show Gist options
  • Save blisst/ac798ec9563615fcd5bf9b10dca0e245 to your computer and use it in GitHub Desktop.
Save blisst/ac798ec9563615fcd5bf9b10dca0e245 to your computer and use it in GitHub Desktop.
Ryan Johnson - Moloco String Equals when One Char Removed
def equalsWhenOneCharRemoved(x, y):
# Can't be different in length by more than 1
if (abs(len(x) - len(y)) > 1):
return False
# Iterate through X and Y together
diffFound = 0;
lenX = len(x)
lenY = len(y)
for i in range(max(lenX, lenY)):
# If we have reached the end of either string, break
if (i >= lenX - 1 or i >= lenY -1):
break
# If they are different at an index, we add a difference
if (x[i] != y[i]):
diffFound += 1
# If there is more than one difference, they are not similar enough
if (diffFound > 1):
return False
# If we get here, check the number of differences is less than 1
return diffFound <= 1
import json
def rankProducts(filePath):
"""
Loop through each line of file at filePath, parsing as JSON for each line, and loading it into a dictionary.
Structure of dictionary 'products' looks like:
{
product_id_1: {
user_id_1: quantity_for_user_id_1
user_id_2: quantity_for_user_id_2
user_id_3: quantity_for_user_id_3
}
product_id_2: {
user_id_1: quantity_for_user_id_1
user_id_2: quantity_for_user_id_2
user_id_3: quantity_for_user_id_3
}
}
"""
products = {}
# Iterate through each line
for line in open(filePath):
lineJson = json.loads(line.rstrip('\n'))
productId = lineJson["product_id"]
userId = lineJson["user_id"]
quantity = lineJson["quantity"]
# Add product to dict if not in it already
if productId not in products:
products[productId] = {}
# Add user_id to the product if not there already
if userId not in products[productId]:
products[productId][userId] = 0
# Add the quantity of that product for that user
products[productId][userId] += quantity
# Go through each product_id and check which has sold the most quantity and which had the most unique purchasers, allowing for ties
mostSoldMax = 0
mostUsersMax = 0
# Lists so we can have multiple entries for ties
mostSold = []
mostUniqueUsers = []
for product_id in products.keys():
# Number of unique users for a product_id is the number of user_ids for that product
numUsers = len(products[product_id].keys())
# The total sold is the sum of the quantities over all the users for a given product
productSum = sum(products[product_id].values())
# Compute the maximum total quantity sold, filling a list with element product_ids that are tied
if productSum > mostSoldMax:
mostSold = [product_id]
mostSoldMax = productSum
elif productSum == mostSoldMax:
mostSold.append(product_id)
# Compute the product with the most unique users, filling a list with element product_ids that are tied
if numUsers > mostUsersMax:
mostUniqueUsers = [product_id]
mostUsersMax = numUsers
elif numUsers == mostUsersMax:
mostUniqueUsers.append(product_id)
print("Most popular product(s) based on the number of purchasers: " + ", ".join(mostUniqueUsers))
print("Most popular product(s) based on the quantity of goods sold: " + ", ".join(mostSold))
# If we didn't have to support ties, we could use the following:
# mostSold = max(products, key=lambda productId:sum(products[productId].values()))
# mostUniqueUsers = max(products, key=lambda productId:len(products[productId].keys()))
-- LARGEST NUMBER OF UNIQUE USERS FOR COUNTRY BDV
SELECT *, COUNT(user_id) as counts FROM timeseries
WHERE country_id="BDV"
GROUP BY site_id, user_id
ORDER BY counts DESC;
-- 4 users who visited sites more than 10 times
SELECT user_id, site_id, COUNT(user_id) as user_vists FROM timeseries
WHERE ts > '2019-02-03 00:00:00' AND ts < '2019-02-04 23:59:59'
GROUP BY site_id, user_id
HAVING COUNT(user_id) > 10
ORDER BY user_vists DESC;
-- Which site was the most last-visited by users
SELECT a.*, COUNT(*)
FROM timeseries a
INNER JOIN
(
SELECT user_id, MAX(ts) maxTime
FROM timeseries
GROUP BY user_id
) b on a.user_id = b.user_id AND
a.ts = maxTime
GROUP BY site_id
ORDER BY COUNT(*) DESC;
-- HOW MANY user's first/last visits were to the same site
SELECT COUNT(*), mnQ.user_id, mxQ.site_id last_site, mnQ.site_id first_site FROM
(
SELECT a.*
FROM timeseries a
INNER JOIN
(
SELECT user_id, MAX(ts) maxTime
FROM timeseries
GROUP BY user_id
) b on a.user_id = b.user_id AND
a.ts = maxTime
) mxQ
JOIN
(
SELECT c.*
FROM timeseries c
INNER JOIN
(
SELECT user_id, MIN(ts) minTime
FROM timeseries
GROUP BY user_id
) d on c.user_id = d.user_id AND
c.ts = minTime
) mnQ
WHERE mxQ.user_id = mnQ.user_id and mxQ.site_id = mnQ.site_id;
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment