##
# Hello hello!
#
# This is meant to be a quick (and fun?) sense-check of your Ruby!
#
# Notes:
# 0. How are you today? 👋
# 1. Feel free to download the CSV locally, paste lines of this file into irb, or run this with `ruby requests.rb`!
# 2. We're loosely available to field questions on the email thread (or via [email protected])
# 3. Don't overanalyze your implementation/formatting/variable names/etc.; do what comes naturally to you
# 4. More than anything, thanks for taking the time to work through this! -BN
#
#
require 'uri'
require 'net/http'
require 'csv'
require 'time'
# The following requests.csv file is a logfile as it might be generated by a Rails app or monitoring tool.
# Load 'requests.csv' from the filesystem if we already have it
csv = nil
begin
  csv = CSV.read('requests.csv')
rescue Errno::ENOENT
  uri = URI('https://chmln-east.s3.amazonaws.com/tmp/recruiting/fullstack/requests.csv')
  res = Net::HTTP.get_response(uri)
  csv = CSV.parse(res.body)
  puts "Raw CSV size: #{res.body.size / 1000 / 1000}MB"
end
csv.shift # drop the header row
#=> ["request id","timestamp","user id","method","path","response time","db time"]
csv.sample
#=> ["586596", "2029-03-23T03:19:05.967Z", "5d92c5a154558f806feba09d", "PATCH", "/profiles/:id", "22ms", "6ms"]
##
# 1. what is the average response time?
puts '1. rt'
# Before we get started, I want to be clear that I'm using GitHub Copilot because I've become
# accustomed to using it every day at work. It's not very good at _authoring_ code, but it's an
# excellent code completion tool and it really helps productivity.
# The strict mean can be calculated by summing all the response times and dividing by the number of
# records, but this will be pretty slow on this data set. Here's a method I keep in my irbrc for
# quick and dirty timing tests.
def log_elapsed_time(message = nil)
  start_time = Time.now
  yield
ensure
  puts "#{message || 'Calculation time:'} #{Time.now - start_time}"
end
# Copilot suggested `#{csv.map{ |record| record[5].to_i }.sum / csv.size.to_f}ms`, but I have some
# concerns here.
# 1. Copilot wants to use :to_i to coerce the elapsed time and strip the unit, but what if the
# unit isn't consistently in ms? We could easily get an error if the units aren't consistent.
puts "Any non-ms duration units?: #{csv.any? { |r| !r[5].end_with?('ms') }}"
# I guess not. You win _this_ time, Copilot.
# Assuming we have tight control over the log format and we're feeling comfy coupling our log
# analysis to that format, we can proceed with the sloppy coercion - but we should acknowledge the
# risk (see the stricter parser sketch below). Of course, in real life all of this should be
# off-the-shelf code anyway.
# 2. Mapping over the list and then calculating the sum is a double loop. C'mon, Copilot!
# We'll just pass a block to :sum instead.
# 3. The :to_f on `csv.size` bugs me. I get why Copilot "wants" it, but it makes a little more
# intuitive sense to put that in the sum. This also makes us slightly more resilient to log format
# changes, but not enough to really matter - we're still coupled to the log format.
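# A stricter alternative, sketched for completeness (not used below; `parse_ms` is a
# hypothetical helper, not part of the exercise):
# ```
# def parse_ms(duration)
#   match = /\A(\d+(?:\.\d+)?)ms\z/.match(duration)
#   raise ArgumentError, "unexpected duration: #{duration.inspect}" unless match
#   match[1].to_f
# end
# parse_ms('22ms') #=> 22.0
# ```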
log_elapsed_time do
  puts "Strict mean (slow): #{csv.sum { |record| record[5].to_f } / csv.size}ms"
end
# Okay, here is where I was going to do some statistical sampling of the data and talk about
# standard deviations to show off, but the "slow" way is actually pretty fast, so we can save that
# for when it becomes a problem (sketched below anyway). One of the most important lessons I've
# learned in my career is when to leave unimportant work undone.
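# For the record, the sampling version would have looked roughly like this (sketch only;
# the 10_000 sample size is an arbitrary assumption):
# ```
# sample = csv.sample(10_000)
# puts "Estimated mean from sample: #{(sample.sum { |r| r[5].to_f } / sample.size).round(2)}ms"
# ```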
##
# 2. how many users online?
puts '2. online'
# I'm assuming that users can only make requests with their own user_id, and that the presence of a
# user_id implies that the request was made by that user. This certainly isn't true in all apps!
begin_time = Time.parse(csv.first[1]) # No TZ data? That's a little scary.
end_time = Time.parse(csv.last[1])
duration = end_time - begin_time
log_elapsed_time do
  puts "#{csv.uniq { |r| r[2] }.size} unique user_ids in #{duration} seconds"
end
# This takes ~700ms on my machine; that's maybe acceptable.
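# If "online" means "active within some window" rather than "ever seen", a windowed count is
# more honest. A sketch (the 5-minute bucket size is an arbitrary assumption, and parsing a
# million timestamps will be slow):
# ```
# peak = csv.group_by { |r| Time.parse(r[1]).to_i / 300 }
#           .values
#           .map { |rows| rows.map { |r| r[2] }.uniq.size }
#           .max
# puts "Peak users in any 5-minute bucket: #{peak}"
# ```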
##
# 3. what are the ids of the 5 slowest requests?
puts '3. slow'
# Yeah, there's not a lot of room for optimization here; it's sorting time. This is one for the
# KISS principle - when you need more performance, it's time for a real analytics solution.
log_elapsed_time do
  puts '5 slowest requests:'
  csv.sort_by { |r| r[5].to_f } # "response time"
     .last(5)
     .reverse
     .each { |row| puts row.join(', ') }
end
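# An equivalent alternative I could have reached for: Enumerable#max_by takes a count and
# returns the results largest-first, which skips the last/reverse dance (sketch):
# ```
# csv.max_by(5) { |r| r[5].to_f }.each { |row| puts row.join(', ') }
# ```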
# Holy cow, request 555173 took 2.1 seconds, but only spent 10ms in the database? WTF is that?
# Honestly, this smells like an in-band http request or something else vile.
##
# 4. who is overusing this system?
puts '4. overuse'
# Hmm, what do we mean by "overusing"? I'll give a few options.
# Here's something else I keep in my .irbrc; it's annoying to me that `tally` still doesn't accept
# a block.
module Enumerable
  def tally_by(&block)
    group_by(&block).transform_values(&:count)
  end
end
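# For example:
#   %w[a bb cc dd].tally_by(&:length) #=> {1=>1, 2=>3}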
# We're doing more double/triple loops here and even repeating calculations; we could definitely
# optimize this if we needed to. The way I'd do that is to keep a running tally as we walk the CSV
# once (sketched after this paragraph), but that WILL be more complicated, and if this were
# production code I'd actually advocate against it unless we knew this was an important bottleneck
# in some process.
#
# In my experience, one of the most expensive resources is human brainpower. A simpler
# implementation that's easier to understand, validate, and maintain is ALMOST ALWAYS the right
# choice.
#
# It's always tricky to walk the line between "easy" and "fast", and frequently this comes down
# to factors that exist outside of the source code entirely. For example, I'll lean farther into
# "easy" and pay more for scaling if the company has (and plans to keep hiring) a lot of very junior
# developers. If I have more senior devs around to help, I'll lean more towards "fast" because we
# can afford more complex solutions.
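# The single-pass version I'm advocating against would look roughly like this (sketch only,
# untested against the real data):
# ```
# stats = Hash.new { |h, k| h[k] = { count: 0, response: 0.0, db: 0.0 } }
# csv.each do |r|
#   s = stats[r[2]]
#   s[:count] += 1
#   s[:response] += r[5].to_f
#   s[:db] += r[6].to_f
# end
# ```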
log_elapsed_time do
  puts 'Top 5 users by request count:'
  csv.tally_by { |r| r[2] } # "user id"
     .sort_by { |_k, v| v }
     .last(5)
     .reverse
     .each { |row| puts row.join(', ') }
end
log_elapsed_time do
  puts 'Top 5 users by total response time:'
  requests_by_user = csv.group_by { |r| r[2] } # "user id"
  total_response_time_by_user = requests_by_user.transform_values { |rows| rows.sum { |r| r[5].to_f } }
  total_response_time_by_user.sort_by { |_k, v| v }.last(5).reverse.each do |row|
    puts row.join(', ')
  end
end
log_elapsed_time do
  puts 'Top 5 users by total database time:'
  requests_by_user = csv.group_by { |r| r[2] } # "user id"
  total_database_time_by_user = requests_by_user.transform_values { |rows| rows.sum { |r| r[6].to_f } }
  total_database_time_by_user.sort_by { |_k, v| v }.last(5).reverse.each do |row|
    puts row.join(', ')
  end
end
# Copilot is being a real pal here now: after seeing the last two, it suggested basically 100% of
# this last one once I wrote `puts 'Top 5 users by total database time:'`.
# User "5df8492154558f806feb62c0" showed up in both the total response and total database top 5!
# Not surprising that we'd see someone in both of these, of course, but it could be worth following
# up to see what makes them different.
##
# 5. analyze this data to find a key insight
puts '5. key insight'
log_elapsed_time do
  # How much time do we spend in each endpoint?
  puts 'Endpoints by total response time:'
  requests_by_endpoint = csv.group_by { |r| "#{r[3]} - #{r[4]}" } # e.g. "GET - /profiles/:id"
  total_response_time_by_endpoint = requests_by_endpoint.transform_values { |rows| rows.sum { |r| r[5].to_f } }
  total_response_time_by_endpoint
    .sort_by { |_k, v| v }
    .reverse
    .each do |row|
      # format the decimal like 34.04 # <-- I used this comment and then wrote `formatted_time = `
      # to prompt Copilot to remember float formatting for me.
      # formatted_time = format('%.2f', row[1])
      puts "#{row[0]}, #{format('%.2f', row[1] / (1000.0 * 60 * 60))} hours"
    end
end
# So we're spending a lot more time PATCHing profiles than doing anything else. That's not super
# surprising, but it's good to know. The fact that we're spending MANY HOURS of resource time in 12
# seconds of wall time is VERY interesting. Clearly this system is operating at scale!
# Let's double-check my math, because that's pretty surprising. We already know the average response
# time per request is about 26.6ms, and there are 999999 rows in the CSV.
# (26.6 * 999999)/(1000*60*60) = 7.4 hours. That matches my other math, so... yep!
log_elapsed_time do
  # How much database time do we spend in each endpoint?
  puts 'Endpoints by database time:'
  requests_by_endpoint = csv.group_by { |r| "#{r[3]} - #{r[4]}" } # e.g. "GET - /profiles/:id"
  database_time_by_endpoint = requests_by_endpoint.transform_values { |rows| rows.sum { |r| r[6].to_f } }
  database_time_by_endpoint
    .sort_by { |_k, v| v }
    .reverse
    .each do |row|
      puts "#{row[0]}, #{format('%.2f', row[1] / (1000.0 * 60 * 60))} hours"
    end
end
# This matches our total response time very well. In fact, it matches so well that I'm gaining
# confidence that we're looking at very synthetic data. ;)
# I was hoping I'd find a discrepancy that I could point out as an opportunity for further
# investigation.
# Let's look at the distribution of response times (I won't bother with database times for this
# exercise; I expect it to be very similar).
log_elapsed_time do
  sorted_response_times = csv.map { |r| r[5].to_f }.sort!
  puts 'Response time distribution:'
  puts "min: #{sorted_response_times.first}"
  puts "p50: #{sorted_response_times[sorted_response_times.size * 50 / 100]}"
  puts "p75: #{sorted_response_times[sorted_response_times.size * 75 / 100]}"
  puts "p90: #{sorted_response_times[sorted_response_times.size * 90 / 100]}"
  puts "p95: #{sorted_response_times[sorted_response_times.size * 95 / 100]}"
  puts "max: #{sorted_response_times.last}"
  # Wow, Copilot is doing good work here. I barely wrote this section. I find it interesting that
  # Copilot basically chose to use fractions for the percentiles; I would have used decimals. In
  # hindsight, keeping it as ints isn't a terrible idea because I'm indexing after all. Copilot
  # wants to use simplified fractions, though, which requires human interpretation. I changed it
  # to percentile fractions for self-documentation and clarity.
end
# WOW! p95 is 69ms and max is 2.1s! That's a pretty wild distribution. It definitely looks like
# there's something to look into for our worst case scenario. Let's see a few more.
log_elapsed_time do
  sorted_response_times = csv.map { |r| r[5].to_f }.sort!
  puts "p96: #{sorted_response_times[sorted_response_times.size * 96 / 100]}"
  puts "p97: #{sorted_response_times[sorted_response_times.size * 97 / 100]}"
  puts "p98: #{sorted_response_times[sorted_response_times.size * 98 / 100]}"
  puts "p99: #{sorted_response_times[sorted_response_times.size * 99 / 100]}"
  puts "max: #{sorted_response_times.last}"
end
# Huh, the distribution is pretty smooth all the way up to p99 and then kaboom! I'm detecting some
# hand-tweaked values, but we can pretend these are pathological requests to look into. :)
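# If I were keeping this, I'd extract the repeated indexing into a helper (`percentile` is a
# hypothetical name, not from the exercise):
# ```
# def percentile(sorted, pct)
#   sorted[(sorted.size * pct / 100).clamp(0, sorted.size - 1)]
# end
# percentile(sorted_response_times, 95)
# ```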
##
# Bonus question: Imagine you received this CSV as a "stream" -- like tailing a log file --
# and had to maintain a list of the "current top 10 slowest requests" so that we could
# read the current top 10 at any time during the stream.
#
# Said another way, it's like keeping a running top 10 of the slowest requests.
#
# 6. what are the slowest 10 requests directly after row number 75121 is processed?
#
puts '6. top 10'
# I was going to make a function to "stream" records, and Copilot suggested the following:
# ```
# def stream_csv(csv)
#   Enumerator.new do |y|
#     records.each do |record|
#       y << record
#     end
#   end
# end
# ```
#
# Reading this, though, I realized that the CSV is of course already an Enumerable. I can just
# use `:each`!
# I don't usually go for procedural strategies because keeping state tends to breed bugs. I vastly
# prefer pure functions and static data structures, but that's simply not possible for this
# exercise.
def running_10_slowest_requests(rows)
  top10 = []
  rows.each do |row|
    # puts "top10:"
    # top10.each_with_index { |r,i| puts "\t#{i}\t#{r.join(', ')}"}
    # puts "this row: #{row}"
    # Early cases, just add it and sort our records.
    if top10.size < 10
      top10 << row
      top10.sort_by! { |r| r[5].to_f }
      next
    end
    # We have 10 records. If the next row we see is slower than any of our 10 (slower than the
    # fastest one), we can replace the fastest one and sort.
    least_slowest_response_time = top10.first[5].to_f
    if row[5].to_f > least_slowest_response_time
      top10[0] = row
      top10.sort_by! { |r| r[5].to_f }
    end
  end
  top10.reverse
end
# running_10_slowest_requests(csv[..12]).each { |row| puts row.join(', ') }
running_10_slowest_requests(csv[..75121]).each { |row| puts row.join(', ') }
# running_10_slowest_requests(csv).each { |row| puts row.join(', ') }
# Copilot was surprisingly unhelpful this time. I wonder if that's because it's a common interview
# question? It might be possible to optimize this by being more careful with the index (sketched
# below), or by not bothering to keep the list sorted (so I only walk the list once per new row),
# but I think it's actually pretty likely that I would LOSE performance trying to take low-level
# control, because the sort happens in C and is probably pretty damned fast.
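# For completeness, the "more careful with the index" variant would swap the replace-and-resort
# for a binary-search insertion (sketch only, untested; this would replace the body of the second
# `if` above):
# ```
# idx = top10.bsearch_index { |r| r[5].to_f >= row[5].to_f } || top10.size
# top10.insert(idx, row)
# top10.shift # drop the fastest to keep the list at 10
# ```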
#=>