-
-
Save IronistM/9743461 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
require(redshift) | |
conn <- redshift.connect("jdbc:postgresql://mycluster.redshift.amazonaws.com:5439/data", "user", "pass") | |
# we can retrieve a list of tables | |
tables <- redshift.tables(conn) | |
# and get some info about the columns in one of those tables | |
cols <- redshift.columns(conn, "weblog") | |
# lets run a simple query, a number of requests (by their status code) grouped by day | |
statuses_by_day <- redshift.query(conn, "SELECT DATE(time_stamp) as dated, status, COUNT(1) as request_count", | |
"FROM weblog", | |
"GROUP BY DATE(time_stamp), status", | |
"ORDER BY dated")) | |
statuses_by_day$dated <- as.Date(statuses_by_day$dated) | |
statuses_by_day$status <- as.factor(statuses_by_day$status) | |
# lets do a little scatter plot to see how this looks | |
require(ggplot2) | |
p <- ggplot(statuses_by_day, aes(x=dated, y=request_count)) | |
p + geom_point(aes(color=status)) | |
# lets see if 200 success requests correlate with the number of | |
# whatchamajig transactions we make by day | |
success.requests.byday <- redshift.query(conn, "SELECT DATE(time_stamp) as dated, COUNT(1) as requests", | |
"FROM weblog", | |
"WHERE status = 200", | |
"GROUP BY DATE(time_stamp)", | |
"ORDER BY dated ASC")) | |
# next, number of transactions by day too | |
txns.byday <- redshift.query(conn, "SELECT DATE(time_stamp) as dated, COUNT(1) as transactions", | |
"FROM whatchamajig_transactions", | |
"GROUP BY DATE(time_stamp)", | |
"ORDER BY dated ASC")) | |
traffic.txns.byday <- merge(success.requests.byday, txns.byday, by="dated") | |
traffic.txns.byday$dated <- as.Date(traffic.txns.byday$dated) | |
# quick plot to see whether it looks realistic? | |
traffic.txns.p <- ggplot(traffic.txns.byday, aes(x=requests, y=transactions)) | |
traffic.txns.p + geom_point() | |
# lets use cor.test to test and measure significance | |
cor.test(traffic.txns.byday$requests, traffic.txns.byday$transactions, alternative="greater") | |
# interesting, but we may have influenced the data by only including successful | |
# requests, redirects and errors may also be interesting | |
# ok, so how about correlation to all requests | |
requests.byday <- redshift.query(conn, "SELECT DATE(time_stamp) as dated, COUNT(1) as requests", | |
"FROM weblog", | |
"GROUP BY DATE(time_stamp)", | |
"ORDER BY dated ASC")) | |
all.traffic.txns <- merge(requests.byday, txns.byday, by="dated") | |
all.traffic.txns$dated <- as.Date(all.traffic.txns$dated) | |
all.traffic.txns.p <- ggplot(all.traffic.txns, aes(x=requests, y=transactions)) | |
all.traffic.txns.p + geom_point() | |
cor.test(all.traffic.txns$requests, all.traffic.txns$transactions, alternative="greater") | |
# what about traffic to _only_ a specific section... is that stronger? | |
whatchamajig.requests.byday <- redshift.query(conn, "SELECT DATE(time_stamp) as dated, COUNT(1) as requests", | |
"FROM weblog", | |
"WHERE request_path LIKE '%whatchamajig%'", | |
"GROUP BY DATE(time_stamp)", | |
"ORDER BY dated ASC")) | |
whatchamajig.requests.txns <- merge(whatchamajig.requests.byday, txns.byday, by="dated") | |
whatchamajig.requests.txns$dated <- as.Date(whatchamajig.requests.txns$dated) | |
# lets look at the plot | |
whatchamajig.requests.txns.p <- ggplot(whatchamajig.requests.txns, aes(x=requests, y=transactions)) | |
whatchamajig.requests.txns.p + geom_point() | |
# and one last final correlation test :) | |
cor.test(whatchamajig.requests.txns$requests, whatchamajig.requests.txns$transactions, alternative="greater") |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment