IronistM · March 24, 2014 16:14
diff --git a/redshift_noddy_example.r b/redshift_noddy_example.r
 require(redshift)

 conn <- redshift.connect("jdbc:postgresql://mycluster.redshift.amazonaws.com:5439/data", "user", "pass")

 # we can retrieve a list of tables
 tables <- redshift.tables(conn)

 # and get some info about the columns in one of those tables
 cols <- redshift.columns(conn, "weblog")

 # lets run a simple query, a number of requests (by their status code) grouped by day
 statuses_by_day <- redshift.query(conn, "SELECT DATE(time_stamp) as dated, status, COUNT(1) as request_count",
                                        "FROM weblog",
                                        "GROUP BY DATE(time_stamp), status",
                                        "ORDER BY dated"))
 statuses_by_day$dated <- as.Date(statuses_by_day$dated)
 statuses_by_day$status <- as.factor(statuses_by_day$status)

 # lets do a little scatter plot to see how this looks
 require(ggplot2)
 p <- ggplot(statuses_by_day, aes(x=dated, y=request_count))
 p + geom_point(aes(color=status))

 # lets see if 200 success requests correlate with the number of
 # whatchamajig transactions we make by day
 success.requests.byday <- redshift.query(conn, "SELECT DATE(time_stamp) as dated, COUNT(1) as requests",
                                               "FROM weblog",
                                               "WHERE status = 200",
                                               "GROUP BY DATE(time_stamp)",
                                               "ORDER BY dated ASC"))

 # next, number of transactions by day too
 txns.byday <- redshift.query(conn, "SELECT DATE(time_stamp) as dated, COUNT(1) as transactions",
                                   "FROM whatchamajig_transactions",
                                   "GROUP BY DATE(time_stamp)",
                                   "ORDER BY dated ASC"))

 traffic.txns.byday <- merge(success.requests.byday, txns.byday, by="dated")
 traffic.txns.byday$dated <- as.Date(traffic.txns.byday$dated)

 # quick plot to see whether it looks realistic?
 traffic.txns.p <- ggplot(traffic.txns.byday, aes(x=requests, y=transactions))
 traffic.txns.p + geom_point()

 # lets use cor.test to test and measure significance
 cor.test(traffic.txns.byday$requests, traffic.txns.byday$transactions, alternative="greater")

 # interesting, but we may have influenced the data by only including successful
 # requests, redirects and errors may also be interesting

 # ok, so how about correlation to all requests
 requests.byday <- redshift.query(conn, "SELECT DATE(time_stamp) as dated, COUNT(1) as requests",
                                       "FROM weblog",
                                       "GROUP BY DATE(time_stamp)",
                                       "ORDER BY dated ASC"))

 all.traffic.txns <- merge(requests.byday, txns.byday, by="dated")
 all.traffic.txns$dated <- as.Date(all.traffic.txns$dated)

 all.traffic.txns.p <- ggplot(all.traffic.txns, aes(x=requests, y=transactions))
 all.traffic.txns.p + geom_point()

 cor.test(all.traffic.txns$requests, all.traffic.txns$transactions, alternative="greater")

 # what about traffic to _only_ a specific section... is that stronger?
 whatchamajig.requests.byday <- redshift.query(conn, "SELECT DATE(time_stamp) as dated, COUNT(1) as requests",
                                                    "FROM weblog",
                                                    "WHERE request_path LIKE '%whatchamajig%'",
                                                    "GROUP BY DATE(time_stamp)",
                                                    "ORDER BY dated ASC"))
 whatchamajig.requests.txns <- merge(whatchamajig.requests.byday, txns.byday, by="dated")
 whatchamajig.requests.txns$dated <- as.Date(whatchamajig.requests.txns$dated)

 # lets look at the plot
 whatchamajig.requests.txns.p <- ggplot(whatchamajig.requests.txns, aes(x=requests, y=transactions))
 whatchamajig.requests.txns.p + geom_point()

 # and one last final correlation test :)
 cor.test(whatchamajig.requests.txns$requests, whatchamajig.requests.txns$transactions, alternative="greater")
	require(redshift)

	conn <- redshift.connect("jdbc:postgresql://mycluster.redshift.amazonaws.com:5439/data", "user", "pass")

	# we can retrieve a list of tables
	tables <- redshift.tables(conn)

	# and get some info about the columns in one of those tables
	cols <- redshift.columns(conn, "weblog")

	# lets run a simple query, a number of requests (by their status code) grouped by day
	statuses_by_day <- redshift.query(conn, "SELECT DATE(time_stamp) as dated, status, COUNT(1) as request_count",
	"FROM weblog",
	"GROUP BY DATE(time_stamp), status",
	"ORDER BY dated"))
	statuses_by_day$dated <- as.Date(statuses_by_day$dated)
	statuses_by_day$status <- as.factor(statuses_by_day$status)

	# lets do a little scatter plot to see how this looks
	require(ggplot2)
	p <- ggplot(statuses_by_day, aes(x=dated, y=request_count))
	p + geom_point(aes(color=status))

	# lets see if 200 success requests correlate with the number of
	# whatchamajig transactions we make by day
	success.requests.byday <- redshift.query(conn, "SELECT DATE(time_stamp) as dated, COUNT(1) as requests",
	"FROM weblog",
	"WHERE status = 200",
	"GROUP BY DATE(time_stamp)",
	"ORDER BY dated ASC"))

	# next, number of transactions by day too
	txns.byday <- redshift.query(conn, "SELECT DATE(time_stamp) as dated, COUNT(1) as transactions",
	"FROM whatchamajig_transactions",
	"GROUP BY DATE(time_stamp)",
	"ORDER BY dated ASC"))

	traffic.txns.byday <- merge(success.requests.byday, txns.byday, by="dated")
	traffic.txns.byday$dated <- as.Date(traffic.txns.byday$dated)

	# quick plot to see whether it looks realistic?
	traffic.txns.p <- ggplot(traffic.txns.byday, aes(x=requests, y=transactions))
	traffic.txns.p + geom_point()

	# lets use cor.test to test and measure significance
	cor.test(traffic.txns.byday$requests, traffic.txns.byday$transactions, alternative="greater")

	# interesting, but we may have influenced the data by only including successful
	# requests, redirects and errors may also be interesting

	# ok, so how about correlation to all requests
	requests.byday <- redshift.query(conn, "SELECT DATE(time_stamp) as dated, COUNT(1) as requests",
	"FROM weblog",
	"GROUP BY DATE(time_stamp)",
	"ORDER BY dated ASC"))

	all.traffic.txns <- merge(requests.byday, txns.byday, by="dated")
	all.traffic.txns$dated <- as.Date(all.traffic.txns$dated)

	all.traffic.txns.p <- ggplot(all.traffic.txns, aes(x=requests, y=transactions))
	all.traffic.txns.p + geom_point()

	cor.test(all.traffic.txns$requests, all.traffic.txns$transactions, alternative="greater")

	# what about traffic to _only_ a specific section... is that stronger?
	whatchamajig.requests.byday <- redshift.query(conn, "SELECT DATE(time_stamp) as dated, COUNT(1) as requests",
	"FROM weblog",
	"WHERE request_path LIKE '%whatchamajig%'",
	"GROUP BY DATE(time_stamp)",
	"ORDER BY dated ASC"))
	whatchamajig.requests.txns <- merge(whatchamajig.requests.byday, txns.byday, by="dated")
	whatchamajig.requests.txns$dated <- as.Date(whatchamajig.requests.txns$dated)

	# lets look at the plot
	whatchamajig.requests.txns.p <- ggplot(whatchamajig.requests.txns, aes(x=requests, y=transactions))
	whatchamajig.requests.txns.p + geom_point()

	# and one last final correlation test :)
	cor.test(whatchamajig.requests.txns$requests, whatchamajig.requests.txns$transactions, alternative="greater")