Created
April 12, 2017 23:11
-
-
Save daroczig/d33aa44cdfa50bad972f90e3ea69a1ea to your computer and use it in GitHub Desktop.
overlap join demo
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# demo transactions data | |
library(data.table) | |
txns <- data.table( | |
rpid = rep(1:3, times = 4), | |
txid = 1:12, | |
time = c(10, 10, 10, 11, 15, 20, 12, 16, 25, 13, 21, 30)) | |
## overlap join to see which transactions happened withing 3 time units on the same rpid | |
## let's define the time periods for the overlap | |
txns[, start := time - 3] | |
## rpid txid time start | |
## 1: 1 1 10 7 | |
## 2: 2 2 10 7 | |
## 3: 3 3 10 7 | |
## 4: 1 4 11 8 | |
## 5: 2 5 15 12 | |
## 6: 3 6 20 17 | |
## 7: 1 7 12 9 | |
## 8: 2 8 16 13 | |
## 9: 3 9 25 22 | |
## 10: 1 10 13 10 | |
## 11: 2 11 21 18 | |
## 12: 3 12 30 27 | |
## and a reference dataset (simple copy of the orignal data) | |
txns2 <- copy(txns) | |
txns2[, start := time] | |
## rpid txid time start | |
## 1: 1 1 10 10 | |
## 2: 2 2 10 10 | |
## 3: 3 3 10 10 | |
## 4: 1 4 11 11 | |
## 5: 2 5 15 15 | |
## 6: 3 6 20 20 | |
## 7: 1 7 12 12 | |
## 8: 2 8 16 16 | |
## 9: 3 9 25 25 | |
## 10: 1 10 13 13 | |
## 11: 2 11 21 21 | |
## 12: 3 12 30 30 | |
## then let's set the keys for the future (overlap) join | |
setkey(txns, rpid, start, time) | |
setkey(txns2, rpid, start, time) | |
## doing the actual overlap join resulting in 19 rows: | |
## txid shows which transaction is within 3 time units of i.txid | |
foverlaps(txns, txns2) | |
## rpid txid time start i.txid i.time i.start | |
## 1: 1 1 10 10 1 10 7 | |
## 2: 1 1 10 10 4 11 8 | |
## 3: 1 4 11 11 4 11 8 | |
## 4: 1 1 10 10 7 12 9 | |
## 5: 1 4 11 11 7 12 9 | |
## 6: 1 7 12 12 7 12 9 | |
## 7: 1 1 10 10 10 13 10 | |
## 8: 1 4 11 11 10 13 10 | |
## 9: 1 7 12 12 10 13 10 | |
## 10: 1 10 13 13 10 13 10 | |
## 11: 2 2 10 10 2 10 7 | |
## 12: 2 5 15 15 5 15 12 | |
## 13: 2 5 15 15 8 16 13 | |
## 14: 2 8 16 16 8 16 13 | |
## 15: 2 11 21 21 11 21 18 | |
## 16: 3 3 10 10 3 10 7 | |
## 17: 3 6 20 20 6 20 17 | |
## 18: 3 9 25 25 9 25 22 | |
## 19: 3 12 30 30 12 30 27 | |
## count the number of transactions within the past 3 time units based on the above | |
foverlaps(txns, txns2)[, .N, by = .(txid = i.txid)][order(txid)] | |
## txid N | |
## 1: 1 1 | |
## 2: 2 1 | |
## 3: 3 1 | |
## 4: 4 2 | |
## 5: 5 1 | |
## 6: 6 1 | |
## 7: 7 3 | |
## 8: 8 2 | |
## 9: 9 1 | |
## 10: 10 4 | |
## 11: 11 1 | |
## 12: 12 1 | |
## merge this back to the original data | |
merge( | |
txns, | |
foverlaps(txns, txns2)[, .N, by = .(txid = i.txid)], | |
by = 'txid') | |
## txid rpid time start N | |
## 1: 1 1 10 7 1 | |
## 2: 2 2 10 7 1 | |
## 3: 3 3 10 7 1 | |
## 4: 4 1 11 8 2 | |
## 5: 5 2 15 12 1 | |
## 6: 6 3 20 17 1 | |
## 7: 7 1 12 9 3 | |
## 8: 8 2 16 13 2 | |
## 9: 9 3 25 22 1 | |
## 10: 10 1 13 10 4 | |
## 11: 11 2 21 18 1 | |
## 12: 12 3 30 27 1 | |
## the "N" column above shows the number of transactions within 3 time units | |
## (including the current transaction, so you might want N-1) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment