Last active
August 29, 2015 14:23
-
-
Save quinnj/9a5dd7598e34b2bec310 to your computer and use it in GitHub Desktop.
Managing Data in Julia: Old Tricks, New Tricks Code
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Pkg.clone("https://github.com/quinnj/Mmap.jl") | |
Pkg.clone("https://github.com/quinnj/CSV.jl") | |
Pkg.add("ODBC") | |
Pkg.checkout("SQLite","jq/remodel") | |
Pkg.add("SQLite") | |
Pkg.checkout("SQLite","jq/updates") | |
reload("Mmap") | |
reload("CSV") | |
reload("ODBC") | |
reload("SQLite") | |
### Mmap.jl | |
# Let's create a toy file | |
t = tempname() | |
open(t,"w") do f | |
write(f,"Hello World\n") | |
end | |
# mmap it | |
m = Mmap.mmap(t) | |
# Make a string out of it | |
ASCIIString(m) | |
# Can modify | |
m[5] = 0x00 | |
# See the modification | |
ASCIIString(m) | |
# Sync the changes to disk | |
Mmap.sync!(m) | |
# Verify changes from disk | |
open(readall,t) | |
# Let's make an mmapped-array linked to a new file | |
t2 = tempname() | |
m2 = Mmap.mmap(t2,Float64,100) | |
# put some data in our mmapped-array | |
copy!(m2,randn(100)) | |
m2 | |
# Sync changes to disk | |
Mmap.sync!(m2) | |
# close it | |
m2 = nothing; gc() | |
# Read the array back into Julia | |
Mmap.mmap(t2,Float64) | |
# Can also get chunks of memory | |
m3 = Mmap.mmap(Uint8,100) | |
### CSV.jl | |
# Pretty straightforward | |
DIR = joinpath(Pkg.dir("CSV"),"test/test_files") | |
readdir(DIR) | |
f = CSV.File(joinpath(DIR,"baseball.csv")) | |
data = CSV.read(f) | |
f = CSV.File(joinpath(DIR,"Sacramentorealestatetransactions.csv");newline='\r') | |
CSV.read(f) | |
f = CSV.File("/Users/jacobquinn/Downloads/bids.csv") | |
@time data = CSV.read(f) # 12.2 seconds | |
@time readcsv("/Users/jacobquinn/Downloads/bids.csv") | |
# 42.13 seconds | |
### R | |
# > system.time(fread("/Users/jacobquinn/Downloads/bids.csv")) | |
# Read 7656334 rows and 9 (of 9) columns from 0.862 GB file in 00:00:34 | |
# user system elapsed | |
# 32.829 0.662 33.506 | |
### with require(bit64) | |
# > system.time(fread("/Users/jacobquinn/Downloads/bids.csv")) | |
# Read 7656334 rows and 9 (of 9) columns from 0.862 GB file in 00:00:19 | |
# user system elapsed | |
# 18.424 0.569 19.002 | |
### Pandas | |
# start_time = time.time() | |
# pandas.read_csv("/Users/jacobquinn/Downloads/bids.csv") | |
# print("--- %s seconds ---" % (time.time() - start_time)) | |
# [7656334 rows x 9 columns] | |
# >>> print("--- %s seconds ---" % (time.time() - start_time)) | |
# --- 11.8574368954 seconds --- | |
# >>> | |
### ODBC | |
reload("ODBC") | |
co = ODBC.advancedconnect("Driver={MySQL Unicode};Server=ensembldb.ensembl.org;User=anonymous") | |
ODBC.query(co,"use homo_sapiens_vega_69_37") | |
ODBC.query(co,"select count(*) from exon") | |
ODBC.query(co,"show columns from exon") | |
ODBC.query(co,"select phase from exon group by phase") | |
ODBC.query(co,"select count(*) from exon where phase = 0") | |
# SQLite | |
db = SQLite.DB(joinpath(Pkg.dir("SQLite"),"test/Chinook_Sqlite.sqlite")) | |
results = SQLite.query(db,"SELECT name FROM sqlite_master WHERE type='table';") | |
results1 = SQLite.tables(db) | |
results = SQLite.query(db,"SELECT * FROM Employee;") | |
SQLite.query(db,"SELECT * FROM Album;") | |
SQLite.query(db,"SELECT a.*, b.AlbumId | |
FROM Artist a | |
LEFT OUTER JOIN Album b ON b.ArtistId = a.ArtistId | |
ORDER BY name;") | |
r = SQLite.query(db,"create table temp as select * from album") | |
r = SQLite.query(db,"select * from temp limit 10") | |
SQLite.query(db,"alter table temp add column colyear int") | |
SQLite.query(db,"update temp set colyear = 2014") | |
r = SQLite.query(db,"select * from temp limit 10") | |
SQLite.query(db,"alter table temp add column dates blob") | |
stmt = SQLite.Stmt(db,"update temp set dates = ?") | |
SQLite.bind!(stmt,1,Date(2014,1,1)) | |
SQLite.execute!(stmt) | |
r = SQLite.query(db,"select * from temp limit 10") | |
finalize(stmt) | |
SQLite.query(db,"drop table temp") | |
SQLite.create(db,"temp",zeros(5,5)) | |
r = SQLite.query(db,"select * from temp") | |
SQLite.drop!(db,"temp") | |
SQLite.create(db,"temp",zeros(Int,5,5)) | |
r = SQLite.query(db,"select * from temp") | |
SQLite.append!(db,"temp",ones(Int,5,5)) | |
r = SQLite.query(db,"select * from temp") | |
SQLite.drop!(db,"temp") | |
rng = Date(2013):Date(2013,1,5) | |
SQLite.create(db,"temp",[i for i = rng, j = rng]) | |
r = SQLite.query(db,"select * from temp") | |
SQLite.drop!(db,"temp") | |
triple(x) = 3x | |
SQLite.register(db, triple, nargs=1) | |
r = SQLite.query(db, "SELECT Total FROM Invoice ORDER BY InvoiceId LIMIT 5") | |
s = SQLite.query(db, "SELECT triple(Total) FROM Invoice ORDER BY InvoiceId LIMIT 5") | |
[3r[1] s[1]] | |
SQLite.@register db function add4(q) | |
q+4 | |
end | |
r = SQLite.query(db, "SELECT add4(AlbumId) FROM Album") | |
s = SQLite.query(db, "SELECT AlbumId FROM Album") | |
[r[1] s[1]+4] | |
SQLite.@register db mult(args...) = *(args...) | |
r = SQLite.query(db, "SELECT Milliseconds, Bytes FROM Track") | |
s = SQLite.query(db, "SELECT mult(Milliseconds, Bytes) FROM Track") | |
t = SQLite.query(db, "SELECT mult(Milliseconds, Bytes, 3, 4) FROM Track") | |
SQLite.@register db sin | |
u = SQLite.query(db, "select sin(milliseconds) from track limit 5") | |
SQLite.register(db, hypot; nargs=2, name="hypotenuse") | |
v = SQLite.query(db, "select hypotenuse(Milliseconds,bytes) from track limit 5") | |
SQLite.@register db str2arr(s) = convert(Array{UInt8}, s) | |
r = SQLite.query(db, "SELECT str2arr(LastName) FROM Employee LIMIT 2") | |
SQLite.@register db big | |
r = SQLite.query(db, "SELECT big(5)") | |
@assert r[1][1] == big(5) | |
doublesum_step(persist, current) = persist + current | |
doublesum_final(persist) = 2 * persist | |
SQLite.register(db, 0, doublesum_step, doublesum_final, name="doublesum") | |
r = SQLite.query(db, "SELECT doublesum(UnitPrice) FROM Track") | |
r = SQLite.query(db, "SELECT doublesum(Total) FROM Invoice") | |
s = SQLite.query(db, "SELECT UnitPrice FROM Track") | |
Base.Test.@test_approx_eq r[1][1] 2*sum(s[1]) | |
mycount(p, c) = p + 1 | |
SQLite.register(db, 0, mycount) | |
r = SQLite.query(db, "SELECT mycount(TrackId) FROM PlaylistTrack") | |
s = SQLite.query(db, "SELECT count(TrackId) FROM PlaylistTrack") | |
bigsum(p, c) = p + big(c) | |
SQLite.register(db, big(0), bigsum) | |
r = SQLite.query(db, "SELECT bigsum(TrackId) FROM PlaylistTrack") | |
s = SQLite.query(db, "SELECT TrackId FROM PlaylistTrack") | |
@assert r[1][1] == big(sum(s[1])) | |
@time SQLite.create(db,CSV.File("/Users/jacobquinn/Downloads/bids.csv"),"temp") | |
@time SQLite.append!(db,"temp",CSV.File("/Users/jacobquinn/Downloads/bids.csv")) | |
SQLite.scalarquery(db,"select count(*) from temp") | |
@time SQLite.query(db,"select country from temp group by 1") | |
SQLite.createindex(db,"temp","ind","country";unique=false) | |
@time SQLite.query(db,"select country from temp group by 1") | |
SQLite.drop!(db,"temp") | |
# Tables.jl | |
Pkg.clone("https://github.com/quinnj/Tables.jl") | |
using Tables | |
t = Tables.Table(Int,10,10) | |
size(t) == (10,10) | |
typeof(t) === Tables.Table | |
Tables.columns(t) | |
Tables.rows(t) | |
Tables.types(t) | |
t[1,1] = 1 | |
t | |
t[1,1:10] = 2 | |
for i = 1:10 | |
Base.Test.@test t[1,i] == 2 | |
end | |
t2 = t[1,1:10] | |
t3 = t[1:10,1] | |
t4 = t[wh"Column2 > 0"] | |
t5 = t[wh"Column2 > 0",1:2] | |
unique(t3) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment