Created
March 20, 2015 20:23
-
-
Save jarsen/7174147c4cc3a4638a4b to your computer and use it in GitHub Desktop.
just playing with julia data
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Install the Gadfly package, then load it.
Pkg.add("Gadfly")
using Gadfly

# Random cool meta stuff: ask Julia how it sees a piece of code.
print(code_typed(print, ()))    # type-inferred AST of `print` called with no arguments
:(2 + 2)                        # quote an expression to see its AST
print(code_lowered(print, ()))  # see the lowered AST
# Both reflection calls return an array: it holds one entry per method that
# matches the given function and argument types.
# Install the remaining packages. Only DataFrames and RDatasets are loaded
# with `using` here; Cairo is installed but not loaded in this script.
Pkg.add("Cairo")
Pkg.add("DataFrames")
Pkg.add("RDatasets")
using DataFrames
using RDatasets

# Fetch the "iris" table from the "datasets" collection.
iris = dataset("datasets", "iris")
# DataFrames deal with missing data smartly.
# Baseline first: a plain vector with nothing missing.
v = [0.5, 0.6, 0.7, 0.9]
mean(v)

# NA is a type like Null: it poisons other values, the way NaN does for
# floating-point numbers.
1 + NA
1 > NA
typeof(NA)
isna(NA)     # predicate for testing whether a value is NA
NA == NA     # compare with the NaN case on the next line
NaN == NaN   # false: NaN never compares equal to itself
NA == 1
# v = [0.5, 0.6, 0.7, NA, 0.9]  # can't do this: use a DataArray to store Ts and NAs side by side
# dv = DataArray([1,2,3])
# You can add NAs *after* you create the DataArray, not initialise with them.
# NOTE(review): the @data literal below does start with an NA, so this
# restriction presumably applies to the DataArray(...) constructor only — confirm.
dv = @data([NA,1,2,3])
dv[1] = NA                # assigning NA to an element (element 1 already is NA here)
join(dv, "::")
mean(dv)                  # dv still contains an NA at this point
mean(dropna(dv))          # drop the NAs first
mean(array(dv, 1))        # substituting 1 for all NAs
2dv                       # scalar multiplication via numeric-literal juxtaposition
# Convert a DataArray into an Array.
convert(Array, dropna(dv))
# A bunch of convenience constructors coming from the MATLAB heritage.
DataArray(zeros(Float64, 8))
ones(Float64, 10)
falses(10)
trues(6)
eye(3)    # identity matrix
diagm(4)  # diagonal matrix

# Use the @data macro to make DataArrays.
foo = @data(["John Smith", "Jane Doe"])
@data(ones(10))
# MATRIX MATH
# Start from a 10x10 identity wrapped in a DataArray, then knock out one entry.
a = DataArray(eye(10))
a[5,5] = NA
a = a*a
print(a)
print(a.na)  # this is a separate matrix keeping track of the NAs
# Singular value decomposition.
svd(a)
# Fast Fourier transform...
# fft(zeros(10))
# Dealing with Heterogeneous Data
# (each column might have a different type)
# it's like a very simple implementation of a relational database
# Plain vectors: slicing with ranges and membership tests.
# `collect` turns the range into a 5-element vector on every Julia version.
# The bare `[1:5]` literal only did that on old releases; on current ones it
# builds a 1-element vector holding the range, which breaks the lines below.
a = collect(1:5)
a[1:3]     # first three elements
a[1:end]   # `end` is the last index, so this is the whole vector
# a[0:3]   # would fail: arrays are 1-indexed!
in(10, a)  # 10 is not one of 1..5
in(2, a)   # 2 is
# A and B are our columns.
df = DataFrame(A = 1:4, B=["M", "F", "F", "M"])
df[:A]
size(df[:B])
size(df)
(4,2)[1]              # a size is a tuple; index it to pick one dimension
nrows = size(df, 1)   # ...or ask for the dimension directly
ncols = size(df, 2)
head(df,2)            # look at the first two rows
tail(df, 2)           # look at the last two rows

# Get columns.
df[1]                 # same as df[:A]
df[:A] == df[1]
mean(df[:A])

# Get rows.
df[2, :]

describe(df)
# Excerpt of the printed summary (apparently for the string column B):
# Type ASCIIString
# NAs 0
# NA% 0.0%
# Unique 2
cumsum(df[:A])
# Apply a function to each column (df is rebuilt with two numeric columns).
df = DataFrame(A = 1:4, B = 5:8)
colwise(cumsum, df)

# Reading CSV etc.: see http://dataframesjl.readthedocs.org/en/latest/io.html
# df = readtable("data.csv")
# Writing to CSV:
# writetable("output.csv", df)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment