Created October 11, 2021 18:07
{
"cells": [
{
"cell_type": "markdown",
"id": "cc520fd5",
"metadata": {},
"source": [
"# Running Julia + Flux.jl on PC Cluster" | |
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "9d728dae",
"metadata": {},
"outputs": [],
"source": [
"using Revise"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "0da985a9",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Julia Version 1.6.0\n",
"Commit f9720dc2eb (2021-03-24 12:55 UTC)\n",
"Platform Info:\n",
"  OS: Linux (x86_64-pc-linux-gnu)\n",
"  CPU: Intel(R) Xeon(R) Gold 6130 CPU @ 2.10GHz\n",
"  WORD_SIZE: 64\n",
"  LIBM: libopenlibm\n",
"  LLVM: libLLVM-11.0.1 (ORCJIT, skylake-avx512)\n"
]
}
],
"source": [
"versioninfo()"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "2a6d68a2",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Tue Oct 12 02:58:17 2021 \n",
"+-----------------------------------------------------------------------------+\n",
"| NVIDIA-SMI 440.64.00 Driver Version: 440.64.00 CUDA Version: 11.3 |\n",
"|-------------------------------+----------------------+----------------------+\n",
"| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |\n",
"| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |\n",
"|===============================+======================+======================|\n",
"| 0 Tesla P100-PCIE... On | 00000000:5E:00.0 Off | 0 |\n",
"| N/A 25C P0 26W / 250W | 0MiB / 16280MiB | 0% Default |\n",
"+-------------------------------+----------------------+----------------------+\n",
"| 1 Tesla P100-PCIE... On | 00000000:AF:00.0 Off | 0 |\n",
"| N/A 27C P0 24W / 250W | 0MiB / 16280MiB | 0% Default |\n",
"+-------------------------------+----------------------+----------------------+\n",
" \n",
"+-----------------------------------------------------------------------------+\n",
"| Processes: GPU Memory |\n",
"| GPU PID Type Process name Usage |\n",
"|=============================================================================|\n",
"| No running processes found |\n",
"+-----------------------------------------------------------------------------+\n"
]
},
{
"data": {
"text/plain": [
"Process(`\u001b[4mnvidia-smi\u001b[24m`, ProcessExited(0))"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"run(`nvidia-smi`)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "25e3dfc0",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" total used free shared buff/cache available\n",
"Mem: 125Gi 7.3Gi 106Gi 74Mi 11Gi 117Gi\n",
"Swap: 63Gi 2.0Mi 63Gi\n"
]
},
{
"data": {
"text/plain": [
"Process(`\u001b[4mfree\u001b[24m \u001b[4m-h\u001b[24m`, ProcessExited(0))"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"run(`free -h`)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "bb930569",
"metadata": {},
"outputs": [],
"source": [
"using Flux, Statistics\n",
"using Flux.Data: DataLoader\n",
"using Flux: onehotbatch, onecold, @epochs\n",
"using Flux.Losses: logitcrossentropy\n",
"using Base: @kwdef\n",
"using CUDA\n",
"using MLDatasets"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "acf7e2c1",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"v\"11.3.0\""
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"CUDA.version()"
]
},
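{
"cell_type": "markdown",
"id": "added-gpu-note",
"metadata": {},
"source": [
"The `nvidia-smi` output above shows two Tesla P100s. The next cell is a small added sketch, not part of the original run: it lists the GPUs CUDA.jl can see and shows how one could be pinned explicitly with `CUDA.device!`. Picking device 0 is only an illustrative assumption; CUDA.jl defaults to it anyway."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "added-gpu-check",
"metadata": {},
"outputs": [],
"source": [
"# Added sketch: enumerate the visible GPUs and (optionally) pin work to one of them.\n",
"# Selecting device 0 is an assumption for illustration; CUDA.jl uses it by default.\n",
"for dev in CUDA.devices()\n",
"    println(dev)\n",
"end\n",
"CUDA.device!(0)   # make GPU 0 the active device for this session\n",
"CUDA.device()     # confirm which device is active"
]
},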
{
"cell_type": "code",
"execution_count": 7,
"id": "10ff0e98",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"getdata (generic function with 1 method)"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"function getdata(args, device)\n",
"    ENV[\"DATADEPS_ALWAYS_ACCEPT\"] = \"true\"\n",
"\n",
"    # Loading Dataset\n",
"    xtrain, ytrain = MLDatasets.MNIST.traindata(Float32)\n",
"    xtest, ytest = MLDatasets.MNIST.testdata(Float32)\n",
"\n",
"    # Reshape Data in order to flatten each image into a linear array\n",
"    xtrain = Flux.flatten(xtrain)\n",
"    xtest = Flux.flatten(xtest)\n",
"\n",
"    # One-hot-encode the labels\n",
"    ytrain, ytest = onehotbatch(ytrain, 0:9), onehotbatch(ytest, 0:9)\n",
"\n",
"    # Create DataLoaders (mini-batch iterators)\n",
"    train_loader = DataLoader((xtrain, ytrain), batchsize=args.batchsize, shuffle=true)\n",
"    test_loader = DataLoader((xtest, ytest), batchsize=args.batchsize)\n",
"\n",
"    return train_loader, test_loader\n",
"end"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "eacb500f",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"build_model (generic function with 1 method)"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"function build_model(; imgsize=(28,28,1), nclasses=10)\n",
"    return Chain(Dense(prod(imgsize), 32, relu),\n",
"                 Dense(32, nclasses))\n",
"end"
]
},
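{
"cell_type": "markdown",
"id": "added-param-count-note",
"metadata": {},
"source": [
"A quick added check, not part of the original run: instantiate the model on the CPU and count its trainable parameters. With the 28×28 input flattened to 784 features, the two `Dense` layers contribute 784·32 + 32 + 32·10 + 10 = 25,450 parameters."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "added-param-count",
"metadata": {},
"outputs": [],
"source": [
"# Added sketch: instantiate the model on the CPU and count its trainable parameters.\n",
"# Expected: 784*32 + 32 + 32*10 + 10 = 25450 for the default imgsize and nclasses.\n",
"m = build_model()\n",
"n_params = sum(length, Flux.params(m))\n",
"println(\"trainable parameters: \", n_params)"
]
},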
{
"cell_type": "code",
"execution_count": 9,
"id": "c5cd06a4",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"loss_and_accuracy (generic function with 1 method)"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"function loss_and_accuracy(data_loader, model, device)\n",
"    acc = 0\n",
"    ls = 0.0f0\n",
"    num = 0\n",
"    for (x, y) in data_loader\n",
"        x, y = device(x), device(y)\n",
"        ŷ = model(x)\n",
"        ls += logitcrossentropy(ŷ, y, agg=sum)\n",
"        acc += sum(onecold(ŷ) .== onecold(y))\n",
"        num += size(x)[end]\n",
"    end\n",
"    return ls / num, acc / num\n",
"end"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "d2ebc80d",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Args"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"@kwdef mutable struct Args\n",
"    η::Float64 = 3e-4       # learning rate\n",
"    batchsize::Int = 256    # batch size\n",
"    epochs::Int = 10        # number of epochs\n",
"    use_cuda::Bool = true   # use gpu (if cuda available)\n",
"end"
]
},
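{
"cell_type": "markdown",
"id": "added-args-note",
"metadata": {},
"source": [
"`Base.@kwdef` generates a keyword constructor for `Args`, so hyperparameters can be overridden selectively while the rest keep their defaults. The next cell is a small added example, not executed in the original run."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "added-args-example",
"metadata": {},
"outputs": [],
"source": [
"# Added sketch: @kwdef gives Args a keyword constructor, so hyperparameters\n",
"# can be overridden selectively while the rest keep their defaults.\n",
"default_args = Args()                          # η = 3e-4, batchsize = 256, epochs = 10\n",
"custom_args = Args(batchsize=128, epochs=5)    # override only some fields\n",
"println((default_args.η, custom_args.batchsize, custom_args.epochs))"
]
},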
{
"cell_type": "code",
"execution_count": 11,
"id": "062f1bcc",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"train (generic function with 1 method)"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"function train(; kws...)\n",
"    args = Args(; kws...)   # collect options in a struct for convenience\n",
"\n",
"    if CUDA.functional() && args.use_cuda\n",
"        @info \"Training on CUDA GPU\"\n",
"        CUDA.allowscalar(false)\n",
"        device = gpu\n",
"    else\n",
"        @info \"Training on CPU\"\n",
"        device = cpu\n",
"    end\n",
"\n",
"    # Create test and train dataloaders\n",
"    train_loader, test_loader = getdata(args, device)\n",
"\n",
"    # Construct model\n",
"    model = build_model() |> device\n",
"    ps = Flux.params(model)   # model's trainable parameters\n",
"\n",
"    ## Optimizer\n",
"    opt = ADAM(args.η)\n",
"\n",
"    ## Training\n",
"    for epoch in 1:args.epochs\n",
"        for (x, y) in train_loader\n",
"            x, y = device(x), device(y)   # transfer data to device\n",
"            gs = gradient(() -> logitcrossentropy(model(x), y), ps)   # compute gradient\n",
"            Flux.Optimise.update!(opt, ps, gs)   # update parameters\n",
"        end\n",
"\n",
"        # Report on train and test\n",
"        train_loss, train_acc = loss_and_accuracy(train_loader, model, device)\n",
"        test_loss, test_acc = loss_and_accuracy(test_loader, model, device)\n",
"        println(\"Epoch=$epoch\")\n",
"        println(\"  train_loss = $train_loss, train_accuracy = $train_acc\")\n",
"        println(\"  test_loss = $test_loss, test_accuracy = $test_acc\")\n",
"    end\n",
"end"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "43f6e32f",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"┌ Info: Training on CUDA GPU\n",
"└ @ Main In[11]:5\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Epoch=1\n",
"  train_loss = 0.60862046, train_accuracy = 0.8590833333333333\n",
"  test_loss = 0.59240925, test_accuracy = 0.8683\n",
"Epoch=2\n",
"  train_loss = 0.39855322, train_accuracy = 0.8970833333333333\n",
"  test_loss = 0.38629392, test_accuracy = 0.9002\n",
"Epoch=3\n",
"  train_loss = 0.33082917, train_accuracy = 0.9103333333333333\n",
"  test_loss = 0.32028183, test_accuracy = 0.9126\n",
"Epoch=4\n",
"  train_loss = 0.29524896, train_accuracy = 0.9188\n",
"  test_loss = 0.28696254, test_accuracy = 0.9211\n",
"Epoch=5\n",
"  train_loss = 0.27013686, train_accuracy = 0.9248833333333333\n",
"  test_loss = 0.26412347, test_accuracy = 0.9244\n",
"Epoch=6\n",
"  train_loss = 0.25219896, train_accuracy = 0.9300333333333334\n",
"  test_loss = 0.24762471, test_accuracy = 0.929\n",
"Epoch=7\n",
"  train_loss = 0.2376743, train_accuracy = 0.9342333333333334\n",
"  test_loss = 0.234881, test_accuracy = 0.9335\n",
"Epoch=8\n",
"  train_loss = 0.22610967, train_accuracy = 0.9370833333333334\n",
"  test_loss = 0.22596319, test_accuracy = 0.935\n",
"Epoch=9\n",
"  train_loss = 0.21589372, train_accuracy = 0.9398833333333333\n",
"  test_loss = 0.21695527, test_accuracy = 0.9368\n",
"Epoch=10\n",
"  train_loss = 0.20679379, train_accuracy = 0.9425833333333333\n",
"  test_loss = 0.21015991, test_accuracy = 0.9403\n",
"129.410761 seconds (178.31 M allocations: 14.060 GiB, 3.17% gc time, 30.96% compilation time)\n"
]
}
],
"source": [
"### Run training\n",
"@time train()\n",
"# train(η=0.01) # can change hyperparameter"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Julia 1.6.0",
"language": "julia",
"name": "julia-1.6"
},
"language_info": {
"file_extension": ".jl",
"mimetype": "application/julia",
"name": "julia",
"version": "1.6.0"
}
},
"nbformat": 4,
"nbformat_minor": 5
}