Francisco Lima monogenea

Data scientist and blogger

monogenea / 2-umap.py

Last active October 10, 2020 13:14

	# Peek at the first five columns of matrix.csv (notebook shell magic, left disabled)
	#!cut -d, -f-5 matrix.csv | head

	# Load the matrix through a shell pipeline: `cut` discards the first column
	# before datatable parses the stream, and the columns are requested as int32
	matrix = dt.fread(
		cmd='cut -d, -f2- matrix.csv',
		sep=',',
		header=True,
		columns=dt.int32,
	)  # ~7 GB (76533, 50281)
	# Load the accompanying metadata table with pandas
	metadata = pd.read_csv('metadata.csv')

monogenea / 1-umap.py

Last active October 10, 2020 13:09

monogenea / 4-audioClass.R

Created April 6, 2020 05:55

	# read, downsample, clip, mel spec, normalize and remove noise
	melspec <- function(x, start, end){
	mp3 <- readMP3(filename = x) %>%
	extractWave(xunit = "time",
	from = start, to = end)

	# return log-spectrogram with 256 Mel bands and compression
	sp <- melfcc(mp3, nbands = 256, usecmp = T,
	spec_out = T,
	hoptime = (end-start) / 256)$aspectrum

monogenea / 12-audioClass.R

Last active April 6, 2020 05:53

	# Score the held-out test set with the fitted model
	predXProb <- predict(model, test$X)
	# For every row, pick the column holding the largest value and translate
	# that column index into its species label (predictions, then ground truth)
	predXClass <- speciesClass[apply(predXProb, MARGIN = 1, FUN = which.max)]
	trueXClass <- speciesClass[apply(test$Y, MARGIN = 1, FUN = which.max)]

	# Cross-tabulate predicted vs. true labels; both factors share the full
	# set of levels so the resulting table covers every species
	confMatTest <- confusionMatrix(
		data = factor(predXClass, levels = speciesClass),
		reference = factor(trueXClass, levels = speciesClass)
	)

	pheatmap(confMatTest$table, cluster_rows = F, cluster_cols = F,

monogenea / 11-audioClass.R

Last active April 6, 2020 05:53

	# Build species labels from the column names of train$Y by deleting every
	# occurrence of the substring "species". Argument names are written in full
	# instead of relying on partial matching (`pat`, `rep`), which is fragile
	# and flagged by linters.
	speciesClass <- gsub(pattern = "species", replacement = "",
		x = colnames(train$Y))
	# Colour-ramp function for heatmaps: the 7-colour "RdGy" Brewer palette, reversed
	cols <- colorRampPalette(rev(brewer.pal(n = 7, name = "RdGy")))

	# Validation predictions
	predProb <- predict(model, val$X)
	# Row-wise arg-max converts each row into a species label
	# (predicted from predProb, true from val$Y)
	predClass <- speciesClass[apply(predProb, 1, which.max)]
	trueClass <- speciesClass[apply(val$Y, 1, which.max)]

	# Plot confusion matrix

monogenea / 10-audioClass.R

Last active April 6, 2020 05:53

	# Show the layer-by-layer summary of the model
	summary(model)
	# Configure training: Adam with learning-rate decay, categorical
	# cross-entropy loss, accuracy reported as the metric
	compile(model,
		optimizer = optimizer_adam(decay = 1e-5),
		loss = "categorical_crossentropy",
		metrics = "accuracy")

	# Train for 50 epochs in mini-batches of 16, evaluating on the validation
	# split after each epoch; the returned training history is kept
	history <- model %>%
		fit(x = train$X, y = train$Y,
			batch_size = 16, epochs = 50,
			validation_data = list(val$X, val$Y))

monogenea / 9-audioClass.R

Last active April 6, 2020 05:54

	# Build model
	model <- keras_model_sequential() %>%
	layer_conv_2d(input_shape = dim(train$X)[2:4],
	filters = 16, kernel_size = c(3, 3),
	activation = "relu") %>%
	layer_max_pooling_2d(pool_size = c(2, 2)) %>%
	layer_dropout(rate = .2) %>%

	layer_conv_2d(filters = 32, kernel_size = c(3, 3),
	activation = "relu") %>%

monogenea / 8-audioClass.R

Last active April 6, 2020 05:54

	# Fri Feb 7 15:49:46 2020 ------------------------------
	# Session setup: working directory, Keras backend selection, supporting packages.
	# NOTE(review): the working directory is a hard-coded, user-specific path —
	# adjust it before running on another machine.
	setwd("~/Documents/Tutorials/birdsong")
	library(keras)
	# Select the conda environment and the Keras backend, both named "plaidml";
	# presumably these must run before the backend is first used — TODO confirm
	use_condaenv("plaidml")
	use_backend("plaidml")
	# Report the active backend; the author's expected output is noted inline
	k_backend() # plaidml
	# Supporting packages; what each is used for is not visible in this snippet
	library(tidyverse)
	library(caret)
	library(e1071)
	library(pheatmap)

monogenea / 7-audioClass.R

Last active April 6, 2020 05:54

	# Plot spectrogram from random training sample - range is 0-22.05 kHz
	# One index is drawn at random along the first dimension of train$X and the
	# remaining three dimensions of that sample are passed to image().
	# The default axes are switched off with the literal FALSE (not the
	# reassignable alias `F`); presumably custom axes are drawn afterwards.
	image(train$X[sample(dim(train$X)[1], 1),,,],
		xlab = "Time (s)",
		ylab = "Frequency (kHz)",
		axes = FALSE)
	# Reference frequencies in kHz, converted to Hz and mapped onto the mel
	# scale (https://en.wikipedia.org/wiki/Mel_scale)
	freqs <- c(0, 1, 5, 15, 22.05)
	mels <- 2595 * log10(1 + (freqs * 1e3) / 700)
	# Min-max rescale so the positions span 0 to 1 for plotting
	mels <- (mels - min(mels)) / diff(range(mels))

monogenea / 6-audioClass.R

Last active April 6, 2020 05:54

	# Define targets and augment data
	# Indicator (one-hot) matrix built from `species`, without an intercept column
	target <- model.matrix(~0+species)

	# For each split, stack the matching rows of `target` once per repeat.
	# The repeat count is the number of rows along the first dimension of the
	# split's X array divided by its number of files (presumably the
	# augmentation factor — confirm against where the X arrays are built).
	# seq_len() is used instead of 1:n so a zero count yields no copies
	# rather than iterating over c(1, 0).
	# Training rows are everything not in the validation or test indices.
	targetTrain <- do.call("rbind", lapply(seq_len(dim(Xtrain)[1] / length(fnamesTrain)),
		function(x) target[-c(valIdx, testIdx),]))
	targetVal <- do.call("rbind", lapply(seq_len(dim(Xval)[1] / length(fnamesVal)),
		function(x) target[valIdx,]))
	targetTest <- do.call("rbind", lapply(seq_len(dim(Xtest)[1] / length(fnamesTest)),
		function(x) target[testIdx,]))
	# Assemble Xs and Ys