Skip to content

Instantly share code, notes, and snippets.

View philippmuench's full-sized avatar

Philipp Münch philippmuench

View GitHub Profile
#!/usr/bin/env Rscript
# Script to find exact matches between evaluation and training datasets
# Identifies which samples from evaluation set are identical to samples in training set
# Load necessary libraries
if (!requireNamespace("Matrix", quietly = TRUE)) {
install.packages("Matrix")
}
library(Matrix)
@philippmuench
philippmuench / LLaMA.R
Created October 15, 2024 23:03 — forked from t-kalinowski/LLaMA.R
LLaMA implemented in R Tensorflow and Keras
## Setup
Sys.setenv(CUDA_VISIBLE_DEVICES='')
options(tensorflow.extract.warn_tensors_passed_asis = FALSE)
library(dplyr, warn.conflicts = FALSE)
library(purrr)
library(glue)
library(envir)
library(tensorflow)
@philippmuench
philippmuench / download_ref_from_list.py
Created October 14, 2024 12:21
download genomes from NCBI
import os
import hashlib
from pathlib import Path
import requests
import logging
from colorama import Fore, Style, init
import gzip
import shutil
import time
import random
@philippmuench
philippmuench / Snakefile
Last active September 3, 2024 10:25
Snakemake file for gene list generation for the Muench et al., 2024 manuscript
import os
import random
# Get a list of all FASTA files in the fasta folder
FASTA_FILES, = glob_wildcards("fasta/{fasta_file}.fasta")
rule all:
input:
expand("gff/{fasta_file}.gff", fasta_file=FASTA_FILES),
expand("reformatted_gff_shuffled/{fasta_file}.gff", fasta_file=FASTA_FILES),
@philippmuench
philippmuench / check_fasta.py
Created July 13, 2023 14:06
check for malformed file
import os
import argparse
import random
def is_fasta(filename):
try:
with open(filename, 'r') as f:
first_line = f.readline().strip()
if not first_line:
return 'empty'
## app.R ##
library(shinydashboard)
library(shiny)
library(keras)
library(deepG)
library(ggplot2)
library(dplyr)
library(DT)
library(hdf5r)
library(plotly)
---
title: "GenomeNet Viewer"
output:
flexdashboard::flex_dashboard:
orientation: rows
social: menu
theme: united #cerulean
source_code: embed
runtime: shiny
---
@philippmuench
philippmuench / gist:6b9bbb9f9f987ab22efb573f9f19160f
Created August 3, 2020 13:39
train for wavenet binary target
#' @title Trains a (mostly) LSTM model on genomic data. Designed for developing genome based language models (GenomeNet)
#'
#' @description
#' Depth and number of neurons per layer of the network can be specified. The first layer can be a Convolutional Neural Network (CNN) that is designed to capture codons.
#' If a path to a folder where FASTA files are located is provided, batches will be generated using an external generator, which
#' is recommended for big training sets. Alternatively, a dataset can be supplied that holds the preprocessed batches (generated by \code{preprocessSemiRedundant()})
#' and keeps them in RAM. Also supports training on instances with multiple GPUs and scales linearly with the number of GPUs present.
#' @param train_type Either "lm" for language model, "label_header" or "label_folder". Language model is trained to predict next character in sequence.
#' label_header/label_folder are trained to predict a corresponding class, given a sequence as input. If "label_header", class will be read from f
trainMinimalFunctionalAPI <- function(path = "example_files/fasta") {
library(wavenet)
message("Initialize model! This can take a few minutes.")
maxlen <- 1000
input <- keras::layer_input(batch_shape = c(64, maxlen, 6))
# https://github.com/ibab/tensorflow-wavenet/blob/master/wavenet/ops.py#L46
first <- keras::layer_conv_1d(
trainMinimalFunctionalAPI <- function(path = "example_files/fasta") {
message("Initialize model! This can take a few minutes.")
input <- keras::layer_input(batch_shape = c(256, 50, 6))
cnn <-
keras::layer_conv_1d(
object = input,
kernel_size = 3,