Skip to content

Instantly share code, notes, and snippets.

View michaelbarton's full-sized avatar

Michael Barton michaelbarton

View GitHub Profile
@michaelbarton
michaelbarton / transform.rb
Created October 13, 2010 18:20
Hard coded script to produce genbank file for a genome from IMG output
# Produce some kind of genbank file output
#!/usr/bin/env ruby
require 'bio'
require 'fastercsv'
proteins = Bio::FlatFile.auto('annotation/proteins.faa').inject({}) do |h,p|
h[p.definition.split.first] = p.seq
h
end
#!/usr/bin/Rscript
rm(list=ls())
library(plyr)
# Observed data. Taken from a normal distribution with
# mean 0, and s.d. 2.5
observations = read.csv('distribution.txt',row.names=1)
-
sequence:
source: "contig00001"
-
unresolved:
length: 300
# Scaffold00003 appears to have a non-matching 6,000bp sequence at start
-
sequence:
source: "scaffold00003"
#!/usr/bin/env Rscript
library(ggplot2)
data <- data.frame(
strain = c("R124","Pf0-1","SBW25","Pf-5"),
size = c(4762173,6438405,6722539,7074893)
)
sizes <- factor(1:4)
levels(sizes) <- c("Pf-5","SBW25","Pf0-1","R124")
@michaelbarton
michaelbarton / convert_prodigal_to_yml.rb
Created March 19, 2010 20:17
Parse prodigal gene prediction output
require 'rubygems'
require 'bio'
require 'yaml'
locations = Hash.new{|k,v| k[v] = []}
Bio::FlatFile.open(Bio::GenBank, ARGV[0]).each do |sequence|
sequence.each_cds do |cds|
cds.locations.each do |location|
locations[sequence.definition] << {
'start' => location.from,
@michaelbarton
michaelbarton / print_gap_regions.rb
Created March 2, 2010 21:54
Find gap regions containing Ns in fasta files
#!/usr/bin/env ruby
require 'rubygems'
require 'bio'
def find_gaps(file,buffer=200)
re = Regexp.compile("[ATGC]{#{buffer}}N+[ATGC]{#{buffer}}")
return Bio::FastaFormat.open(file).inject(Hash.new([])) do |hash,entry|
hash[entry.definition.split.first] = entry.seq.scan(re)
hash
#! /usr/bin/Rscript
library(ggplot2)
data = read.csv(file='genome_size.csv',header=T,sep=',')
strains <- as.ordered(1:5)
levels(strains) <- c("KY485","R124","Pf-5","SBW25","Pf0-1")
data$isolate <- factor(data$isolate,strains)
types <- as.ordered(1:2)
+------------+---------+-----------+---------------+
| gc_content | length | n_content | sequence |
+------------+---------+-----------+---------------+
| 60.5 | 1954152 | 1.5 | scaffold00003 |
| 59.8 | 1496925 | 1.0 | scaffold00007 |
| 60.2 | 991882 | 0.7 | scaffold00004 |
| 61.0 | 799428 | 0.2 | scaffold00002 |
| 61.2 | 651550 | 0.3 | scaffold00006 |
| 60.2 | 334801 | 0.7 | scaffold00001 |
| 53.7 | 43794 | 0.0 | scaffold00008 |
require 'open-uri'
FILE = 'SBW25.fna'
URL = "ftp://ftp.ncbi.nih.gov/genbank/genomes/Bacteria/Pseudomonas_fluorescens_SBW25/AM181176.fna"
READS = [200_000]
N_SIMULATIONS = 5
METASIM = 'metasim'
desc "Fetches SBW25 genome sequence"
library(ggplot2)
data <- read.csv("n50.csv")
plot <- ggplot(data, aes(x=read_per_genome/1000,y=n50/1000)) +
geom_line() +
scale_x_continuous("Reads per 6.9 MB genome (000s)") +
scale_y_continuous("N50 contig size (KBp)")
postscript("out/n50.eps",width=10,height=6,onefile=FALSE,horizontal=FALSE, paper = "special",colormodel="rgb")
print(plot)