Matt Shirley mdshw5

Data Scientist @Novartis, CSO Sturge-Weber Foundation. Oncology and Rare Disease Research [email protected]

mdshw5 / answer.py

Created March 10, 2016 17:22

biostars 75693

	import simplesam
	with simplesam.Reader(open('input.sam')) as sam:
	with simplesam.Writer(open('output.sam', 'w'), sam.header) as fixed:
	for read in sam:
	if bool(read.flag & 0x800):
	read.flag -= 0x800
	if not read.secondary:
	read.flag += 0x100
	fixed.write(read)

mdshw5 / answer.py

Last active February 20, 2016 12:39

biostars 177781

	import pyfaidx

	with pyfaidx.Fasta("file.fa") as peg_fasta:
	with open("file.sorted.fa", 'w') as sorted_fasta:
	for id in sorted(peg_fasta.keys()):
	sorted_fasta.write('>' + peg_fasta[id].long_name)
	sorted_fasta.write(str(peg_fasta[id]))

mdshw5 / answer.py

Last active January 28, 2016 03:20

biostars 174423

	import pyfaidx
	import simplesam

	with pyfaidx.Fasta('genome.fasta') as fasta, simplesam.Reader(open('orfH9_offtarget.sam')) as sam:
	for read in sam:
	if read.mapped and not read.secondary:
	# grab the reference sequnce from indexed FASTA
	zero_index = read.pos - 1
	sequence_chunk = fasta[read.rname][zero_index:zero_index+20]
	# or get the reference sequence from the SAM MD tag

mdshw5 / answer.py

Last active January 19, 2016 14:35

biostars 173201

	from simplesam import Reader
	from pyfaidx import Fasta

	with Reader(open('library.bam', 'r')) as sam_file, Fasta('hg38.fa', as_raw=True) as hg38:
	for read in sam_file:
	if read.mapped:
	# might also want to handle read.reverse here
	prior_pos = read.pos - 2 # read.pos is 1-based
	prior_base = hg38[read.rname][prior_pos]

mdshw5 / answer.py

Created January 14, 2016 03:07

biostars 172641

	from pyfaidx import Fasta

	genes = Fasta('Genome.fasta')

	with open('chr02_18s', 'w') as f:
	seqFile = genes['chr02'][146062:148216]
	f.write('>' + seqFile.name)
	f.write(seqFile.seq) # or str(seqFile)

mdshw5 / answer.py

Last active December 16, 2015 01:53

biostars 169723

	from pyfaidx import Fasta

	file_1 = {}
	with open('file1.txt', 'r') as ids:
	for line in ids:
	key, value = line.strip().split('\t')
	file_1[key] = value

	file_2 = Fasta('file2.fa', key_function=lambda key: file_1[key])

mdshw5 / answer.py

Created November 11, 2015 03:00

biostars 165335

	from pyfaidx import Fasta

	name_map = {}
	with open('newnames.txt') as newnames:
	next(newnames) # remove header
	for line in newnames:
	old, new = line.rstrip().split()
	name_map[old] = new

	with open('seqnew.fa', 'w') as new_fasta:

mdshw5 / answer.py

Created August 17, 2015 22:54

Biostars 154785

	import random
	import sys
	from pyfaidx import Fasta

	n = 10
	faa = Fasta("file.faa")
	for sample in random.sample(faa, n):
	print(sample.name)
	for line in sample:
	print(line)

mdshw5 / pileup.py

Created May 20, 2015 02:04

Simple pileup-like script

	#!/usr/bin/env python

	import sys
	import argparse
	import pkg_resources
	from collections import deque
	from collections import Counter
	try:
	from collections import OrderedDict
	except ImportError: #python 2.6

mdshw5 / example.py

Created May 11, 2015 17:24

biostars 141739

	from pyfaidx import FastaVariant

	consensus = FastaVariant('reference.fasta', 'sample1.vcf.gz', het=True, hom=True)
	chrom = 'chr1'

	seq = consensus[chrom][0:8]
	print(seq) # AGTGCG

	# if you don't want to invariant sites masked, you're good to go. otherwise: