Skip to content

Instantly share code, notes, and snippets.

View BenLangmead's full-sized avatar

Benjamin Langmead BenLangmead

View GitHub Profile
@BenLangmead
BenLangmead / DeBruijn.py
Last active September 26, 2023 15:39
Demonstration of de Bruijn graph construction and Eulerian path/cycle finding.
class DeBruijnGraph:
""" A de Bruijn multigraph built from a collection of strings.
User supplies strings and k-mer length k. Nodes of the de
Bruijn graph are k-1-mers and edges correspond to the k-mer
that joins a left k-1-mer to a right k-1-mer. """
@staticmethod
def chop(st, k):
""" Chop a string up into k mers of given length """
for i in xrange(0, len(st)-(k-1)):
def neighbors1mm(kmer, alpha):
    """ Generate all neighbors at Hamming distance 1 from kmer.

    kmer is the string to mutate; alpha is an iterable of the characters
    that may be substituted in.  Positions are visited from the last
    character of kmer back to the first, and at each position every
    character of alpha that differs from the one already there is
    substituted, in alpha's iteration order.  Each neighbor is yielded
    as a new string; kmer itself is never yielded. """
    # range (rather than the Python-2-only xrange) keeps this generator
    # usable on Python 3 as well; the right-to-left order is unchanged.
    for j in range(len(kmer) - 1, -1, -1):
        for c in alpha:
            if c == kmer[j]:
                # Same character as the original: not a mismatch, skip it.
                continue
            yield kmer[:j] + c + kmer[j+1:]
def correct1mm(read, k, kmerhist, alpha, thresh):
""" Return an error-corrected version of read. k = k-mer length
for k-mer count profile. kmerhist maps distinct kmers to their
import re
import sys
import string
import numpy
import math
def fastaKmerParser(fns, k):
""" Parse all (overlapping) k-mers of length k from all the FASTA
filenames provided. """
for fn in fns:
@BenLangmead
BenLangmead / HMM.py
Last active December 16, 2015 05:38
import numpy
class HMM(object):
""" Simple Hidden Markov Model implementation. User provides
transition, emission and initial probabilities as dictionaries.
Transition and emission probabilities are expressed as a dict
mapping a 2-character code onto the floating-point probability
for that table entry. States and emissions are represented
with single characters. """
@BenLangmead
BenLangmead / gtf_flux_fix.py
Created September 2, 2013 15:50
Transforms a genes.gtf file from an iGenomes tarball into a new .gtf file formatted more like Flux Simulator expects.
"""
gtf_flux_fix.py
Author: Ben Langmead ([email protected])
Date: Sept 2, 2013
Remove GTF records for sequence IDs that don't appear in any of the specified
FASTA files. Move the transcript_id attribute into the first attribute
position. Output transformed GTF to stdout.
"""
{
"metadata": {
"name": "CG_001_StringBasics1"
},
"nbformat": 3,
"nbformat_minor": 0,
"worksheets": [
{
"cells": [
{
{
"metadata": {
"name": "CG_002_NaiveMatching1"
},
"nbformat": 3,
"nbformat_minor": 0,
"worksheets": [
{
"cells": [
{
{
"metadata": {
"name": "003_CG_InvertedIndex1"
},
"nbformat": 3,
"nbformat_minor": 0,
"worksheets": [
{
"cells": [
{
{
"metadata": {
"name": "004_CG_InvertedIndex2"
},
"nbformat": 3,
"nbformat_minor": 0,
"worksheets": [
{
"cells": [
{
{
"metadata": {
"name": "CG_InvertedIndex2"
},
"nbformat": 3,
"nbformat_minor": 0,
"worksheets": [
{
"cells": [
{