Created
October 20, 2013 13:22
-
-
Save crazyhottommy/7069590 to your computer and use it in GitHub Desktop.
ChIP-seq TSS
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#! /usr/bin/env python | |
# group the genes according to expression level | |
# analyze RNAseq data by counting tags for each gene using HTSeq.scripts.count or use bedtools muticov | |
# it genrates a file (K562_htseq_count.out.clean) with two columns, column 1 are gene names, column 2 are | |
#counts that mapped to all the exons of the same gene. | |
# compare the counts from different methods! and visualize them in IGV browser. | |
# top 30% midum 30% and low 30% gene names were obtained by linux command line | |
# sort -k2 -nrs K562_htseq_count.out.clean | wc -l | |
# sort -k2 -nrs K562_htseq_count.out.clean | head -n7992 > top30_percent.txt | |
# note that the counts are orignial raw data which need to be normalized, but for this simple test, the | |
# counts are proportional to the expression levles especially in the same sample. | |
# to compare gene expression from different samples, need to take account in the library size, the | |
# depth of sequencing and the length of genes.(longer genes have more counts by chance) | |
# to calculate the differential expression of genes, use the bioconductor(R) package DEseq which was written | |
# by Simon Andrew, the same guy wrote the HTSeq!(Seqmonk is written by hime also...) or differential exon usage by DEXseq package. | |
# samtools was used to prepare sorted and indexed bam file. bam file was converted to sam file by samtools also. | |
# HTSeq.scripts.count only takes sam file as input. | |
# 03/12/13 modified for shell use | |
import sys | |
import os | |
Usage='''this program takes in five input files, sortedbamfile, | |
GFF file and grouped gene lists. it generates the transcription factor | |
binding or histone modification profiling around the TSS in a genome wide scale. | |
How to use: | |
in a shell type | |
HTSeq_TSS4.py sorted.bam hg19.gtf top30_gene.txt medium30_gene.txt low30_gene.txt | |
''' | |
if len(sys.argv) <2 : | |
print Usage | |
else: | |
sortedbamfile=sys.argv[1] | |
gtf_file=sys.argv[2] | |
top30_genes=sys.argv[3] | |
medium30_genes=sys.argv[4] | |
low30_genes=sys.argv[5] | |
def TSS_Profile(ifile1,ifile2,ifile3): | |
'''read in three files, ifile1 is the sortedbamfile prepared by samtool | |
ifile2 is the GFF file, ifile3 is the grouped gene list''' | |
import sys | |
import HTSeq | |
import numpy | |
import itertools | |
sortedbamfile=HTSeq.BAM_Reader(ifile1) | |
gtffile=HTSeq.GFF_Reader(ifile2) | |
group_genes=open(ifile3) | |
halfwinwidth=3000 | |
fragmentsize=200 | |
gtf_dict={} #make a dictionary, key is the gene_name, value is the iv.start_d_as_pos | |
for feature in gtffile: | |
gtf_dict[feature.name]=feature.iv.start_d_as_pos | |
tsspos=set() #extract TSS for each group of genes | |
for line in group_genes: | |
linelist=line.split() | |
try: | |
tsspos.add(gtf_dict[linelist[0]]) | |
except: | |
continue #in case the key is not in the gtf file | |
# highexpr=set() | |
#for line in open(highexpression): | |
# highexpr.add( line.split()[0] ) | |
# for feature in gtf_file: | |
# if feature.name in highexpr: | |
# do_something() | |
#As you can see, using a container to store the information from one file in memory | |
#allows you to process your two files separately and so avoid the nested loop. This, | |
# of course is a very general and basic design pattern that you will encounter very often. | |
#(The different kinds of data containers that programming languages offer and how and when | |
#to use them is probably the next thing you need to study to further your programming skills.) | |
for tss in itertools.islice(tsspos,10): | |
sys.stderr.write("first 10 transcription start sites in %s are %s \n" %(ifile3,tss)) | |
profile=numpy.zeros(2*halfwinwidth, dtype='i') | |
for p in tsspos: | |
try: | |
window=HTSeq.GenomicInterval(p.chrom, p.pos-halfwinwidth-fragmentsize,p.pos+halfwinwidth+fragmentsize,".") | |
for almnt in sortedbamfile[window]: | |
almnt.iv.length=fragmentsize | |
if p.strand=="+": | |
start_in_window=almnt.iv.start- p.pos +halfwinwidth | |
end_in_window =almnt.iv.end - p.pos +halfwinwidth | |
else: | |
start_in_window=p.pos+halfwinwidth-almnt.iv.end | |
end_in_window =p.pos+halfwinwidth-almnt.iv.start | |
start_in_window=max(start_in_window,0) | |
end_in_window=min(end_in_window, 2*halfwinwidth) | |
if start_in_window >= 2*halfwinwidth or end_in_window <0: | |
continue | |
profile[start_in_window : end_in_window] +=1 | |
except: | |
continue | |
return profile | |
halfwinwidth=3000 | |
profile1=TSS_Profile(sortedbamfile,gtf_file,top30_genes) | |
profile2=TSS_Profile(sortedbamfile,gtf_file,medium30_genes) | |
profile3=TSS_Profile(sortedbamfile,gtf_file,low30_genes) | |
from matplotlib import pyplot | |
import numpy | |
line1=pyplot.plot(numpy.arange(-halfwinwidth, halfwinwidth), profile1, color="red",label="high") | |
line2=pyplot.plot(numpy.arange(-halfwinwidth, halfwinwidth), profile2, color="blue",label="medium") | |
line3=pyplot.plot(numpy.arange(-halfwinwidth, halfwinwidth), profile3, color="green",label="low") | |
pyplot.legend() | |
pyplot.xlabel("distance related to TSS bp") | |
pyplot.ylabel("tag density") | |
pyplot.title("%s enrichment around TSS in different expression gene groups" %sortedbamfile) | |
pyplot.show() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment