Ming Tang crazyhottommy

compare:

cat raw_expression.txt | head
cat raw_expression.txt | awk -F"\t" '{if(NR==1) $1="NAME"FS$1}1' OFS="\t" | head

$1 is the first field for each line.
$1="NAME" FS $1 assigns NAME to the first field and seperate by a field seprator (FS), which is a tab.
1 is always TRUE, so awk prints out the rest of the lines.

then, add a second column with a dummy "na":

	#The X axis is -3 kb to 3 kb around TSS.

	#It turns out that the heatmap.2 function use default hclust ( Hierachical Clustering) to cluster the #matrix.
	#alternatively, we can use K-means clustering to cluster the data and to see what's the pattern look like.

	km<- kmeans(m1,2) # determine how many cluster you want, I specify 2 here

	m1.kmeans<- cbind(m1, km$cluster) # combine the cluster with the matrix

	dim(m1.kmeans)

	m.row.sum<- cbind(m1, rowSums(m1))
	o1<- rev(order(m.row.sum[,602]))

	m.row.sum<- m.row.sum[o1,]

	bk = unique(c(seq(-0.1,3, length=100),seq(3,10.35,length=100)))

	hmcols<- colorRampPalette(c("white","red"))(length(bk)-1)

	pheatmap( m.row.sum[,1:601], cluster_rows = F, cluster_cols = F, col= hmcols, breaks = bk, legend=FALSE, show_rownames=FALSE, show_colnames=FALSE)

	#! /bin/bash

	# this script is used to decompose vcf file and normalize the vcf file
	# see here https://gemini.readthedocs.org/en/latest/index.html
	# and load it to gemini database

	# put the following three lines to every bash script to catch errors
	set -e
	set -u
	set -o pipefail -o errexit -o nounset

	# output of mutect run for each chromosome
	# the first line is the mutect version, the second line is the header
	# keep the header and filter mutect results based on 10th and 35th columns.

	for file in *.call.txt
	do (sed -n '2p' $file; awk -F "\t" '$10=="COVERED" && $35=="KEEP"' $file) > ./keep/"${file%txt}"keep.txt
	done

	for i in {1..22} X Y
	do
	same=$(comm -12 <(cut -f1,2,4,5 keep_original_bam/TCGA-06-0125_$i.call.keep.txt \| sort) <(cut -f1,2,4,5 keep_realigned_bam/TCGA-06-0125_$i.call.keep.txt\| sort) \| wc -l \| tr -d " ")
	original=`wc -l keep_original_bam/TCGA-06-0125_$i.call.keep.txt \| awk '{print $1}'`
	realigned=`wc -l keep_realigned_bam/TCGA-06-0125_$i.call.keep.txt \| awk '{print $1}'`
	printf "chromosome $i, original_bam:$original, realigned_bam:$realigned, common:$same \n"
	done

	## imagine we have a file with one line header, and we want to keep the header after sorting
	## use subshells http://bash.cyberciti.biz/guide/What_is_a_Subshell%3F
	(sed -n '1p' your_file; cat your_file \| sed '1d' \| sort) > sort_header.txt

	## if you have two header lines and want to keep both of them:
	(sed -n '1,2p' your_file; cat your_file \| sed '1,2d' \| sort) > sort_header.txt

	## if you have many lines starting with "#" as header, like vcf files
	(grep "^#" my_vcf; grep -v "^#" my_vcf \| sort -k1,1V -k2,2n) > sorted.vcf

	#! /bin/bash

	### This script is used to filter the vcf file produced by lumpy/speedseq

	set -e
	set -u
	set -o pipefail

	function usage() {
	echo "

	chmod a+x vcfsort.sh
	vcfsort.sh trio.trim.vep.vcf.gz