explodecomputer · June 30, 2015 00:41
diff --git a/mouse_geno.sh b/mouse_geno.sh
 #!/bin/bash

 # Objective: Take data/mouse.csv and convert to binary plink format
 # Method: It is currently in SNPs=rows and IDs=columns format. So first convert to tped format and then convert to binary plink format using the plink programme
 # See tped format here:
 # http://pngu.mgh.harvard.edu/~purcell/plink/data.shtml#tr

 # 1. Export excel format to text file
 # - Just export sheet 2 to a Windows format csv file
 # - This means 'comma separated file', e.g. the columns are separated by commas

 # 2. Make tfam file
 # - Take the header and make the following columns:
 # - Family ID, Individual ID, Father ID, Mother ID, sex, phenotype
 # - In this case we will use the ID as the first two columns and set everything else as missing


 # Remove weird carriage return characters from the file
 cat data/mouse.csv | col -b > data/mouse_clean.csv

 # Get the first line | convert commas into newlines | create desired format by excluding the first 3 rows, then printing the IDs in the first 2 columns, 2 columns of zeros, and two columns of -9s. Finally, remove the > from the IDs and save it in a new file called data/mouse.tfam
 head -n 1 data/mouse_clean.csv | tr ',' '\n' | awk '{if (NR > 3) {print $1, $1, "0", "0", "-9", "-9"}}' | sed 's/>//g' > data/mouse.tfam


 # 3. Make tped file
 # - Remove first line
 # - Rearrange columns to be chr | SNP | genetic position | physical position | genotypes

 # Create the SNP information - print the chr (col2), SNP (col1), a column of zeros for the genetic position, and the physical positions (col3)
 # Save it in a temporary file
 awk 'BEGIN {FS = ","} {if(NR > 1) {print $2, $1, "0", $3}}' data/mouse_clean.csv > temp1

 # Print the genotypes by excluding the first row and the first 4 columns, and replacing commas with spaces
 # Save in another temporary file
 cut -d "," -f 4- data/mouse_clean.csv | sed 1d | tr ',' ' ' > temp2

 # Put spaces between the genotype alleles and replace NC with 0 0
 sed "s/AA/A A/g" temp2 | sed "s/AB/A B/g"| sed "s/BB/B B/g" | sed "s/BA/B A/g" | sed "s/NC/0 0/g"> temp3

 # Paste the two temporary files together to make the data/mouse.tped file
 paste -d " " temp1 temp3 > data/mouse.tped

 # Remove temporary files
 rm temp1 temp2 temp3

 # 4. Convert to binary plink
 # Plink can now read in the tped format data and convert it into a standard binary plink format
 # Plink works by using 'flags' to tell it what you want it to do. A flag starts with --. In this case we are saying:

 # --tfile data/mouse     Read in the transposed file format, file name is data/mouse
 #                        Because it knows it is transposed format you don't need to
 #                        specify the actual file endings (.tped and .tfam), it looks
 #                        for them automatically
 # --make-bed             Create a binary plink format file from the transposed file
 # --out data/mouse       Save the new format to this file name. It will create three
 #                        files: .bed, .bim, .fam.
 # --allow-extra-chr      There are a few strange chomosome IDs at the moment. This
 #                        flag tells plink to allow them. Normally it looks for 1-22
 #                        and X, Y, etc. Note - X chr is recoded as 23

 plink --tfile data/mouse --make-bed --out data/mouse --allow-extra-chr


 # Now you have the data in binary plink format. Have a look at it:

 head data/mouse.bim
 # This is the SNP data - it shows chr, SNP, genetic position, physical position, allele1, allele2
 # To find how many SNPs there are:
 wc -l data/mouse.bim

 head data/mouse.fam
 # This is the individual information - it shows family ID, individual ID, father ID, mother ID, sex, phenotype
 # To find how many IDs there are:
 wc -l data/mouse.fam

 # Finally, there is the data/mouse.bed. This encodes the genotypes in compressed format, so you can't read them. If you are moving this data around make sure you move the .bed, .bim and .fam files all together, plink reads all three of them for any one analysis.


 # 5. Obtain sex information
 # Gonna predict the sex using the X chromosome data, but this might not be completely accurate. If you have full information on sex then better to use that data instead of this method
 # - Use plink to infer sex from the X chromosome
 # - Take the inferred sex from column 4 and put it in the .fam file

 # Make a copy of the original fam file
 cp data/mouse.fam data/mouse.fam.nosex

 # Get plink to read in the binary plink format and guess the sex from the X chr
 plink --bfile data/mouse --check-sex --out data/mouse --allow-extra-chr

 # Extract the predicted sex column from the output
 awk '{if(NR > 1) {print $4}}' data/mouse.sexcheck > temp1

 # Add that column to the end of the fam file
 paste -d " " data/mouse.fam temp1 > temp2

 # Recreate the fam file with the correct sex column
 awk '{print $1, $2, $3, $4, $7, $6}' temp2 > data/mouse.fam


 # 6. Remove SNPs with weird chromosome codes

 # This finds all lines that don't start with "1 ", "2 ", ..., "23 " and then uses awk to print the SNP IDs only
 egrep -v '^[1-2]?[0-9]\s' data/mouse.bim | awk '{ print $2}' > data/mouse.removesnps

 # Now use PLINK to read in the binary plink data, exclude the SNPs we identified with weird chromosomes, and save the data without those SNPs in a new file called data/mouse_clean.bed/.bim/.fam

 plink --bfile data/mouse --exclude data/mouse.removesnps --make-bed --out data/mouse_clean --allow-extra-chr


 # 7. Calculate allele frequencies
 # Now you can use PLINK to do various things, such as calculating allele frequencies, or a lot of other analyses

 plink --bfile data/mouse_clean --freq --out data/mouse_clean
	#!/bin/bash

	# Objective: Take data/mouse.csv and convert to binary plink format
	# Method: It is currently in SNPs=rows and IDs=columns format. So first convert to tped format and then convert to binary plink format using the plink programme
	# See tped format here:
	# http://pngu.mgh.harvard.edu/~purcell/plink/data.shtml#tr

	# 1. Export excel format to text file
	# - Just export sheet 2 to a Windows format csv file
	# - This means 'comma separated file', e.g. the columns are separated by commas

	# 2. Make tfam file
	# - Take the header and make the following columns:
	# - Family ID, Individual ID, Father ID, Mother ID, sex, phenotype
	# - In this case we will use the ID as the first two columns and set everything else as missing


	# Remove weird carriage return characters from the file
	cat data/mouse.csv \| col -b > data/mouse_clean.csv

	# Get the first line \| convert commas into newlines \| create desired format by excluding the first 3 rows, then printing the IDs in the first 2 columns, 2 columns of zeros, and two columns of -9s. Finally, remove the > from the IDs and save it in a new file called data/mouse.tfam
	head -n 1 data/mouse_clean.csv \| tr ',' '\n' \| awk '{if (NR > 3) {print $1, $1, "0", "0", "-9", "-9"}}' \| sed 's/>//g' > data/mouse.tfam


	# 3. Make tped file
	# - Remove first line
	# - Rearrange columns to be chr \| SNP \| genetic position \| physical position \| genotypes

	# Create the SNP information - print the chr (col2), SNP (col1), a column of zeros for the genetic position, and the physical positions (col3)
	# Save it in a temporary file
	awk 'BEGIN {FS = ","} {if(NR > 1) {print $2, $1, "0", $3}}' data/mouse_clean.csv > temp1

	# Print the genotypes by excluding the first row and the first 4 columns, and replacing commas with spaces
	# Save in another temporary file
	cut -d "," -f 4- data/mouse_clean.csv \| sed 1d \| tr ',' ' ' > temp2

	# Put spaces between the genotype alleles and replace NC with 0 0
	sed "s/AA/A A/g" temp2 \| sed "s/AB/A B/g"\| sed "s/BB/B B/g" \| sed "s/BA/B A/g" \| sed "s/NC/0 0/g"> temp3

	# Paste the two temporary files together to make the data/mouse.tped file
	paste -d " " temp1 temp3 > data/mouse.tped

	# Remove temporary files
	rm temp1 temp2 temp3

	# 4. Convert to binary plink
	# Plink can now read in the tped format data and convert it into a standard binary plink format
	# Plink works by using 'flags' to tell it what you want it to do. A flag starts with --. In this case we are saying:

	# --tfile data/mouse Read in the transposed file format, file name is data/mouse
	# Because it knows it is transposed format you don't need to
	# specify the actual file endings (.tped and .tfam), it looks
	# for them automatically
	# --make-bed Create a binary plink format file from the transposed file
	# --out data/mouse Save the new format to this file name. It will create three
	# files: .bed, .bim, .fam.
	# --allow-extra-chr There are a few strange chomosome IDs at the moment. This
	# flag tells plink to allow them. Normally it looks for 1-22
	# and X, Y, etc. Note - X chr is recoded as 23

	plink --tfile data/mouse --make-bed --out data/mouse --allow-extra-chr


	# Now you have the data in binary plink format. Have a look at it:

	head data/mouse.bim
	# This is the SNP data - it shows chr, SNP, genetic position, physical position, allele1, allele2
	# To find how many SNPs there are:
	wc -l data/mouse.bim

	head data/mouse.fam
	# This is the individual information - it shows family ID, individual ID, father ID, mother ID, sex, phenotype
	# To find how many IDs there are:
	wc -l data/mouse.fam

	# Finally, there is the data/mouse.bed. This encodes the genotypes in compressed format, so you can't read them. If you are moving this data around make sure you move the .bed, .bim and .fam files all together, plink reads all three of them for any one analysis.


	# 5. Obtain sex information
	# Gonna predict the sex using the X chromosome data, but this might not be completely accurate. If you have full information on sex then better to use that data instead of this method
	# - Use plink to infer sex from the X chromosome
	# - Take the inferred sex from column 4 and put it in the .fam file

	# Make a copy of the original fam file
	cp data/mouse.fam data/mouse.fam.nosex

	# Get plink to read in the binary plink format and guess the sex from the X chr
	plink --bfile data/mouse --check-sex --out data/mouse --allow-extra-chr

	# Extract the predicted sex column from the output
	awk '{if(NR > 1) {print $4}}' data/mouse.sexcheck > temp1

	# Add that column to the end of the fam file
	paste -d " " data/mouse.fam temp1 > temp2

	# Recreate the fam file with the correct sex column
	awk '{print $1, $2, $3, $4, $7, $6}' temp2 > data/mouse.fam


	# 6. Remove SNPs with weird chromosome codes

	# This finds all lines that don't start with "1 ", "2 ", ..., "23 " and then uses awk to print the SNP IDs only
	egrep -v '^[1-2]?[0-9]\s' data/mouse.bim \| awk '{ print $2}' > data/mouse.removesnps

	# Now use PLINK to read in the binary plink data, exclude the SNPs we identified with weird chromosomes, and save the data without those SNPs in a new file called data/mouse_clean.bed/.bim/.fam

	plink --bfile data/mouse --exclude data/mouse.removesnps --make-bed --out data/mouse_clean --allow-extra-chr


	# 7. Calculate allele frequencies
	# Now you can use PLINK to do various things, such as calculating allele frequencies, or a lot of other analyses

	plink --bfile data/mouse_clean --freq --out data/mouse_clean