Created
June 30, 2015 00:41
-
-
Save explodecomputer/745caff54ca6d6a271b6 to your computer and use it in GitHub Desktop.
Luke's mouse genotypes
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash | |
# Objective: Take data/mouse.csv and convert to binary plink format | |
# Method: It is currently in SNPs=rows and IDs=columns format. So first convert to tped format and then convert to binary plink format using the plink programme | |
# See tped format here: | |
# http://pngu.mgh.harvard.edu/~purcell/plink/data.shtml#tr | |
# 1. Export excel format to text file | |
# - Just export sheet 2 to a Windows format csv file | |
# - This means 'comma separated file', e.g. the columns are separated by commas | |
# 2. Make tfam file | |
# - Take the header and make the following columns: | |
# - Family ID, Individual ID, Father ID, Mother ID, sex, phenotype | |
# - In this case we will use the ID as the first two columns and set everything else as missing | |
# Remove weird carriage return characters from the file | |
cat data/mouse.csv | col -b > data/mouse_clean.csv | |
# Get the first line | convert commas into newlines | create desired format by excluding the first 3 rows, then printing the IDs in the first 2 columns, 2 columns of zeros, and two columns of -9s. Finally, remove the > from the IDs and save it in a new file called data/mouse.tfam | |
head -n 1 data/mouse_clean.csv | tr ',' '\n' | awk '{if (NR > 3) {print $1, $1, "0", "0", "-9", "-9"}}' | sed 's/>//g' > data/mouse.tfam | |
# 3. Make tped file | |
# - Remove first line | |
# - Rearrange columns to be chr | SNP | genetic position | physical position | genotypes | |
# Create the SNP information - print the chr (col2), SNP (col1), a column of zeros for the genetic position, and the physical positions (col3) | |
# Save it in a temporary file | |
awk 'BEGIN {FS = ","} {if(NR > 1) {print $2, $1, "0", $3}}' data/mouse_clean.csv > temp1 | |
# Print the genotypes by excluding the first row and the first 4 columns, and replacing commas with spaces | |
# Save in another temporary file | |
cut -d "," -f 4- data/mouse_clean.csv | sed 1d | tr ',' ' ' > temp2 | |
# Put spaces between the genotype alleles and replace NC with 0 0 | |
sed "s/AA/A A/g" temp2 | sed "s/AB/A B/g"| sed "s/BB/B B/g" | sed "s/BA/B A/g" | sed "s/NC/0 0/g"> temp3 | |
# Paste the two temporary files together to make the data/mouse.tped file | |
paste -d " " temp1 temp3 > data/mouse.tped | |
# Remove temporary files | |
rm temp1 temp2 temp3 | |
# 4. Convert to binary plink | |
# Plink can now read in the tped format data and convert it into a standard binary plink format | |
# Plink works by using 'flags' to tell it what you want it to do. A flag starts with --. In this case we are saying: | |
# --tfile data/mouse Read in the transposed file format, file name is data/mouse | |
# Because it knows it is transposed format you don't need to | |
# specify the actual file endings (.tped and .tfam), it looks | |
# for them automatically | |
# --make-bed Create a binary plink format file from the transposed file | |
# --out data/mouse Save the new format to this file name. It will create three | |
# files: .bed, .bim, .fam. | |
# --allow-extra-chr There are a few strange chomosome IDs at the moment. This | |
# flag tells plink to allow them. Normally it looks for 1-22 | |
# and X, Y, etc. Note - X chr is recoded as 23 | |
plink --tfile data/mouse --make-bed --out data/mouse --allow-extra-chr | |
# Now you have the data in binary plink format. Have a look at it: | |
head data/mouse.bim | |
# This is the SNP data - it shows chr, SNP, genetic position, physical position, allele1, allele2 | |
# To find how many SNPs there are: | |
wc -l data/mouse.bim | |
head data/mouse.fam | |
# This is the individual information - it shows family ID, individual ID, father ID, mother ID, sex, phenotype | |
# To find how many IDs there are: | |
wc -l data/mouse.fam | |
# Finally, there is the data/mouse.bed. This encodes the genotypes in compressed format, so you can't read them. If you are moving this data around make sure you move the .bed, .bim and .fam files all together, plink reads all three of them for any one analysis. | |
# 5. Obtain sex information | |
# Gonna predict the sex using the X chromosome data, but this might not be completely accurate. If you have full information on sex then better to use that data instead of this method | |
# - Use plink to infer sex from the X chromosome | |
# - Take the inferred sex from column 4 and put it in the .fam file | |
# Make a copy of the original fam file | |
cp data/mouse.fam data/mouse.fam.nosex | |
# Get plink to read in the binary plink format and guess the sex from the X chr | |
plink --bfile data/mouse --check-sex --out data/mouse --allow-extra-chr | |
# Extract the predicted sex column from the output | |
awk '{if(NR > 1) {print $4}}' data/mouse.sexcheck > temp1 | |
# Add that column to the end of the fam file | |
paste -d " " data/mouse.fam temp1 > temp2 | |
# Recreate the fam file with the correct sex column | |
awk '{print $1, $2, $3, $4, $7, $6}' temp2 > data/mouse.fam | |
# 6. Remove SNPs with weird chromosome codes | |
# This finds all lines that don't start with "1 ", "2 ", ..., "23 " and then uses awk to print the SNP IDs only | |
egrep -v '^[1-2]?[0-9]\s' data/mouse.bim | awk '{ print $2}' > data/mouse.removesnps | |
# Now use PLINK to read in the binary plink data, exclude the SNPs we identified with weird chromosomes, and save the data without those SNPs in a new file called data/mouse_clean.bed/.bim/.fam | |
plink --bfile data/mouse --exclude data/mouse.removesnps --make-bed --out data/mouse_clean --allow-extra-chr | |
# 7. Calculate allele frequencies | |
# Now you can use PLINK to do various things, such as calculating allele frequencies, or a lot of other analyses | |
plink --bfile data/mouse_clean --freq --out data/mouse_clean |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment