Last active
September 13, 2018 22:32
-
-
Save pgsin/a4c175aedafac4295b5175aedfaf50fa to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
using System; | |
using System.Collections.Generic; | |
using System.IO; | |
using System.Linq; | |
using System.Text.RegularExpressions; | |
namespace ImmunopepModel | |
{ | |
public class Gene | |
{ | |
public string Id { private set; get; } | |
public string[] Proteins { private set; get; } | |
public string[] Sequences { private set; get; } | |
public double HalfLife { set; get; } | |
public double Expression { set; get; } | |
public PeptideRecord[] Peptides { set; get; } | |
public Gene(string id, string[] proteins, string[] sequences) | |
{ | |
Id = id; | |
int imax = 0; | |
for (int i = 1; i < proteins.Length; i++) | |
{ | |
if (sequences[imax].Length < sequences[i].Length) imax = i; | |
} | |
if (imax != 0) | |
{ | |
string tmp; | |
tmp = sequences[imax]; | |
sequences[imax] = sequences[0]; | |
sequences[0] = tmp; | |
tmp = proteins[imax]; | |
proteins[imax] = proteins[0]; | |
proteins[0] = tmp; | |
} | |
Proteins = proteins; | |
Sequences = sequences; | |
HalfLife = 0.0; | |
Expression = 0.0; | |
} | |
} | |
public class FastaRecord | |
{ | |
public string GeneId { private set; get; } | |
public string ProtId { private set; get; } | |
public string Sequence { private set; get; } | |
public static Regex ProtRegExp = new Regex(@">([^\s]*)"); | |
public static Regex GeneRegExp = new Regex(@"gene:([^\s]*)"); | |
public FastaRecord(string geneId, string protId, string sequence) | |
{ | |
GeneId = geneId; | |
ProtId = protId; | |
Sequence = sequence; | |
} | |
} | |
public class PeptideRecord | |
{ | |
public string Sequence { private set; get; } | |
public int[] Msms { private set; get; } | |
public double Score { private set; get; } | |
public PeptideRecord(string sequence, int[] msmscounts, double score) | |
{ | |
Sequence = sequence; | |
Msms = msmscounts; | |
Score = score; | |
} | |
} | |
public class MasterMap | |
{ | |
public char[] Seq { private set; get; } | |
public byte[] Cov { private set; get; } | |
public int[] Gene { private set; get; } | |
public int Len => Seq.Length; | |
public MasterMap(char[] seq, byte[] coverage, int[] gene) | |
{ | |
Seq = seq; | |
Cov = coverage; | |
Gene = gene; | |
} | |
} | |
class Program | |
{ | |
public const int MinLength = 8; | |
public const int MaxLength = 13; | |
public const int Flanking = 5; | |
static void Main(string[] args) | |
{ | |
Random r = new Random(123); | |
string fasta_file = @"C:\Users\sinitcyn\Documents\Projects\Rudolph\modeling\Homo_sapiens.GRCh38.pep.all.fa"; | |
string transcript_rpkm_file = @"C:\Users\sinitcyn\Documents\Projects\Rudolph\modeling\transcriptome.rpkm.txt"; | |
string half_life_file = @"C:\Users\sinitcyn\Documents\Projects\Rudolph\modeling\Savitski2018.processed.txt"; | |
string peptides_file = @"C:\Users\sinitcyn\Documents\Projects\Rudolph\modeling\peptides.txt"; | |
string output_file = @"C:\Users\sinitcyn\Documents\Projects\Rudolph\modeling\peptides.validation.abelin.txt"; | |
Dictionary<string, FastaRecord> fastaRecords = ReadFasta(fasta_file); | |
Dictionary<string, Gene> genes = new Dictionary<string, Gene>(); | |
GetGene2Prots(fastaRecords, genes); | |
ReadTranscript(transcript_rpkm_file, genes); | |
ReadHalfLife(half_life_file, genes); | |
string[] hlaNames; | |
ReadPeptide(peptides_file, genes, fastaRecords, out hlaNames); | |
//UpdateHlaNames(hlaNames); | |
using (StreamWriter sw = new StreamWriter(output_file)) | |
{ | |
WriteHeader(sw); | |
int numRecords = WriteTargets(sw, genes, hlaNames, out HashSet<string> peptideSet); | |
WriteDecoys(sw, genes, hlaNames, 100 * numRecords, r, peptideSet); | |
} | |
} | |
private static void UpdateHlaNames(string[] hlaNames) { | |
for (int i = 0; i < hlaNames.Length; i++) { | |
string s = hlaNames[i]; | |
hlaNames[i] = $"HLA-{s.Substring(0, 3)}:{s.Substring(3, 2)}"; | |
} | |
} | |
private static void WriteHeader(StreamWriter sw) | |
{ | |
sw.WriteLine(string.Join("\t", | |
new string[] { | |
"sequence", "index", "target(+) or decoy(-) abelin", "N-seq", "N-pos", "C-seq", "C-pos", "proteinId", "geneId", | |
"MHC identified", "MHC msms count", "Gene expression", "Mean Half-Life Savitski18" | |
})); | |
} | |
private static void WriteDecoys(StreamWriter sw, Dictionary<string, Gene> genes, string[] hlaNames, int numRecords, Random r, | |
HashSet<string> peptideSet) | |
{ | |
List<KeyValuePair<Gene, int>>[] res = new List<KeyValuePair<Gene, int>>[MaxLength - MinLength + 1]; | |
for (int i = 0; i <= MaxLength - MinLength; i++) | |
{ | |
res[i] = new List<KeyValuePair<Gene, int>>(); | |
} | |
foreach (var gene in genes) | |
{ | |
string s = gene.Value.Sequences[0]; | |
bool[] b = new bool[s.Length]; | |
foreach (var peptide in gene.Value.Peptides) | |
{ | |
int i = s.IndexOf(peptide.Sequence); | |
if (i != -1) | |
{ | |
for (int j = 0; j < peptide.Sequence.Length; j++) | |
{ | |
b[j + i] = true; | |
} | |
} | |
} | |
for (int i = 0; i <= b.Length - MinLength; i++) | |
{ | |
for (int j = 0; j <= MaxLength - MinLength; j++) | |
{ | |
if (i + j + MinLength > b.Length) break; | |
int sum = 0; | |
for (int k = i; k < i + j + MinLength; k++) if (b[k]) sum++; | |
if (sum > ((MinLength + j) >> 1)) continue; | |
res[j].Add(new KeyValuePair<Gene, int>(gene.Value, i)); | |
} | |
} | |
} | |
double[][] hlaLenDist = GetLengthDist(genes); //lengths x hlas | |
KeyValuePair<Gene, int>[] result; | |
for (int i = 0; i < MaxLength - MinLength + 1; i++) | |
{ | |
int pepLength = i + MinLength; | |
result = res[i].OrderBy(x => r.Next()).ToArray(); | |
int l = 0; | |
for (int j = 0; j < hlaLenDist[i].Length; j++) | |
{ | |
int until = (int)(hlaLenDist[i][j] * numRecords); | |
int k = 0; | |
for (; k < until; k++) | |
{ | |
Gene g = result[l + k].Key; | |
int pos = result[l + k].Value; | |
string pepSeq = g.Sequences[0].Substring(pos, pepLength); | |
if (peptideSet.Contains(pepSeq) || pepSeq.IndexOf('U') != -1 ) | |
{ | |
until++; | |
continue; | |
} | |
else | |
{ | |
peptideSet.Add(pepSeq); | |
} | |
int mi = Math.Max(0, pos - Flanking); | |
int f = pos + MinLength + i; | |
int ma = Math.Min(f + Flanking, g.Sequences[0].Length); | |
sw.WriteLine( | |
string.Join("\t", | |
new string[] { | |
pepSeq, "0", "-", | |
g.Sequences[0].Substring(mi, pos - mi), pos.ToString(), | |
g.Sequences[0].Substring(f, ma - f), (g.Sequences[0].Length-f).ToString(), | |
g.Proteins[0], g.Id, | |
hlaNames[j], "0", | |
String.Format("{0:0.000}", g.Expression), String.Format("{0:0.000}", g.HalfLife) | |
})); | |
} | |
l += k; | |
} | |
} | |
} | |
private static int WriteTargets(StreamWriter sw, Dictionary<string, Gene> genes, string[] expNames, out HashSet<string> peptideSet) | |
{ | |
peptideSet = new HashSet<string>(); | |
int count = 0; | |
foreach (var gene in genes) | |
{ | |
foreach (var peptide in gene.Value.Peptides) | |
{ | |
string proteinseq = gene.Value.Sequences[0]; | |
string peptideseq = peptide.Sequence; | |
int i = proteinseq.IndexOf(peptideseq); | |
if (i == -1) continue; | |
int mi = Math.Max(0, i - Flanking); | |
int j = i + peptideseq.Length; | |
int ma = Math.Min(j + Flanking, proteinseq.Length); | |
List<string> mhc_identified = new List<string>(); | |
List<string> mhc_msms_identified = new List<string>(); | |
for (int k = 0; k < peptide.Msms.Length; k++) | |
{ | |
if (peptide.Msms[k] > 0) | |
{ | |
mhc_identified.Add(expNames[k]); | |
mhc_msms_identified.Add(peptide.Msms[k].ToString()); | |
} | |
} | |
for (int k = 0; k < mhc_identified.Count; k++) | |
{ | |
count++; | |
peptideSet.Add(peptideseq); | |
sw.WriteLine( | |
string.Join("\t", | |
new string[] { | |
peptideseq, k.ToString(), "+", | |
proteinseq.Substring(mi, i - mi), i.ToString(), | |
proteinseq.Substring(j, ma - j), (proteinseq.Length-j).ToString(), | |
gene.Value.Proteins[0], gene.Key, | |
mhc_identified[k], mhc_msms_identified[k], | |
String.Format("{0:0.000}", gene.Value.Expression), String.Format("{0:0.000}", gene.Value.HalfLife) | |
})); | |
} | |
} | |
} | |
return count; | |
} | |
private static double[][] GetLengthDist(Dictionary<string, Gene> genes) | |
{ | |
int len = genes.Values.First().Peptides[0].Msms.Length; | |
int[][] result = new int[MaxLength - MinLength + 1][]; | |
double[][] result0 = new double[MaxLength - MinLength + 1][]; | |
for (int i = 0; i < result.Length; i++) | |
{ | |
result[i] = new int[len]; | |
result0[i] = new double[len]; | |
} | |
foreach (var gene in genes) | |
{ | |
foreach (var peptide in gene.Value.Peptides) | |
{ | |
for (int i = 0; i < peptide.Msms.Length; i++) | |
{ | |
if (peptide.Msms[i] != 0) result[peptide.Sequence.Length - MinLength][i]++; | |
} | |
} | |
} | |
int s = 0; | |
for (int i = 0; i < result.Length; i++) | |
{ | |
for (int j = 0; j < result[i].Length; j++) | |
{ | |
s += result[i][j]; | |
} | |
} | |
for (int i = 0; i < result.Length; i++) | |
{ | |
for (int j = 0; j < result[i].Length; j++) | |
{ | |
result0[i][j] = (double)result[i][j] / s; | |
} | |
} | |
return result0; | |
} | |
private static void ReadPeptide(string peptides_file, Dictionary<string, Gene> genes, Dictionary<string, FastaRecord> fasta, | |
out string[] expNames) | |
{ | |
Dictionary<string, List<PeptideRecord>> tmpd = new Dictionary<string, List<PeptideRecord>>(); | |
List<PeptideRecord> tmpe; | |
using (StreamReader sr = new StreamReader(peptides_file)) | |
{ | |
string[] header = sr.ReadLine().Split('\t'); | |
int sequenceIndex = Array.IndexOf(header, "Sequence"); | |
int proteinsIndex = Array.IndexOf(header, "Proteins"); | |
int scoreIndex = Array.IndexOf(header, "Score"); | |
int reverseIndex = Array.IndexOf(header, "Reverse"); | |
int potentialIndex = Array.IndexOf(header, "Potential contaminant"); | |
Regex experimentRegExp = new Regex(@"Experiment (.*)"); | |
List<int> experimentTmpIndex = new List<int>(); | |
List<string> experimentTmpNames = new List<string>(); | |
int blankIndex = -1; | |
int hlaNegIndex = -1; | |
for (int i = 0; i < header.Length; i++) | |
{ | |
var x = experimentRegExp.Match(header[i]); | |
if (x.Success) | |
{ | |
if (x.Groups[1].Value == "Blank") | |
{ | |
blankIndex = i; | |
continue; | |
} | |
if (x.Groups[1].Value == "HLAneg") | |
{ | |
hlaNegIndex = i; | |
continue; | |
} | |
experimentTmpIndex.Add(i); | |
experimentTmpNames.Add(x.Groups[1].Value); | |
} | |
} | |
int[] experimentIndex = experimentTmpIndex.ToArray(); | |
expNames = experimentTmpNames.ToArray(); | |
string line; | |
FastaRecord f; | |
Gene g; | |
while ((line = sr.ReadLine()) != null) | |
{ | |
string[] l = line.Split('\t'); | |
string[] proteins = l[proteinsIndex].Split(';'); | |
Dictionary<string, Gene> genedict = new Dictionary<string, Gene>(); | |
for (int i = 0; i < proteins.Length; i++) | |
if (fasta.TryGetValue(proteins[0], out f) && genes.TryGetValue(f.GeneId, out g) && !genedict.ContainsKey(f.GeneId)) | |
genedict.Add(f.GeneId, g); | |
if (genedict.Count != 1) continue; //multiple-mapped peptide filter | |
g = genedict.Values.ToArray()[0]; | |
if (l[reverseIndex] != "+" && l[potentialIndex] != "+" && //reverse and potential filter | |
MinLength <= l[sequenceIndex].Length && l[sequenceIndex].Length <= MaxLength && //length filter | |
string.IsNullOrEmpty(l[blankIndex]) && string.IsNullOrEmpty(l[hlaNegIndex]) && //negative control filter | |
g.Expression > 0.0) //expression filter | |
{ | |
int[] msms = new int[experimentIndex.Length]; | |
for (int i = 0; i < experimentIndex.Length; i++) | |
{ | |
msms[i] = String.IsNullOrEmpty(l[experimentIndex[i]]) ? 0 : Convert.ToInt32(l[experimentIndex[i]]); | |
} | |
if (msms.Sum() <= 1) continue; | |
PeptideRecord pr = new PeptideRecord(l[sequenceIndex], msms, Convert.ToDouble(l[scoreIndex])); | |
if (tmpd.TryGetValue(g.Id, out tmpe)) | |
tmpe.Add(pr); | |
else | |
tmpd.Add(g.Id, new List<PeptideRecord> { pr }); | |
} | |
} | |
} | |
foreach (var i in tmpd) | |
{ | |
genes[i.Key].Peptides = i.Value.ToArray(); | |
} | |
List<string> rmList = new List<string>(); | |
foreach (var i in genes) | |
if (i.Value.Peptides == null) rmList.Add(i.Key); | |
foreach (var i in rmList) | |
genes.Remove(i); | |
} | |
private static void ReadHalfLife(string halflife_file, Dictionary<string, Gene> genes) | |
{ | |
Gene tmp; | |
using (StreamReader sr = new StreamReader(halflife_file)) | |
{ | |
string[] header = sr.ReadLine().Split('\t'); | |
int geneId = Array.IndexOf(header, "Gene stable ID"); | |
int mean = Array.IndexOf(header, "Mean"); | |
string line; | |
while ((line = sr.ReadLine()) != null) | |
{ | |
string[] l = line.Split('\t'); | |
if (genes.TryGetValue(l[geneId], out tmp)) | |
{ | |
tmp.HalfLife = Convert.ToDouble(l[mean]); | |
} | |
} | |
} | |
} | |
private static void ReadTranscript(string transcript_file, Dictionary<string, Gene> genes) | |
{ | |
using (StreamReader sr = new StreamReader(transcript_file)) | |
{ | |
string[] header = sr.ReadLine().Split('\t'); | |
int geneId = Array.IndexOf(header, "Gene id"); | |
int mean = Array.IndexOf(header, "Mean"); | |
Gene g; | |
string line; | |
while ((line = sr.ReadLine()) != null) | |
{ | |
string[] l = line.Split('\t'); | |
if (genes.TryGetValue(l[geneId], out g)) | |
{ | |
g.Expression = Convert.ToDouble(l[mean]); | |
} | |
} | |
} | |
} | |
private static void GetGene2Prots(Dictionary<string, FastaRecord> fastaRecords, Dictionary<string, Gene> genes) | |
{ | |
Dictionary<string, List<string>> tmp = new Dictionary<string, List<string>>(); | |
List<string> tmpValue; | |
foreach (var record in fastaRecords) | |
{ | |
if (tmp.TryGetValue(record.Value.GeneId, out tmpValue)) | |
{ | |
tmpValue.Add(record.Value.ProtId); | |
} | |
else | |
{ | |
tmp.Add(record.Value.GeneId, new List<string> { record.Value.ProtId }); | |
} | |
} | |
foreach (var t in tmp) | |
{ | |
List<string> sequences = new List<string>(); | |
foreach (string p in t.Value) | |
{ | |
sequences.Add(fastaRecords[p].Sequence); | |
} | |
genes.Add(t.Key, new Gene(t.Key, t.Value.ToArray(), sequences.ToArray())); | |
} | |
} | |
private static Dictionary<string, FastaRecord> ReadFasta(string fasta_file) | |
{ | |
Dictionary<string, FastaRecord> result = new Dictionary<string, FastaRecord>(); | |
using (StreamReader sr = new StreamReader(fasta_file)) | |
{ | |
string line; | |
List<string> buffer = new List<string>(); | |
while ((line = sr.ReadLine()) != null) | |
{ | |
if (line[0] == '>' && buffer.Count != 0) | |
{ | |
FastaRecord fr = new FastaRecord(FastaRecord.GeneRegExp.Match(buffer[0]).Groups[1].Value.Split('.')[0], | |
FastaRecord.ProtRegExp.Match(buffer[0]).Groups[1].Value.Split('.')[0], | |
string.Concat(buffer.Skip(1))); | |
if (!fr.Sequence.Contains('*') && fr.Sequence.Length > MaxLength) | |
{ | |
result[fr.ProtId] = fr; | |
} | |
buffer.Clear(); | |
} | |
buffer.Add(line); | |
} | |
} | |
return result; | |
} | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment