parsing IPD data and formatting it, continued

Description of Workflow

Step 3: Create a fasta file of exons that extends the window for mapping reads N base pairs in both directions, where N == the number of base pairs the for the sequencing length. For Illumina, this is usually 75 or 150.

Required parameters:
- ntype -> the type of input, possible options are:
  - rna -> indicates that the input sequenced read data is from rna/cDNA (mutually exclusive with dna)
  - dna -> indicates that the input sequenced read data is from dna (mutually exclusive with rna, do not enter dna for cDNA data)
- filteredfile -> the file containing the input sequenced reads that will be used. Must be in MHC.dat format

Other parameters:

flength -> the length of padding for the reads, the default is 75

        #!/usr/bin/python3
        import argparse
        import time

        def parsedAndCastRange(xList):
            # requires: string in the format '1..5', returns [1,5] as list of ints
            unparsedRangeAsList = xList.split('.')
            parsedRange = list(filter(lambda x: x!= '', unparsedRangeAsList))
            return [int(parsedRange[0]), int(parsedRange[1])]

        def parsedExonOutput(s,idx):
            fwdLen = idx
            revLen = -1*idx
            retList = []
            retListExtended = []
            for i in s:
                listOfValues = i
                exonSeq = i[-1]
                exon_ct = []
                exon_ct_list = []
                mxLen = len(listOfValues[3])
                for j in listOfValues[3]:
                    exon_ct.append(j[0])
                    exon_ct_list.append((j[1][0], j[1][1]))
                # indexTransition = 0
                indexTransition = 0
                checkForIntron = 0
                eList = []
                for x in range(0, mxLen, 1):
                    exonStartAndStop = exon_ct_list[x]
                    exonStart = exonStartAndStop[0]
                    if x == 0:
                        exonStart = exonStartAndStop[0] - 1
                    exonStop = exonStartAndStop[1] + 1
                    exonSequence = exonSeq[exonStart:exonStop]
                    indexTransition = exonStartAndStop[1]+1
                    # r = str(listOfValues[0]) + ':' + str(exon_ct[x]) + ',' + exonSequence
                    r = str(listOfValues[0]) + ':' + str(exon_ct[x])
                    eList.append((r, exonSequence))
                retList.append(eList)
            for i in range(0, len(retList)):
                iLength = len(retList[i])
                seqIntAsString, seqStringList = zip(*retList[i])
                stringOfAllSeqs = ''.join(seqStringList)
                rList = []
                for j in range(0, iLength):
                    currentStringFromList = seqStringList[j]
                    seqID = seqIntAsString[j]
                    if j == 0:
                        cLoc = len(currentStringFromList)
                        endLoc = cLoc + fwdLen
                        endLoc = endLoc - 1
                        paddedString = stringOfAllSeqs[0:endLoc]
                        rList.append((seqID,paddedString))
                    elif j == iLength-1:
                        endString = seqStringList[j-1] + seqStringList[j]
                        paddedString_I = stringOfAllSeqs.find(endString) - fwdLen
                        startPos = paddedString_I - fwdLen
                        paddedStringIdx = stringOfAllSeqs.find(endString) + len(endString)
                        paddedStringIdx = paddedStringIdx
                        paddedString = stringOfAllSeqs[paddedString_I:paddedStringIdx]
                        rList.append((seqID,paddedString))
                    else:
                        paddedString_Ipos = stringOfAllSeqs.find(str(seqStringList[j-1]))
                        startCheck = paddedString_Ipos - fwdLen
                        fwdLenPadded = stringOfAllSeqs.find(str(seqStringList[j+1])) + fwdLen
                        if len(seqStringList[j+1]) < 10:
                            endString = seqStringList[j-1] + seqStringList[j]
                            fwdLenPadded = stringOfAllSeqs.find(endString) + len(endString) + fwdLen
                        # stopCheck = fwdLenPadded - len(stringOfAllSeqs)
                        startPos = startCheck
                        stopPos = fwdLenPadded
                        # scenario 1: fwdLen > strings before currentString
                        if startCheck < 0:
                            startPos = 0
                        # scenario 2: fwdLen > strings after currentString
                        if fwdLenPadded > len(stringOfAllSeqs):
                            stopPos = len(stringOfAllSeqs)
                            # NOTE: this is a bit of a hack, and assumes all sequences after index 3 will have decreasing length
                            # create a better one when not as sleepy
                        # scenario 3 pos(prevString) < fwdLen < pos(nextString): keep values unchanged
                        paddedString = stringOfAllSeqs[startPos:stopPos]
                        rList.append((seqID,paddedString))
                retListExtended.append(rList)
            return retListExtended

        if __name__ == "__main__":
            parser = argparse.ArgumentParser(description='parser for MHC file from EMBL')
            parser.add_argument('-f', '--filteredfile', help='filtered file from EMBL', required=True)
            parser.add_argument('ntype', choices=['rna', 'dna'], help='Enter either \'rna\' for rna/cDNA input, or \'dna\' for dna input')
            parser.add_argument('-i', '--flength', help='read length', required=True)
            args = parser.parse_args()


            alleleList = []
            with open(args.filteredfile, 'r') as f:
                s_id = ''
                s_alleleID = ''
                s_exonList = []
                alleleExonRange = []
                alleleExonSeq = []
                exonSeqs = ''
                exonCt = 1
                seqFlag = False
                for i in f:
                    i = i.rstrip('\r\n')
                    sLine = i.split(' ')
                    sLineList = list(filter(lambda x: x != '', sLine))
                    if i[0:2] == 'KW':
                        s_id = sLineList[1]
                    elif i[0:2] == 'ID':
                        s_alleleID = sLineList[1]
                    elif i[0:2] == 'FT':
                        if sLineList[1] == 'allele':
                            alleleExonRange = parsedAndCastRange(sLineList[2])
                        elif sLineList[1] == 'exon':
                            exonSeqRange = parsedAndCastRange(sLineList[2])
                            exonN = 'exon' + str(exonCt)
                            alleleExonSeq.append((exonN, exonSeqRange))
                            exonCt += 1
                        else:
                            continue
                    elif i[0:2] == 'SQ':
                        seqFlag = True
                        continue
                    else:
                        if seqFlag:
                            if i[0:2] == '//':
                                seqFlag = False
                                alleleList.append([s_id, s_alleleID, alleleExonRange, alleleExonSeq, exonSeqs])
                                alleleExonSeq = []
                                exonCt = 1
                                exonSeqs = ''
                                continue
                            else:
                                s = ''.join(sLineList[:-1])
                                exonSeqs += s
                        else:
                            continue
                            
            if args.outputType == 'exonList':
                res = parsedExonOutput(alleleList, args.flength)
                for i in res:
                    print(i)
            else:
                res = parsedExonOutput(alleleList, args.flength)
                for i in res:
                    for j in i:
                        n = i[0]
                        v = i[1]
                        hString = '>' + str(n)
                        seqString = str(v)
                        print(hString + '\n' + seqString)

disulfidebond/parse_mhc_workflow_pt3.md

Select an option

No results found

Select an option

No results found

Description of Workflow

Step 3: Create a fasta file of exons that extends the window for mapping reads N base pairs in both directions, where N == the number of base pairs the for the sequencing length. For Illumina, this is usually 75 or 150.