#!/usr/bin/python
'''
Reads a FASTA file and rewrites it keeping only the unique IDs
Important: we assume that if two proteins have the same ID, they have the same sequence
'''
import sys
from Bio import SeqIO

if len(sys.argv) < 3:
    print 'Usage: removeCopies INPUT OUTPUT'
    sys.exit(1)
fname, out = sys.argv[1:3]

def unique(records):
    '''Yield only the records whose identifier has not been seen before.'''
    count = 0        # total records read
    already = set()  # identifiers already written
    for r in records:
        count += 1
        if r.id not in already:
            already.add(r.id)
            yield r
    print "%i saved sequences, %i inputs" % (len(already), count)

SeqIO.write(unique(SeqIO.parse(fname, "fasta")), out, "fasta")
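For reference, and assuming the script is saved as removeCopies.py, it is invoked as the usage message above suggests (input.fasta and output.fasta are placeholder filenames):

    python removeCopies.py input.fasta output.fasta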
Yes, I was wondering if there was a more efficient way to do that, so thanks a lot for this snippet.
Since the file is opened just once it is roughly 2X faster on a 3.3 GB file (using Python 2.7).
The only missing "feature" is the count of duplicated entries, but I guess we could get that information by creating a class instead of a function (see the sketch below).
Thanks a lot, I'll update the script once that is done...
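A minimal sketch of what that class-based version might look like (untested; UniqueFilter is a hypothetical name, not code from this gist):

import sys
from Bio import SeqIO

class UniqueFilter(object):  # hypothetical name, for illustration only
    '''Yield records with unseen identifiers, keeping the counts around.'''
    def __init__(self):
        self.count = 0        # total records read
        self.already = set()  # identifiers already written

    def filter(self, records):
        for r in records:
            self.count += 1
            if r.id not in self.already:
                self.already.add(r.id)
                yield r

fname, out = sys.argv[1:3]
f = UniqueFilter()
SeqIO.write(f.filter(SeqIO.parse(fname, "fasta")), out, "fasta")
print "%i duplicated entries out of %i records" % (f.count - len(f.already), f.count)

Since SeqIO.write exhausts the generator before returning, the counts are complete by the time they are printed.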
In keeping with a quick-n-dirty script, you could just track the read count in a global variable... crude but effective. Or, you could put the print statement inside the filtering function?
...
def unique(records):
    count = 0
    already = set()
    for r in records:
        count += 1
        if r.id not in already:
            already.add(r.id)
            yield r
    # len(already), not len(unique) - the function object has no length
    print "%i unique identifiers from %i input records" % (len(already), count)

SeqIO.write(unique(SeqIO.parse(fname, "fasta")), out, "fasta")
A print statement inside the function is just perfect, thanks!
I've noticed that PyPy 1.9 takes twice the time to run compared to Python 2.7: I expected it to be faster...
You tweeted this so I figured you'd like feedback...
Because you never close the handles explicitly you may have problems with them if you run this on PyPy or Jython - it would be cleaner to open the output handle once, write each batch to it, and then close it (see the sketch below).
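A minimal sketch of that explicit handle management (my illustration; batches stands in for however the records are currently grouped):

from Bio import SeqIO

out_handle = open(out, "w")
for batch in batches:  # hypothetical iterable of lists of SeqRecords
    SeqIO.write(batch, out_handle, "fasta")
out_handle.close()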
I see you're batching the records for output to avoid having too many in memory at once, but by refactoring this as a generator function you could change it to only ever have one record in memory at any one time. You'd still need to keep the already seen identifiers in memory, though. I'm thinking of something like this (untested):