Created
April 26, 2024 22:46
-
-
Save theosanderson/2ba5429aeeec9cfd5b4f2e0eb5c7c26a to your computer and use it in GitHub Desktop.
Translating GISAID named files to EPI_ISL files
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Example of how to run: tar -xJOf sequences_fasta_2024_04_25.tar.xz sequences.fasta | python translate.py | pv -l | zstd > seqs.fa.zst
import sys | |
import tarfile | |
import pandas as pd | |
def load_virus_to_accession_dict(tar_path):
    """Load the virus-name -> accession-ID mapping from a .tar.xz archive.

    The archive is expected to contain a tab-separated file (name ending in
    ``.tsv``) with at least the columns ``Virus name`` and ``Accession ID``.
    If several ``.tsv`` members exist, the first one in archive order is used.

    Args:
        tar_path: Path to the xz-compressed tar archive.

    Returns:
        dict mapping each virus name (str) to its accession ID (str). Rows
        lacking either value are skipped; if a virus name occurs more than
        once, the last row wins.

    Raises:
        FileNotFoundError: If the archive holds no regular ``.tsv`` member.
    """
    with tarfile.open(tar_path, 'r:xz') as tar:
        # Only regular files can be extracted; extractfile() returns None for
        # directories and other special members.
        tsv_member = next(
            (
                member
                for member in tar.getmembers()
                if member.isfile() and member.name.endswith('.tsv')
            ),
            None,
        )
        if tsv_member is None:
            raise FileNotFoundError(f'no .tsv file found inside {tar_path!r}')

        with tar.extractfile(tsv_member) as file:
            # dtype=str keeps names and IDs as text so they compare equal to
            # the header text read from the FASTA stream.
            df = pd.read_csv(
                file,
                sep='\t',
                usecols=['Virus name', 'Accession ID'],
                dtype=str,
            )

    # Rows with a blank name or accession would otherwise become NaN entries
    # in the mapping and end up printed as ">nan" headers.
    df = df.dropna(subset=['Virus name', 'Accession ID'])
    return df.set_index('Virus name')['Accession ID'].to_dict()
def process_fasta(virus_to_accession, infile=None, outfile=None):
    """Rewrite FASTA record names using a virus-name -> accession-ID mapping.

    Each record's name is the header text between ``>`` and the first ``|``.
    The name is replaced by its mapped accession ID (or kept unchanged when it
    is not in the mapping) and the record is written as a header line followed
    by the whole sequence joined onto a single line.

    Args:
        virus_to_accession: Mapping from virus name to accession ID.
        infile: Iterable of text lines to read; defaults to ``sys.stdin``.
        outfile: Text stream with a ``write`` method; defaults to
            ``sys.stdout``.
    """
    if infile is None:
        infile = sys.stdin
    if outfile is None:
        outfile = sys.stdout

    def emit(name, chunks):
        # Write one finished record: translated header, then the sequence.
        label = virus_to_accession.get(name, name)
        sequence = ''.join(chunks)
        outfile.write(f'>{label}\n{sequence}\n')

    # None means "no header seen yet". Using None rather than '' lets a record
    # whose name is empty (e.g. ">|meta") still be emitted instead of being
    # dropped or merged into its neighbour.
    current_name = None
    sequence_data = []

    for raw_line in infile:
        line = raw_line.strip()
        if line.startswith('>'):
            if current_name is not None:
                emit(current_name, sequence_data)
            # Always start the new record with an empty buffer.
            sequence_data = []
            # Drop the '>' and everything from the first '|' onwards.
            current_name = line[1:].partition('|')[0]
        elif current_name is not None:
            sequence_data.append(line)
        # Lines before the first header belong to no record and are ignored.

    # Flush the final record.
    if current_name is not None:
        emit(current_name, sequence_data)
def main(tar_path=None):
    """Translate FASTA names on stdin to accession IDs on stdout.

    Args:
        tar_path: Path to the .tar.xz archive containing the metadata .tsv.
            When omitted, the first command-line argument is used if one was
            given, otherwise 'metadata_tsv_2024_04_25.tar.xz' in the current
            directory.
    """
    if tar_path is None:
        if len(sys.argv) > 1:
            tar_path = sys.argv[1]
        else:
            tar_path = 'metadata_tsv_2024_04_25.tar.xz'

    # Build the name -> accession lookup, then stream the FASTA through it.
    virus_to_accession = load_virus_to_accession_dict(tar_path)
    process_fasta(virus_to_accession)


if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment