Created
November 20, 2023 06:30
-
-
Save Ge0rges/16004bdbe358e8d90832685574d323da to your computer and use it in GitHub Desktop.
Downloads the latest release of Defense Finder HMM models and create an anvio compatible `hmm-source`
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash | |
# Set the GitHub repository and destination folder | |
github_repo="mdmparis/defense-finder-models" | |
destination_folder="DefenseFinder_anvio/" | |
source_folder="defense-finder-models" | |
# Function to check if a file exists, and exit with a warning if it does | |
check_file_exists() { | |
if [ -e "$1" ]; then | |
echo "Warning: File $1 already exists. Exiting." | |
exit 1 | |
fi | |
} | |
# Function to download the latest release from GitHub | |
download_latest_release() { | |
repo_url="https://api.github.com/repos/$1/releases/latest" | |
download_url=$(curl -s "$repo_url" | grep "browser_download_url" | cut -d '"' -f 4) | |
wget -O latest_release.tar.gz "$download_url" | |
tar -xzf latest_release.tar.gz | |
rm latest_release.tar.gz | |
} | |
# Check if any of the required files already exist | |
check_file_exists "$destination_folder/genes.hmm" | |
check_file_exists "$destination_folder/kind.txt" | |
check_file_exists "$destination_folder/reference.txt" | |
check_file_exists "$destination_folder/target.txt" | |
check_file_exists "$destination_folder/noise_cutoff_terms.txt" | |
check_file_exists "$destination_folder/genes.txt" | |
# Download the latest release from GitHub | |
download_latest_release "$github_repo" | |
# Create the destination folder if it doesn't exist | |
mkdir -p "$destination_folder" | |
# Create kind.txt with "DefenseFinder" | |
echo "DefenseFinder" > "$destination_folder/kind.txt" | |
# Extract version number from metadata.yml | |
version_number=$(grep "vers" "$source_folder/metadata.yml" | cut -d ' ' -f2) | |
# Create reference.txt with "DefenseFinder" and version number | |
echo "DefenseFinder $version_number" > "$destination_folder/reference.txt" | |
# Create target.txt with "GENE:AA" | |
echo "AA:GENE" > "$destination_folder/target.txt" | |
# Create noise_cutoff_terms.txt with "-E 1e-25" | |
echo "-E 1e-25" > "$destination_folder/noise_cutoff_terms.txt" | |
# Create genes.txt with "gene accession hmmsource" | |
echo -e "gene\taccession\thmmsource" > "$destination_folder/genes.txt" | |
for file in "$source_folder/profiles"/*.hmm; do | |
base_name=$(basename "$file" .hmm) | |
# In the current version of the HMM profiles, some are concatenanted profiles with the same NAME field. | |
# Anvio does not like this, nor should it. Thus here, I append the NAME field with an incrementing suffix. | |
# In future versions where this is fixed, this code will be obsolete but shouldn't break anything. | |
# Extract gene names and count occurrences within the file | |
gene_counts=$(awk '/^NAME/ {if (++count[$2] == 1) print $2; else print $2"_"count[$2]}' "$file") | |
# Loop through each gene name and create a separate row in genes.txt | |
while read -r final_gene_name; do | |
echo -e "$final_gene_name\t$base_name\thttps://github.com/mdmparis/defense-finder-models" >> "$destination_folder/genes.txt" | |
done <<< "$gene_counts" | |
# Modify the NAME field in the .hmm file | |
awk '/^NAME/ {if (++count[$2] == 1) print $1, $2; else print $1, $2"_"count[$2]} !/^NAME/ {print $0}' "$file" > "$file.tmp" && mv "$file.tmp" "$file" | |
done | |
# Concatenate .hmm files into genes.hmm | |
cat "$source_folder/profiles"/*.hmm > "$destination_folder/genes.hmm" | |
# Compress genes.hmm using gzip | |
gzip "$destination_folder/genes.hmm" | |
# Remove the source folder | |
rm -r "$source_folder" | |
# Display completion message | |
echo "Script completed successfully. Files are in $destination_folder." |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Note that in
genes.txt
the accession for the gene is simply the file name of the HMM. No accession is provided with these profiles unfortunately. Check out MicrobeMod for how you might BLAST the result to identify the precise protein for restriction-modification systems.