Skip to content

Instantly share code, notes, and snippets.

@Ge0rges
Created March 15, 2024 02:13
Show Gist options
  • Save Ge0rges/594f88f09d17b5d24cc0051f65c491e5 to your computer and use it in GitHub Desktop.
Save Ge0rges/594f88f09d17b5d24cc0051f65c491e5 to your computer and use it in GitHub Desktop.
Make the MDMParis defense finder models compatible with Anvi'o.
#!/bin/bash
# Set the GitHub repository and destination folder
github_repo="mdmparis/defense-finder-models"
destination_folder="DefenseFinder/"
source_folder="defense-finder-models"
# Function to check if a file exists, and exit with a warning if it does
check_file_exists() {
if [ -e "$1" ]; then
echo "Warning: File $1 already exists. Exiting."
exit 1
fi
}
# Function to download the latest release from GitHub
download_latest_release() {
repo_url="https://api.github.com/repos/$1/releases/latest"
download_url=$(curl -s "$repo_url" | grep "browser_download_url" | cut -d '"' -f 4)
wget -O latest_release.tar.gz "$download_url"
tar -xzf latest_release.tar.gz
rm latest_release.tar.gz
}
# Check if any of the required files already exist
check_file_exists "$destination_folder/genes.hmm"
check_file_exists "$destination_folder/kind.txt"
check_file_exists "$destination_folder/reference.txt"
check_file_exists "$destination_folder/target.txt"
check_file_exists "$destination_folder/noise_cutoff_terms.txt"
check_file_exists "$destination_folder/genes.txt"
# Download the latest release from GitHub
download_latest_release "$github_repo"
# Create the destination folder if it doesn't exist
mkdir -p "$destination_folder"
# Create kind.txt with "DefenseFinder"
echo "DefenseFinder" > "$destination_folder/kind.txt"
# Extract version number from metadata.yml
version_number=$(grep "vers" "$source_folder/metadata.yml" | cut -d ' ' -f2)
# Create reference.txt with "DefenseFinder" and version number
echo "DefenseFinder $version_number" > "$destination_folder/reference.txt"
# Create target.txt with "AA:GENE"
echo "AA:GENE" > "$destination_folder/target.txt"
# Create noise_cutoff_terms.txt with "-E 1e-25"
echo "-E 1e-25" > "$destination_folder/noise_cutoff_terms.txt"
# Create genes.txt with "gene accession hmmsource"
echo -e "gene\taccession\thmmsource" > "$destination_folder/genes.txt"
for file in "$source_folder/profiles"/*.hmm; do
base_name=$(basename "$file" .hmm)
# Extract gene names and count occurrences within the file
gene_counts=$(awk '/^NAME/ {if (++count[$2] == 1) print $2; else print $2"_"count[$2]}' "$file")
# Loop through each gene name and create a separate row in genes.txt
while read -r final_gene_name; do
echo -e "$final_gene_name\t$base_name\thttps://github.com/mdmparis/defense-finder-models" >> "$destination_folder/genes.txt"
done <<< "$gene_counts"
# Modify the NAME field in the .hmm file
awk '/^NAME/ {if (++count[$2] == 1) print $1, $2; else print $1, $2"_"count[$2]} !/^NAME/ {print $0}' "$file" > "$file.tmp" && mv "$file.tmp" "$file"
done
# Concatenate .hmm files into genes.hmm
cat "$source_folder/profiles"/*.hmm > "$destination_folder/genes.hmm"
# Compress genes.hmm using gzip
gzip "$destination_folder/genes.hmm"
# Delete
rm -r $source_folder
# Display completion message
echo "Script completed successfully. Files are in $destination_folder."
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment