@chasemc
Created June 29, 2023 14:10
import base64
import fileinput
import hashlib
import sys

from Bio.SeqUtils import CheckSum


def sha512t24u(input):
    # GA4GH truncated digest: full SHA-512, keep the first 24 bytes, base64url-encode.
    # https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7714221/
    # To standardize to caps-only input, use hash_aminos()
    sha512_digest = hashlib.sha512(bytes(input, "utf8")).digest()[:24]
    return base64.urlsafe_b64encode(sha512_digest).decode("ascii")


def hash_aminos(input, **kwargs):
    # make sure everything is uppercase before hashing
    return hasher(input=input.upper(), **kwargs)


def use_hashlib(input, algo):
    allowed_algos = ("sha512", "sha256", "sha384", "md5", "sha224")
    if algo not in allowed_algos:
        # TODO: this message should also mention the sha512t24u and crc algos
        raise ValueError(f"algo must be one of: {allowed_algos}")
    hash_fn = getattr(hashlib, algo)
    return hash_fn(bytes(input, "utf8")).hexdigest()


def hasher(input, algo):
    match algo:
        case "sha512t24u":
            return sha512t24u(input)
        case "crc64":
            return CheckSum.crc64(input).removeprefix("CRC-")
        case "crc32":
            return CheckSum.crc32(input)
        case "seguid":
            return CheckSum.seguid(input)
        case _:
            return use_hashlib(input=input, algo=algo)


def do_the_thing(id, input):
    # One TSV row: the sequence id followed by each checksum/hash of the sequence.
    return (
        id,
        str(hasher(input, "seguid")),
        str(hasher(input, "crc64")),
        str(hasher(input, "crc32")),
        str(hasher(input, "md5")),
        str(hasher(input, "sha512")),
        str(hasher(input, "sha512t24u")),
    )


# Usage: python3 hashing_uniparc.py <output.tsv> <input FASTA, "-" for stdin>
with open(sys.argv[1], "w") as h:
    current_id = ""
    current_aa = ""
    # read only the FASTA argument(s); sys.argv[1] is the output path, not input
    for line in fileinput.input(files=sys.argv[2:]):
        if line.rstrip() == "Exit":
            break
        if line.startswith(">"):
            if current_id:
                h.write("\t".join(do_the_thing(current_id, current_aa)) + "\n")
                current_aa = ""
            current_id = line.split(" ", 1)[0].removeprefix(">")
        else:
            current_aa = current_aa + line.strip()
    # write the final record, which has no trailing ">" header to trigger it
    if current_id:
        h.write("\t".join(do_the_thing(current_id, current_aa)) + "\n")
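
For reference, the truncated digest can be reproduced with nothing but the standard library. The sketch below mirrors the sha512t24u() function above (full SHA-512 digest, first 24 bytes, base64url-encoded); the peptide string is an arbitrary example, not a UniParc record.

import base64
import hashlib

# Minimal sketch of the sha512t24u construction used above:
# full SHA-512 digest, truncated to its first 24 bytes, then base64url-encoded.
# "MVLSPADKTNVKAAW" is an arbitrary example peptide, not a UniParc record.
seq = "MVLSPADKTNVKAAW"
digest = hashlib.sha512(seq.encode("utf8")).digest()[:24]
print(base64.urlsafe_b64encode(digest).decode("ascii"))
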
#!/bin/bash
# Download each UniParc "active" FASTA part, decompress it, and stream it
# through the hashing script; one TSV is written per part.
getit() {
    curl "https://ftp.uniprot.org/pub/databases/uniprot/current_release/uniparc/fasta/active/uniparc_active_p${1}.fasta.gz" |
        zcat |
        python3 /media/socialgene_nvme/hashing_uniparc.py /media/socialgene_nvme/hash_uniprot/${1}.tsv -
}
export -f getit
parallel -j 12 getit ::: $(seq 1 128)
#!/bin/bash
# Count the unique values in each column of the combined TSV.
# Assumes the per-part TSVs from the download step were concatenated into uniparc_hash_test.tsv.
runit() {
    cut -f ${1} /media/socialgene_nvme/hash_uniprot/uniparc_hash_test.tsv | sort | uniq | wc -l > /media/socialgene_nvme/hash_uniprot/results/size_${1}
}
export -f runit
parallel -j 7 runit ::: $(seq 1 7)
wc -l /media/socialgene_nvme/hash_uniprot/uniparc_hash_test.tsv > /media/socialgene_nvme/hash_uniprot/results/size_truth
chasemc (author) commented on Jun 29, 2023:

Note: this downloads all of UniParc's active FASTA files and, although they are stream-processed, the resulting TSV file is 128 GB.

The following are the counts of unique values in each column of the TSV, along with the identifier or checksum each column holds:
size_1: 517621195 UniParc_id
size_2: 517621195 seguid
size_3: 517619491 crc64
size_4: 487639570 crc32
size_5: 517621195 md5
size_6: 517621195 sha512
size_7: 517621195 sha512t24u
size_truth: 517621195 uniparc_hash_test.tsv
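
Put differently, only the two CRC columns lose any resolution. The snippet below just spells out the subtraction from the counts above: how far each column falls short of the 517,621,195 distinct UniParc ids.

# Shortfall in distinct values per checksum column, from the counts above.
total = 517621195  # distinct UniParc ids (size_1 / size_truth)
unique_counts = {
    "seguid": 517621195,
    "crc64": 517619491,
    "crc32": 487639570,
    "md5": 517621195,
    "sha512": 517621195,
    "sha512t24u": 517621195,
}
for algo, n in unique_counts.items():
    print(f"{algo}: {total - n} fewer distinct values than ids")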

chasemc (author) commented on Jun 29, 2023:

Ran on June 28, 2023 (2023-05-03 UniParc release)
