Created
June 29, 2023 14:10
-
-
Save chasemc/74292ae0808f2a9fb7843ebec4b93bda to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import base64 | |
import hashlib | |
from Bio.SeqUtils import CheckSum | |
import sys | |
def sha512t24u(input):
    """Return the truncated, URL-safe base64 SHA-512 digest of a string.

    The text is UTF-8 encoded and hashed with SHA-512; only the first
    24 bytes of the digest are kept and rendered as URL-safe base64,
    which yields a 32-character ASCII string.
    See https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7714221/

    No case normalisation is done here; call ``hash_aminos()`` instead to
    upper-case the input before hashing.
    """
    encoded = bytes(input, "utf8")
    truncated = hashlib.sha512(encoded).digest()[:24]
    return base64.urlsafe_b64encode(truncated).decode("ascii")
def hash_aminos(input, **kwargs):
    """Hash a sequence after forcing it to upper case.

    ``input`` is upper-cased and then handed to ``hasher()`` together with
    any extra keyword arguments (e.g. ``algo``), so that sequences that
    differ only in letter case produce the same hash.
    """
    uppercased = input.upper()
    return hasher(input=uppercased, **kwargs)
def use_hashlib(input, algo):
    """Return the hex digest of ``input`` for one of the permitted hashlib algorithms.

    ``input`` is a string that is UTF-8 encoded before hashing; ``algo`` is
    the hashlib algorithm name.

    Raises:
        ValueError: if ``algo`` is not one of the permitted names.
    """
    permitted = ("sha512", "sha256", "sha384", "md5", "sha224")
    if algo not in permitted:
        # TODO: this should also output the sha512t24u and crc algos
        raise ValueError(f"algo must be one of: {permitted}")
    payload = bytes(input, "utf8")
    return hashlib.new(algo, payload).hexdigest()
def hasher(input, algo):
    """Hash ``input`` with the algorithm named by ``algo``.

    ``"sha512t24u"`` is handled by ``sha512t24u()``; ``"crc64"``, ``"crc32"``
    and ``"seguid"`` are delegated to ``Bio.SeqUtils.CheckSum``; any other
    name is passed on to ``use_hashlib()``, which rejects names it does not
    allow.
    """
    if algo == "sha512t24u":
        return sha512t24u(input)
    if algo == "crc64":
        # Drop the "CRC-" label so only the checksum value itself is returned.
        return CheckSum.crc64(input).removeprefix("CRC-")
    if algo == "crc32":
        return CheckSum.crc32(input)
    if algo == "seguid":
        return CheckSum.seguid(input)
    return use_hashlib(input=input, algo=algo)
def do_the_thing(id, input):
    """Build one output row for a sequence.

    Returns a tuple whose first element is ``id`` unchanged, followed by the
    stringified seguid, crc64, crc32, md5, sha512 and sha512t24u hashes of
    ``input``, in that order.
    """
    # Column order of the output row; each name is understood by hasher().
    algorithms = ("seguid", "crc64", "crc32", "md5", "sha512", "sha512t24u")
    return (id, *(str(hasher(input, name)) for name in algorithms))
import fileinput


def read_fasta_records(lines):
    """Yield ``(record_id, sequence)`` pairs from an iterable of FASTA lines.

    A record starts at a line beginning with ``>``; its id is the first
    whitespace-delimited token of that line with the leading ``>`` removed.
    All following non-header lines are stripped and concatenated to form the
    sequence. Reading stops early at a line whose content is exactly
    ``Exit``. Records whose id is empty are skipped.

    The last record is yielded once the input ends (or ``Exit`` is seen);
    previously it was silently dropped because records were only emitted
    when the *next* header was encountered.
    """
    record_id = ""
    chunks = []
    for line in lines:
        if line.rstrip() == "Exit":
            break
        if line.startswith(">"):
            if record_id:
                yield record_id, "".join(chunks)
            # Split on any whitespace so a header without a description does
            # not leave its trailing newline inside the id (which would break
            # the tab-separated output).
            record_id = line.split(maxsplit=1)[0].removeprefix(">")
            chunks = []
        else:
            # Collect pieces and join once instead of repeated concatenation.
            chunks.append(line.strip())
    if record_id:
        yield record_id, "".join(chunks)


def main():
    """Hash every FASTA record from the inputs and write one TSV row each.

    ``sys.argv[1]`` is the output TSV path. Any remaining arguments are the
    input files (``-`` means stdin); with none given, stdin is read.
    """
    with open(sys.argv[1], "w") as handle:
        # Pass the inputs explicitly: fileinput's default is sys.argv[1:],
        # which would also open the output file as an input.
        with fileinput.input(files=sys.argv[2:]) as lines:
            for record_id, sequence in read_fasta_records(lines):
                handle.write("\t".join(do_the_thing(record_id, sequence)) + "\n")


if __name__ == "__main__":
    main()
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
# Download UniParc FASTA chunks and stream each one through the hashing
# script, running 12 downloads at a time with GNU parallel.

# getit N: fetch chunk N, decompress it on the fly, and write the hashes of
# its sequences to /media/socialgene_nvme/hash_uniprot/N.tsv.
getit() {
  # Report a failure in any pipeline stage (e.g. a failed download), not just
  # the last one, so GNU parallel sees a non-zero exit status for the job.
  set -o pipefail
  # ${1} expands to the bare chunk number. The previous "{$1}" put literal
  # braces in the URL and only worked because curl's URL globbing turned
  # "{1}" back into "1".
  # --fail makes curl exit non-zero on an HTTP error instead of piping the
  # error page into zcat.
  curl --fail "https://ftp.uniprot.org/pub/databases/uniprot/current_release/uniparc/fasta/active/uniparc_active_p${1}.fasta.gz" |
    zcat |
    python3 /media/socialgene_nvme/hashing_uniparc.py "/media/socialgene_nvme/hash_uniprot/${1}.tsv" -
}
# Export the function so the shells spawned by GNU parallel can call it.
export -f getit
parallel -j 12 getit ::: $(seq 1 128)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
# Count the distinct values in each of the 7 columns of the hash TSV and
# record the file's total line count for comparison.

# runit N: write the number of unique values in column N to results/size_N.
runit() {
  local column=$1
  local base=/media/socialgene_nvme/hash_uniprot
  cut -f "$column" "$base/uniparc_hash_test.tsv" | sort -u | wc -l > "$base/results/size_${column}"
}
# Export the function so the shells spawned by GNU parallel can call it.
export -f runit
parallel -j 7 runit ::: {1..7}
# Total number of rows in the TSV, to compare the per-column counts against.
wc -l /media/socialgene_nvme/hash_uniprot/uniparc_hash_test.tsv > /media/socialgene_nvme/hash_uniprot/results/size_truth
Ran on June 28, 2023 (2023-05-03 UniParc release)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Note: this downloads all of UniParc's FASTA files. Although the files are stream-processed rather than stored, the resulting TSV file is 128 GB.
The following are the counts of unique values in each column of the TSV, together with the identifier stored in that column:
size_1: 517621195 UniParc_id
size_2: 517621195 seguid
size_3: 517619491 crc64
size_4: 487639570 crc32
size_5: 517621195 md5
size_6: 517621195 sha512
size_7: 517621195 sha512t24u
size_truth: 517621195 uniparc_hash_test.tsv