Skip to content

Instantly share code, notes, and snippets.

View chasemc's full-sized avatar
:octocat:

Chase Clark chasemc

:octocat:
View GitHub Profile
@chasemc
chasemc / download_all_refseq_gbff.py
Last active March 2, 2023 18:01
If you do this, follow NCBI's guidelines on when to do large downloads. This script won't check your available hard drive space, and **RefSeq** alone will require > 1 Terabyte.
#!/usr/bin/env python
# Import Module
import argparse
import ftplib
from tqdm import tqdm
import requests
from pathlib import Path
import logging
import multiprocessing
import csv
@chasemc
chasemc / a
Last active March 2, 2023 16:21
# See ftp://ftp.ncbi.nlm.nih.gov/genomes/README_assembly_summary.txt for a description of the columns in this file.
# assembly_accession bioproject biosample wgs_master refseq_category taxid species_taxid organism_name infraspecific_name isolate version_status assembly_level release_type genome_rep seq_rel_date asm_name submitter gbrs_paired_asm paired_asm_comp ftp_path excluded_from_refseq relation_to_type_material asm_not_live_date
GCF_000001215.4 PRJNA164 SAMN02803731 reference genome 7227 7227 Drosophila melanogaster latest Chromosome Major Full 2014/08/01 Release 6 plus ISO1 MT The FlyBase Consortium/Berkeley Drosophila Genome Project/Celera Genomics GCA_000001215.4 identical https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/001/215/GCF_000001215.4_Release_6_plus_ISO1_MT na
GCF_000001405.40 PRJNA168 reference genome 9606 9606 Homo sapiens latest Chromosome Patch Full 2022/02/03 GRCh38.p14 Genome Reference Consortium GCA_000001405.29 different https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/001
#!/usr/bin/env python3
# script to find and download genomes associated with mibig
from pathlib import Path
import os
import hashlib
import requests
import csv
import argparse
from collections import defaultdict
from pathlib import Path
prompt_data = "/home/chase/Downloads/input.txt"
with open(prompt_data) as fp:
lines = fp.readlines()
cleaned=[]
@chasemc
chasemc / b
Created November 8, 2022 20:28
<?xml version='1.0' encoding='UTF-8'?>
<rdf:RDF xml:base="http://purl.uniprot.org/uniprot/" xmlns="http://purl.uniprot.org/core/" xmlns:ECO="http://purl.obolibrary.org/obo/ECO_" xmlns:annotation="http://purl.uniprot.org/annotation/" xmlns:citation="http://purl.uniprot.org/citations/" xmlns:dcterms="http://purl.org/dc/terms/" xmlns:disease="http://purl.uniprot.org/diseases/" xmlns:enzyme="http://purl.uniprot.org/enzyme/" xmlns:faldo="http://biohackathon.org/resource/faldo#" xmlns:foaf="http://xmlns.com/foaf/0.1/" xmlns:go="http://purl.obolibrary.org/obo/GO_" xmlns:isoform="http://purl.uniprot.org/isoforms/" xmlns:keyword="http://purl.uniprot.org/keywords/" xmlns:location="http://purl.uniprot.org/locations/" xmlns:owl="http://www.w3.org/2002/07/owl#" xmlns:position="http://purl.uniprot.org/position/" xmlns:pubmed="http://purl.uniprot.org/pubmed/" xmlns:range="http://purl.uniprot.org/range/" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:rdfs="http://www.w3.org/2000/01/rdf-schema#" xmlns:skos="htt
@chasemc
chasemc / a
Created November 8, 2022 20:23
This file has been truncated, but you can view the full file.
<?xml version='1.0' encoding='UTF-8'?>
<rdf:RDF xmlns="http://purl.uniprot.org/core/" xmlns:dcterms="http://purl.org/dc/terms/" xmlns:foaf="http://xmlns.com/foaf/0.1/" xmlns:owl="http://www.w3.org/2002/07/owl#" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:rdfs="http://www.w3.org/2000/01/rdf-schema#" xmlns:skos="http://www.w3.org/2004/02/skos/core#">
<owl:Ontology rdf:about="">
<owl:imports rdf:resource="http://purl.uniprot.org/core/"/>
</owl:Ontology>
<rdf:Description rdf:about="http://purl.uniprot.org/enzyme/1.-.-.-">
<rdf:type rdf:resource="http://purl.uniprot.org/core/Enzyme"/>
<skos:prefLabel>Oxidoreductases</skos:prefLabel>
<rdfs:subClassOf rdf:resource="http://purl.uniprot.org/core/Enzyme"/>
<skos:narrowerTransitive rdf:resource="http://purl.uniprot.org/enzyme/1.1.-.-"/>
#!/usr/bin/env Rscript
if (!require("remotes")) {
install.packages("remotes")
library(remotes)
}
if (!require("DBI")) {
install.packages("DBI")
library(DBI)
}
if (!require("RSQLite")) {
This file has been truncated, but you can view the full file.
HMMER3/f [3.1b2 | February 2015]
NAME adh_short
ACC PF00106.28
DESC short chain dehydrogenase
LENG 195
ALPH amino
RF no
MM no
CONS yes
CS yes
@chasemc
chasemc / skimr.r
Created September 20, 2022 18:26
# Quick look at the column structure of NCBI's GenBank assembly summary.
library(data.table)
library(skimr)

# fread() runs this string as a shell command: curl streams the summary file
# and `head -n 20` keeps only its first 20 lines, so just a small sample of
# the table is parsed.
temp <- fread(
  "curl -s https://ftp.ncbi.nlm.nih.gov/genomes/ASSEMBLY_REPORTS/assembly_summary_genbank.txt | head -n 20",
  sep = "\t",
  # Spell out TRUE: `T` is an ordinary variable that can be reassigned,
  # whereas TRUE is a reserved word.
  header = TRUE
)

# Print a per-column summary of the sampled rows.
skim(temp)
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.