Let's process the PAF output from miniprot to get some stats:
# Run paf-stats.py on every file matching *paf, writing <name>tsv next to it.
# Paths are quoted, and only the trailing "paf" suffix is swapped for "tsv"
# (${i/paf/tsv} would rewrite the first "paf" anywhere in the file name).
for i in *paf; do
  [ -e "${i}" ] || continue  # glob matched nothing: skip the literal pattern
  python ../paf-stats.py -i "${i}" -o "${i%paf}tsv"
done
Let's process the PAF output from miniprot to get some stats:
# Run paf-stats.py on every file matching *paf, writing <name>tsv next to it.
# Paths are quoted, and only the trailing "paf" suffix is swapped for "tsv"
# (${i/paf/tsv} would rewrite the first "paf" anywhere in the file name).
for i in *paf; do
  [ -e "${i}" ] || continue  # glob matched nothing: skip the literal pattern
  python ../paf-stats.py -i "${i}" -o "${i%paf}tsv"
done
import argparse | |
import gzip | |
from Bio import SeqIO | |
from Bio.SeqFeature import SeqFeature, FeatureLocation | |
from Bio.Seq import Seq | |
from Bio.SeqRecord import SeqRecord | |
def extract_rrna_trna_features(input_file, output_file): | |
# Determine if the input file is gzipped | |
if input_file.endswith(".gz"): |
library(scholar) # to get publications and impact factors | |
library(stringr) # to modify text | |
library(cowplot) # for plotting | |
library(ggplot2) | |
library(ggrepel) | |
library(lemon) | |
library(dplyr) | |
# Set variables | |
Scholar_ID <- "wA7Hrk8AAAAJ" |
|
In our workflow, we use the distinct groups into which NCBI organizes its data. These groups are listed in column 25 of the assembly_summary.txt file, as described here. The groups are as follows:
library(tidyverse) | |
# Read in the data | |
setwd("/maps/projects/lundbeck/scratch/taxDB/v6/metadata/src_files") | |
ncbi_assm_stats <- list.files(".", pattern = "genome_metadata.txt", full.names = TRUE) | |
ncbi_assm_stats <- map_dfr(ncbi_assm_stats, function(X) { | |
read_tsv(X, col_names = TRUE) | |
}) %>% | |
select(-filename) %>% |
from tqdm import tqdm | |
from collections import defaultdict | |
import argparse | |
from concurrent.futures import ThreadPoolExecutor, as_completed | |
from multiprocessing import Pool | |
import gzip | |
from itertools import zip_longest | |
from mimetypes import guess_type | |
from functools import partial | |
from cdifflib import CSequenceMatcher |