Let's process the PAF output from miniprot to get some stats:
for i in *paf; do python ../paf-stats.py -i ${i} -o ${i/paf/tsv} ; done
| import argparse | |
| import gzip | |
| from Bio import SeqIO | |
| from Bio.SeqFeature import SeqFeature, FeatureLocation | |
| from Bio.Seq import Seq | |
| from Bio.SeqRecord import SeqRecord | |
| def extract_rrna_trna_features(input_file, output_file): | |
| # Determine if the input file is gzipped | |
| if input_file.endswith(".gz"): |
| library(scholar) # to get publications and impact factors | |
| library(stringr) # to modify text | |
| library(cowplot) # for plotting | |
| library(ggplot2) | |
| library(ggrepel) | |
| library(lemon) | |
| library(dplyr) | |
| # Set variables | |
| Scholar_ID <- "wA7Hrk8AAAAJ" |
| |
In our workflow, we use the distinct groups into which NCBI organizes its data. These groups are listed in column 25 of the assembly_summary.txt file, as described here. The groups are as follows:
| library(tidyverse) | |
| # Read in the data | |
| setwd("/maps/projects/lundbeck/scratch/taxDB/v6/metadata/src_files") | |
| ncbi_assm_stats <- list.files(".", pattern = "genome_metadata.txt", full.names = TRUE) | |
| ncbi_assm_stats <- map_dfr(ncbi_assm_stats, function(X) { | |
| read_tsv(X, col_names = TRUE) | |
| }) %>% | |
| select(-filename) %>% |
| from tqdm import tqdm | |
| from collections import defaultdict | |
| import argparse | |
| from concurrent.futures import ThreadPoolExecutor, as_completed | |
| from multiprocessing import Pool | |
| import gzip | |
| from itertools import zip_longest | |
| from mimetypes import guess_type | |
| from functools import partial | |
| from cdifflib import CSequenceMatcher |