Skip to content

Instantly share code, notes, and snippets.

View apcamargo's full-sized avatar
🦖

Antônio Camargo apcamargo

🦖
  • University of São Paulo
  • São Paulo, SP, Brazil
  • X @apcamargo_
View GitHub Profile
from bisect import bisect_left, bisect_right
from collections import defaultdict
from math import floor, sqrt
def iter_similar_sets(a, b, metric="jaccard", min_similarity=0.0):
"""
Yield pairs of dictionary keys whose set similarity meets a threshold.
Parameters
#!/usr/bin/env python
import argparse
import math
import sys
from dataclasses import dataclass
from pathlib import Path
ANY_BACK = 1e-5
#!/usr/bin/env python
import shutil
from pathlib import Path
from typing import Literal
import pyarrow as pa
import pyarrow.parquet as pq
@apcamargo
apcamargo / find_cutoff.py
Last active October 11, 2025 19:52
Automatic cutoff determination for an arbitrary distribution
from typing import Sequence
import math
def find_cutoff(values: Sequence[float]) -> float:
"""
Determine the cutoff point in a biphasic distribution curve by identifying
the "bending point" where the curve transitions from slowly growing values
to rapidly growing values, using the maximum perpendicular distance method.
@apcamargo
apcamargo / query_sra_duckdb.sh
Created July 13, 2025 21:11
Query SRA metadata stored as Parquet files in S3 using DuckDB
duckdb -c "
INSTALL httpfs;
LOAD httpfs;
INSTALL parquet;
LOAD parquet;
COPY (
SELECT *
FROM read_parquet('s3://sra-pub-metadata-us-east-1/sra/metadata/*')
) TO STDOUT WITH (FORMAT CSV, DELIMITER E'\t', HEADER);"
@apcamargo
apcamargo / download_mg_rast.py
Last active July 12, 2025 21:20
Downloads all the assembled metagenomes available in MG-RAST
#!/usr/bin/env python
import json
import re
import sys
from typing import Generator, Dict, Any, Optional
import requests
from tqdm import tqdm
from pathlib import Path
from typing import Iterator, Optional, Union
import polars as pl
from needletail import parse_fastx_file
from polars.io.plugins import register_io_source
def scan_fastx(fastx_file: Union[str, Path]) -> pl.LazyFrame:
schema = pl.Schema(
@apcamargo
apcamargo / sam2tsv.py
Created March 10, 2025 03:53
Converts alignments stored in the SAM format to a BLAST-like table
#!/usr/bin/env python
"""
This script processes SAM (Sequence Alignment/Map format) inputs from standard
input and extracts alignment information that is then provided in a tab-separated
table. The following fields are produced: query, target, query_length, query_start,
query_end, target_start, target_end, alignment_length, alignment_identity.
This script was designed for use with SAM files produced by minimap2. However,
it will work with any SAM data that:
@apcamargo
apcamargo / calculate_neff.py
Created November 17, 2024 04:59
Calculate the number of effective sequences (Neff) of an A3M multiple sequence alignment
#!/usr/bin/env python
import math
import re
import click
from scipy.cluster.hierarchy import fcluster, linkage
from skbio import DistanceMatrix, Protein, TabularMSA, io
from skbio.sequence.distance import hamming
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.