Skip to content

Instantly share code, notes, and snippets.

View fauxneticien's full-sized avatar

Nay San fauxneticien

View GitHub Profile
@fauxneticien
fauxneticien / mock-args.py
Created March 29, 2023 20:37
Create mock arguments for future Python script
# Mock Args class, to be replaced with argparse or equivalent later
class Args:
    """Minimal attribute bag standing in for parsed command-line arguments.

    Every keyword argument given to the constructor becomes an instance
    attribute of the same name, so ``Args(train_tsv="train.tsv").train_tsv``
    evaluates to ``"train.tsv"``. Intended as a temporary stand-in until a
    real parser (e.g. ``argparse``) is wired in.
    """

    def __init__(self, **attrs):
        # Copy the keywords straight into the instance namespace so they can
        # be read back with plain attribute access.
        self.__dict__.update(attrs)

    def __repr__(self):
        # Show the stored attributes to make interactive debugging easier.
        fields = ", ".join(
            "{}={!r}".format(key, value) for key, value in self.__dict__.items()
        )
        return "{}({})".format(type(self).__name__, fields)
args = Args(**{
"input_basepath" : "/home/nay/git-repos/w2v2-10min-exps/data/MASS-1h",
"train_tsv" : "train1h.tsv",
"valid_tsv" : "dev.tsv",
"output_basepath" : "/home/nay/librispeech-1h-fairseq/manifest-MASS-1h"
@fauxneticien
fauxneticien / fairseq-asr.Dockerfile
Created March 28, 2023 23:56
Dockerfile for fauxneticien/fairseq-asr image
# NVIDIA official container with CUDA 11.7 (note pytorch-cuda=11.7 in PyTorch install below)
# See https://docs.nvidia.com/deeplearning/frameworks/support-matrix/index.html
FROM nvcr.io/nvidia/pytorch:22.05-py3
# Update container PyTorch from 1.9 to 1.13.1
RUN conda install pytorch==1.13.1 \
torchvision==0.14.1 \
torchaudio==0.13.1 \
pytorch-cuda=11.7 \
-c pytorch \
@fauxneticien
fauxneticien / reddit.R
Created November 29, 2022 19:20
Scrape from Reddit API
library(httr)
library(purrr)
library(tibble)
# See also https://bookdown.org/paul/apis_for_social_scientists/reddit-api.html
url <- 'https://www.reddit.com/r/mentalhealth/new.json?t=day&limit=100'
response <- GET(url, user_agent('Extracting data from Reddit'))
@fauxneticien
fauxneticien / tqdm-mp.py
Created November 10, 2022 19:05
MWE for tqdm multiprocessing
from multiprocessing import Pool
import tqdm
import time
def _foo(my_number):
square = my_number * my_number
time.sleep(1)
return square
if __name__ == '__main__':
@fauxneticien
fauxneticien / convert_wav2vec2_original_pytorch_checkpoint_to_pytorch.py
Created November 5, 2022 18:58
Try to fix HuggingFace's conversion script
# coding=utf-8
# Copyright 2021 The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
@fauxneticien
fauxneticien / eval_torchaudio-decoder.py
Last active November 1, 2022 18:04
Evaluate wav2vec 2.0 model with language model using torchaudio's CTC decoder
import torch
import torchaudio
from torchaudio.models.decoder import ctc_decoder
from typing import List
bundle = torchaudio.pipelines.WAV2VEC2_ASR_LARGE_10M
acoustic_model = bundle.get_model()
acoustic_model.to('cuda')
@fauxneticien
fauxneticien / ms_to_hms.py
Created October 2, 2022 23:30
Convert milliseconds to hours-minutes-seconds string format to use within identifiers
def ms_to_hms(start_ms: int) -> str:
import math
"""
Convert milliseconds to hours-minutes-seconds string format to use within identifiers
e.g. 3990060 ms -> '01h06m30.060'
"""
s, ms = divmod(start_ms, 1000)
@fauxneticien
fauxneticien / get_vocab.py
Created September 3, 2022 17:35
Script to process vocabulary
import pandas as pd
from collections import Counter
from tqdm.contrib.concurrent import process_map
def get_vocab(texts_list, ids_list=None):
def sum_counters(counter_list):
'''
@fauxneticien
fauxneticien / const.py
Created June 22, 2022 12:21
Python via phonology
# Abstract class ('phoneme')
class Consonant:
# Class attributes assigned when initialised ('realised')
def __init__(self, place_of_art, manner_of_art):
self.place_of_art = place_of_art
self.manner_of_art = manner_of_art
def lenite(self):
if self.manner_of_art == 'stop':
self.manner_of_art = 'fricative'
@fauxneticien
fauxneticien / expand_grid.py
Created June 4, 2022 16:03
Python equivalent of R's expand.grid
# Adapted from https://stackoverflow.com/questions/12130883/r-expand-grid-function-in-python
#
# Usage:
# expand_grid([0, 1], [2,3,4])
#
# Output:
# array([[0, 2],
# [1, 2],
# [0, 3],
# [1, 3],