Nay San fauxneticien

fauxneticien / mock-args.py

Created March 29, 2023 20:37

Create mock arguments for future Python script

	# Stand-in for parsed command-line arguments; swap for argparse (or similar) later
	class Args:
	    """Attribute bag: every keyword argument becomes an instance attribute."""

	    def __init__(self, **attrs):
	        # Expose each supplied key as `args.<key>`
	        for key, value in attrs.items():
	            setattr(self, key, value)

	args = Args(**{
	"input_basepath" : "/home/nay/git-repos/w2v2-10min-exps/data/MASS-1h",
	"train_tsv" : "train1h.tsv",
	"valid_tsv" : "dev.tsv",
	"output_basepath" : "/home/nay/librispeech-1h-fairseq/manifest-MASS-1h"

fauxneticien / fairseq-asr.Dockerfile

Created March 28, 2023 23:56

Dockerfile for fauxneticien/fairseq-asr image

	# NVIDIA official container with CUDA 11.7 (note pytorch-cuda=11.7 in PyTorch install below)
	# See https://docs.nvidia.com/deeplearning/frameworks/support-matrix/index.html
	FROM nvcr.io/nvidia/pytorch:22.05-py3

	# Update container PyTorch from 1.9 to 1.13.1
	RUN conda install pytorch==1.13.1 \
	torchvision==0.14.1 \
	torchaudio==0.13.1 \
	pytorch-cuda=11.7 \
	-c pytorch \

fauxneticien / reddit.R

Created November 29, 2022 19:20

Scrape from Reddit API

	library(httr)
	library(purrr)
	library(tibble)

	# See also https://bookdown.org/paul/apis_for_social_scientists/reddit-api.html

	# Endpoint: the `new.json` listing of r/mentalhealth, with query parameters
	# t=day and limit=100
	url <- 'https://www.reddit.com/r/mentalhealth/new.json?t=day&limit=100'

	# Send the GET request via httr with an explicit User-Agent string and keep
	# the returned response object for later processing
	response <- GET(url, user_agent('Extracting data from Reddit'))

fauxneticien / tqdm-mp.py

Created November 10, 2022 19:05

MWE for tqdm multiprocessing

	from multiprocessing import Pool
	import tqdm
	import time

	def _foo(my_number):
	square = my_number * my_number
	time.sleep(1)
	return square

	if __name__ == '__main__':

fauxneticien / convert_wav2vec2_original_pytorch_checkpoint_to_pytorch.py

Created November 5, 2022 18:58

Try to fix HuggingFace's conversion script

	# coding=utf-8
	# Copyright 2021 The HuggingFace Inc. team.
	#
	# Licensed under the Apache License, Version 2.0 (the "License");
	# you may not use this file except in compliance with the License.
	# You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing, software

fauxneticien / eval_torchaudio-decoder.py

Last active November 1, 2022 18:04

Evaluate wav2vec 2.0 model with language model using torchaudio's CTC decoder

	import torch
	import torchaudio

	from torchaudio.models.decoder import ctc_decoder
	from typing import List

	# Pick the WAV2VEC2_ASR_LARGE_10M pipeline bundle, build its model, and
	# place that model on the 'cuda' device in one step
	bundle = torchaudio.pipelines.WAV2VEC2_ASR_LARGE_10M
	acoustic_model = bundle.get_model().to('cuda')

fauxneticien / ms_to_hms.py

Created October 2, 2022 23:30

Convert milliseconds to hours-minutes-seconds string format to use within identifiers

	def ms_to_hms(start_ms: int) -> str:
	import math

	"""
	Convert milliseconds to hours-minutes-seconds string format to use within identifiers

	e.g. 3990060 ms -> '01h06m30.060'
	"""

	s, ms = divmod(start_ms, 1000)

fauxneticien / get_vocab.py

Created September 3, 2022 17:35

Script to process vocabulary

	import pandas as pd

	from collections import Counter
	from tqdm.contrib.concurrent import process_map

	def get_vocab(texts_list, ids_list=None):

	def sum_counters(counter_list):

	'''

fauxneticien / const.py

Created June 22, 2022 12:21

Python via phonology

	# Abstract class ('phoneme')
	class Consonant:
	# Class attributes assigned when initialised ('realised')
	def __init__(self, place_of_art, manner_of_art):
	self.place_of_art = place_of_art
	self.manner_of_art = manner_of_art

	def lenite(self):
	if self.manner_of_art == 'stop':
	self.manner_of_art = 'fricative'

fauxneticien / expand_grid.py

Created June 4, 2022 16:03

Python equivalent of R's expand.grid

	# Adapted from https://stackoverflow.com/questions/12130883/r-expand-grid-function-in-python
	#
	# Usage:
	# expand_grid([0, 1], [2,3,4])
	#
	# Output:
	# array([[0, 2],
	# [1, 2],
	# [0, 3],
	# [1, 3],