Last active October 2, 2024 16:57
Use Dask to write a dataset to Hugging Face in a distributed manner
import math
import tempfile
from functools import partial
import dask.dataframe as dd
import pandas as pd
from huggingface_hub import CommitOperationAdd, HfFileSystem
def _preupload(df: pd.DataFrame, path: str, filesystem: HfFileSystem, **kwargs) -> pd.DataFrame:
Created March 30, 2023 15:47
GitHub: Clone, Checkout and open VSCode to PR from its URL
# Print one value from the JSON document read on stdin.
# $1 is a Python subscript expression (e.g. "['head']['repo']['name']") that is
# appended verbatim to the parsed JSON object, so it must come from a trusted
# caller, never from user input.
function getJsonVal () {
    python -c "import json,sys;sys.stdout.write(str(json.load(sys.stdin)$1))";
}

# NOTE(review): prUrl is not assigned anywhere in the visible lines — confirm
# it is set (presumably to the pull request's web URL) before this point.

# Turn the PR web URL into its REST API URL:
#   https://github.com/<owner>/<repo>/pull/<n>
#   -> https://api.github.com/repos/<owner>/<repo>/pulls/<n>
# Only the "/pull/" path segment is rewritten so that an owner or repository
# name containing "pull" is left untouched.
apiUrl=$(echo "$prUrl" | sed -e 's#/pull/#/pulls/#' -e 's#github\.com#api.github.com/repos#')

# Clone URL of the repository the PR targets: replace "/pull/<n>" with ".git".
upstreamUrl=$(echo "$prUrl" | sed -E -e 's/\/pull\/[0-9]+/.git/g')

# Fetch the PR metadata once and reuse it for every lookup below.
prData=$(curl "$apiUrl")

# Quote "$prData" so the JSON reaches the parser without word-splitting or
# glob expansion by the shell.
userName=$(echo "$prData" | getJsonVal "['head']['repo']['owner']['login']")
repoName=$(echo "$prData" | getJsonVal "['head']['repo']['name']")
repoFullName=$(echo "$prData" | getJsonVal "['head']['repo']['full_name']")
Created June 15, 2020 19:08
English Wikipedia length
# Sum the lengths of the "text" field over every article in the English
# Wikipedia dump (config '20200501.en'), reading the dataset in fixed-size
# slices so only one batch of articles is materialised at a time.
from nlp import load_dataset
from tqdm import tqdm

wiki = load_dataset('wikipedia', '20200501.en', split="train")
batch_size = 1000
total_length = 0
for i in tqdm(range(0, len(wiki), batch_size)):  # loop takes ~1min to run
    # Slicing the dataset yields a mapping of column name -> list of values
    # for the rows in [i, i + batch_size).
    batch = wiki[i:i + batch_size]
    total_length += sum(len(sample_text) for sample_text in batch["text"])