Skip to content

Instantly share code, notes, and snippets.

@jamescalam
Created June 11, 2021 15:22
Show Gist options
  • Save jamescalam/aa60ff5cf436ef58f94eb945ae2f266e to your computer and use it in GitHub Desktop.
Save jamescalam/aa60ff5cf436ef58f94eb945ae2f266e to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"output_type": "stream",
"name": "stderr",
"text": [
"100%|██████████| 28522082/28522082 [33:32<00:00, 14173.48it/s]\n"
]
}
],
"source": [
"from tqdm.auto import tqdm\n",
"\n",
"text_data = []\n",
"file_count = 0\n",
"\n",
"for sample in tqdm(dataset['train']):\n",
"    sample = sample['text'].replace('\\n', '')\n",
"    text_data.append(sample)\n",
"    if len(text_data) == 10_000:\n",
"        # once we hit the 10K mark, save to file\n",
"        with open(f'../../data/text/oscar_it/text_{file_count}.txt', 'w', encoding='utf-8') as fp:\n",
"            fp.write('\\n'.join(text_data))\n",
"        text_data = []\n",
"        file_count += 1\n",
"# after saving in 10K chunks, ~2082 samples are left over; we save those now too\n",
"with open(f'../../data/text/oscar_it/text_{file_count}.txt', 'w', encoding='utf-8') as fp:\n",
"    fp.write('\\n'.join(text_data))"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "ML",
"language": "python",
"name": "ml"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.5"
}
},
"nbformat": 4,
"nbformat_minor": 4
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment