Skip to content

Instantly share code, notes, and snippets.

@alantian
Created June 20, 2017 07:14
Show Gist options
  • Save alantian/77f150d756854b719031c31004e87b60 to your computer and use it in GitHub Desktop.
Save alantian/77f150d756854b719031c31004e87b60 to your computer and use it in GitHub Desktop.
convert files to UTF-8, recursively.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"ExecuteTime": {
"end_time": "2017-06-20T06:08:16.054908Z",
"start_time": "2017-06-20T06:08:15.995673Z"
},
"collapsed": true
},
"outputs": [],
"source": [
"import chardet\n",
"import os\n",
"import re\n",
"\n",
"from joblib import Parallel, delayed"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {
"ExecuteTime": {
"end_time": "2017-06-20T06:12:27.970634Z",
"start_time": "2017-06-20T06:12:27.962524Z"
},
"collapsed": true
},
"outputs": [],
"source": [
"def read_with_encoding_guessing(filename):\n",
"    \"\"\"Read a text file, decoding it with the encoding guessed by chardet.\n",
"\n",
"    Args:\n",
"        filename: Path of the file to read.\n",
"\n",
"    Returns:\n",
"        A dict with the decoded text under 'content' and 'success' set to\n",
"        True when the file could be decoded; otherwise a dict holding only\n",
"        'success' set to False.\n",
"    \"\"\"\n",
"    # Context managers close both handles promptly; this function is called\n",
"    # once per input file.\n",
"    with open(filename, 'rb') as raw_file:\n",
"        rawdata = raw_file.read()\n",
"    encoding = chardet.detect(rawdata)['encoding']\n",
"    if encoding is None and rawdata:\n",
"        # No guess for a non-empty file: report failure instead of letting\n",
"        # open() silently fall back to the locale's default encoding.\n",
"        return {'success': False}\n",
"    try:\n",
"        with open(filename, encoding=encoding) as text_file:\n",
"            content = text_file.read()\n",
"    except (UnicodeDecodeError, LookupError):\n",
"        # LookupError: the guessed encoding name has no Python codec.\n",
"        return {'success': False}\n",
"    return {'content': content, 'success': True}"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {
"ExecuteTime": {
"end_time": "2017-06-20T06:12:46.062942Z",
"start_time": "2017-06-20T06:12:46.009853Z"
},
"collapsed": true
},
"outputs": [],
"source": [
"input_dir = 'text-data/Text/Plain/'\n",
"output_dir = 'text-data-utf8/'\n",
"os.makedirs(output_dir, exist_ok=True)\n",
"\n",
"# Every file found anywhere below input_dir whose name does not start with a dot.\n",
"filepath_list = [\n",
"    os.path.join(dir_path, filename)\n",
"    for dir_path, _, filename_list in os.walk(input_dir)\n",
"    for filename in filename_list\n",
"    if not filename.startswith('.')\n",
"]\n",
"\n",
"def go(filepath):\n",
"    \"\"\"Convert one input file to UTF-8 inside ``output_dir``.\n",
"\n",
"    The output name is the file's path relative to ``input_dir`` with the\n",
"    path separators replaced by ``-``, so the output directory is flat.\n",
"\n",
"    Args:\n",
"        filepath: Path of a file located under ``input_dir``.\n",
"\n",
"    Returns:\n",
"        True if the file was decoded and written, False if it was skipped\n",
"        because its content could not be decoded.\n",
"    \"\"\"\n",
"    decoded_data = read_with_encoding_guessing(filepath)\n",
"    if not decoded_data['success']:\n",
"        return False\n",
"    # relpath strips only the leading input_dir; str.replace would also remove\n",
"    # any later occurrence of that text inside the path.\n",
"    relative_path = os.path.relpath(filepath, input_dir)\n",
"    output_filepath = os.path.join(output_dir, relative_path.replace(os.sep, '-'))\n",
"    # Pin the output encoding: without it open() uses the locale default,\n",
"    # which is not guaranteed to be UTF-8.\n",
"    with open(output_filepath, 'w', encoding='utf-8') as fout:\n",
"        # write() keeps the text as-is; print() would append an extra newline.\n",
"        fout.write(decoded_data['content'])\n",
"    return True"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {
"ExecuteTime": {
"end_time": "2017-06-20T06:23:52.804702Z",
"start_time": "2017-06-20T06:12:46.650828Z"
},
"collapsed": false
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"[Parallel(n_jobs=-1)]: Batch computation too fast (0.0280s.) Setting batch_size=14.\n",
"[Parallel(n_jobs=-1)]: Done 2 tasks | elapsed: 0.2s\n",
"[Parallel(n_jobs=-1)]: Done 9 tasks | elapsed: 2.8s\n",
"[Parallel(n_jobs=-1)]: Done 16 tasks | elapsed: 5.8s\n",
"[Parallel(n_jobs=-1)]: Batch computation too slow (6.9398s.) Setting batch_size=7.\n",
"[Parallel(n_jobs=-1)]: Done 142 tasks | elapsed: 23.8s\n",
"[Parallel(n_jobs=-1)]: Batch computation too slow (24.3416s.) Setting batch_size=3.\n",
"[Parallel(n_jobs=-1)]: Done 247 tasks | elapsed: 35.8s\n",
"[Parallel(n_jobs=-1)]: Batch computation too slow (10.6953s.) Setting batch_size=1.\n",
"[Parallel(n_jobs=-1)]: Done 323 tasks | elapsed: 44.7s\n",
"[Parallel(n_jobs=-1)]: Done 348 tasks | elapsed: 48.0s\n",
"[Parallel(n_jobs=-1)]: Done 367 tasks | elapsed: 52.9s\n",
"[Parallel(n_jobs=-1)]: Done 386 tasks | elapsed: 54.1s\n",
"[Parallel(n_jobs=-1)]: Done 401 tasks | elapsed: 57.0s\n",
"[Parallel(n_jobs=-1)]: Done 416 tasks | elapsed: 59.5s\n",
"[Parallel(n_jobs=-1)]: Done 433 tasks | elapsed: 1.0min\n",
"[Parallel(n_jobs=-1)]: Done 450 tasks | elapsed: 1.1min\n",
"[Parallel(n_jobs=-1)]: Done 469 tasks | elapsed: 1.1min\n",
"[Parallel(n_jobs=-1)]: Done 488 tasks | elapsed: 1.1min\n",
"[Parallel(n_jobs=-1)]: Done 509 tasks | elapsed: 1.2min\n",
"[Parallel(n_jobs=-1)]: Done 530 tasks | elapsed: 1.3min\n",
"[Parallel(n_jobs=-1)]: Done 553 tasks | elapsed: 1.4min\n",
"[Parallel(n_jobs=-1)]: Done 576 tasks | elapsed: 1.5min\n",
"[Parallel(n_jobs=-1)]: Done 601 tasks | elapsed: 1.6min\n",
"[Parallel(n_jobs=-1)]: Done 626 tasks | elapsed: 1.6min\n",
"[Parallel(n_jobs=-1)]: Done 653 tasks | elapsed: 1.7min\n",
"[Parallel(n_jobs=-1)]: Done 680 tasks | elapsed: 1.7min\n",
"[Parallel(n_jobs=-1)]: Done 709 tasks | elapsed: 1.8min\n",
"[Parallel(n_jobs=-1)]: Done 738 tasks | elapsed: 1.8min\n",
"[Parallel(n_jobs=-1)]: Done 769 tasks | elapsed: 1.9min\n",
"[Parallel(n_jobs=-1)]: Done 800 tasks | elapsed: 1.9min\n",
"[Parallel(n_jobs=-1)]: Batch computation too fast (0.1813s.) Setting batch_size=2.\n",
"[Parallel(n_jobs=-1)]: Done 842 tasks | elapsed: 2.0min\n",
"[Parallel(n_jobs=-1)]: Done 907 tasks | elapsed: 2.0min\n",
"[Parallel(n_jobs=-1)]: Batch computation too slow (2.0899s.) Setting batch_size=1.\n",
"[Parallel(n_jobs=-1)]: Done 963 tasks | elapsed: 2.1min\n",
"[Parallel(n_jobs=-1)]: Done 1000 tasks | elapsed: 2.1min\n",
"[Parallel(n_jobs=-1)]: Done 1037 tasks | elapsed: 2.1min\n",
"[Parallel(n_jobs=-1)]: Done 1074 tasks | elapsed: 2.2min\n",
"[Parallel(n_jobs=-1)]: Done 1113 tasks | elapsed: 2.2min\n",
"[Parallel(n_jobs=-1)]: Done 1152 tasks | elapsed: 2.3min\n",
"[Parallel(n_jobs=-1)]: Done 1193 tasks | elapsed: 2.3min\n",
"[Parallel(n_jobs=-1)]: Done 1234 tasks | elapsed: 2.4min\n",
"[Parallel(n_jobs=-1)]: Done 1277 tasks | elapsed: 2.5min\n",
"[Parallel(n_jobs=-1)]: Done 1320 tasks | elapsed: 2.6min\n",
"[Parallel(n_jobs=-1)]: Done 1365 tasks | elapsed: 2.7min\n",
"[Parallel(n_jobs=-1)]: Done 1410 tasks | elapsed: 2.8min\n",
"[Parallel(n_jobs=-1)]: Done 1457 tasks | elapsed: 2.9min\n",
"[Parallel(n_jobs=-1)]: Done 1504 tasks | elapsed: 2.9min\n",
"[Parallel(n_jobs=-1)]: Done 1553 tasks | elapsed: 3.1min\n",
"[Parallel(n_jobs=-1)]: Done 1602 tasks | elapsed: 3.4min\n",
"[Parallel(n_jobs=-1)]: Done 1653 tasks | elapsed: 3.6min\n",
"[Parallel(n_jobs=-1)]: Done 1704 tasks | elapsed: 3.6min\n",
"[Parallel(n_jobs=-1)]: Done 1757 tasks | elapsed: 3.6min\n",
"[Parallel(n_jobs=-1)]: Done 1810 tasks | elapsed: 3.6min\n",
"[Parallel(n_jobs=-1)]: Done 1865 tasks | elapsed: 3.7min\n",
"[Parallel(n_jobs=-1)]: Done 1920 tasks | elapsed: 3.7min\n",
"[Parallel(n_jobs=-1)]: Batch computation too fast (0.1885s.) Setting batch_size=2.\n",
"[Parallel(n_jobs=-1)]: Done 1977 tasks | elapsed: 3.7min\n",
"[Parallel(n_jobs=-1)]: Done 2078 tasks | elapsed: 3.8min\n",
"[Parallel(n_jobs=-1)]: Done 2196 tasks | elapsed: 3.9min\n",
"[Parallel(n_jobs=-1)]: Done 2314 tasks | elapsed: 4.0min\n",
"[Parallel(n_jobs=-1)]: Done 2436 tasks | elapsed: 4.1min\n",
"[Parallel(n_jobs=-1)]: Batch computation too slow (2.0347s.) Setting batch_size=1.\n",
"[Parallel(n_jobs=-1)]: Done 2518 tasks | elapsed: 4.1min\n",
"[Parallel(n_jobs=-1)]: Done 2581 tasks | elapsed: 4.2min\n",
"[Parallel(n_jobs=-1)]: Done 2644 tasks | elapsed: 4.3min\n",
"[Parallel(n_jobs=-1)]: Done 2709 tasks | elapsed: 4.4min\n",
"[Parallel(n_jobs=-1)]: Done 2774 tasks | elapsed: 4.5min\n",
"[Parallel(n_jobs=-1)]: Done 2841 tasks | elapsed: 4.5min\n",
"[Parallel(n_jobs=-1)]: Batch computation too fast (0.1907s.) Setting batch_size=2.\n",
"[Parallel(n_jobs=-1)]: Done 2910 tasks | elapsed: 4.5min\n",
"[Parallel(n_jobs=-1)]: Batch computation too slow (2.0549s.) Setting batch_size=1.\n",
"[Parallel(n_jobs=-1)]: Done 3039 tasks | elapsed: 4.7min\n",
"[Parallel(n_jobs=-1)]: Done 3109 tasks | elapsed: 4.8min\n",
"[Parallel(n_jobs=-1)]: Done 3180 tasks | elapsed: 4.9min\n",
"[Parallel(n_jobs=-1)]: Done 3251 tasks | elapsed: 5.0min\n",
"[Parallel(n_jobs=-1)]: Done 3324 tasks | elapsed: 5.2min\n",
"[Parallel(n_jobs=-1)]: Done 3397 tasks | elapsed: 5.2min\n",
"[Parallel(n_jobs=-1)]: Done 3472 tasks | elapsed: 5.3min\n",
"[Parallel(n_jobs=-1)]: Done 3547 tasks | elapsed: 5.3min\n",
"[Parallel(n_jobs=-1)]: Done 3624 tasks | elapsed: 5.4min\n",
"[Parallel(n_jobs=-1)]: Done 3701 tasks | elapsed: 5.4min\n",
"[Parallel(n_jobs=-1)]: Done 3780 tasks | elapsed: 5.5min\n",
"[Parallel(n_jobs=-1)]: Done 3859 tasks | elapsed: 5.7min\n",
"[Parallel(n_jobs=-1)]: Done 3940 tasks | elapsed: 5.8min\n",
"[Parallel(n_jobs=-1)]: Done 4021 tasks | elapsed: 5.8min\n",
"[Parallel(n_jobs=-1)]: Done 4104 tasks | elapsed: 5.9min\n",
"[Parallel(n_jobs=-1)]: Done 4187 tasks | elapsed: 6.1min\n",
"[Parallel(n_jobs=-1)]: Done 4272 tasks | elapsed: 6.1min\n",
"[Parallel(n_jobs=-1)]: Done 4357 tasks | elapsed: 6.3min\n",
"[Parallel(n_jobs=-1)]: Done 4444 tasks | elapsed: 6.4min\n",
"[Parallel(n_jobs=-1)]: Done 4531 tasks | elapsed: 6.5min\n",
"[Parallel(n_jobs=-1)]: Done 4620 tasks | elapsed: 6.6min\n",
"[Parallel(n_jobs=-1)]: Done 4709 tasks | elapsed: 6.6min\n",
"[Parallel(n_jobs=-1)]: Done 4800 tasks | elapsed: 6.7min\n",
"[Parallel(n_jobs=-1)]: Done 4891 tasks | elapsed: 6.9min\n",
"[Parallel(n_jobs=-1)]: Done 4984 tasks | elapsed: 7.1min\n",
"[Parallel(n_jobs=-1)]: Done 5077 tasks | elapsed: 7.3min\n",
"[Parallel(n_jobs=-1)]: Done 5172 tasks | elapsed: 7.5min\n",
"[Parallel(n_jobs=-1)]: Done 5267 tasks | elapsed: 7.6min\n",
"[Parallel(n_jobs=-1)]: Done 5364 tasks | elapsed: 7.7min\n",
"[Parallel(n_jobs=-1)]: Done 5461 tasks | elapsed: 7.8min\n",
"[Parallel(n_jobs=-1)]: Done 5560 tasks | elapsed: 8.1min\n",
"[Parallel(n_jobs=-1)]: Done 5659 tasks | elapsed: 8.4min\n",
"[Parallel(n_jobs=-1)]: Done 5760 tasks | elapsed: 8.7min\n",
"[Parallel(n_jobs=-1)]: Done 5861 tasks | elapsed: 8.9min\n",
"[Parallel(n_jobs=-1)]: Batch computation too fast (0.1834s.) Setting batch_size=2.\n",
"[Parallel(n_jobs=-1)]: Batch computation too slow (2.1279s.) Setting batch_size=1.\n",
"[Parallel(n_jobs=-1)]: Done 5986 tasks | elapsed: 9.0min\n",
"[Parallel(n_jobs=-1)]: Done 6089 tasks | elapsed: 9.2min\n",
"[Parallel(n_jobs=-1)]: Done 6194 tasks | elapsed: 9.4min\n",
"[Parallel(n_jobs=-1)]: Done 6299 tasks | elapsed: 9.5min\n",
"[Parallel(n_jobs=-1)]: Done 6406 tasks | elapsed: 9.7min\n",
"[Parallel(n_jobs=-1)]: Done 6513 tasks | elapsed: 9.9min\n",
"[Parallel(n_jobs=-1)]: Batch computation too fast (0.1914s.) Setting batch_size=2.\n",
"[Parallel(n_jobs=-1)]: Batch computation too slow (2.0710s.) Setting batch_size=1.\n",
"[Parallel(n_jobs=-1)]: Done 6645 tasks | elapsed: 10.0min\n",
"[Parallel(n_jobs=-1)]: Done 6755 tasks | elapsed: 10.1min\n",
"[Parallel(n_jobs=-1)]: Done 6866 tasks | elapsed: 10.2min\n",
"[Parallel(n_jobs=-1)]: Done 6977 tasks | elapsed: 10.2min\n",
"[Parallel(n_jobs=-1)]: Batch computation too fast (0.1766s.) Setting batch_size=2.\n",
"[Parallel(n_jobs=-1)]: Batch computation too slow (2.3176s.) Setting batch_size=1.\n",
"[Parallel(n_jobs=-1)]: Done 7121 tasks | elapsed: 10.6min\n",
"[Parallel(n_jobs=-1)]: Done 7190 out of 7190 | elapsed: 11.1min finished\n"
]
},
{
"data": {
"text/plain": [
"[None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" ...]"
]
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Keep the per-file results in a variable: as a bare last expression the\n",
"# cell would echo one entry per processed file into the notebook output.\n",
"conversion_results = Parallel(n_jobs=-1, verbose=10)(\n",
"    delayed(go)(filepath) for filepath in filepath_list\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {
"ExecuteTime": {
"end_time": "2017-06-20T06:47:50.951197Z",
"start_time": "2017-06-20T06:47:27.629215Z"
},
"collapsed": false
},
"outputs": [],
"source": [
"! 7z -y -bsp0 -bso0 a text-data-utf8.7z text-data-utf8"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 ML",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.1"
},
"latex_envs": {
"LaTeX_envs_menu_present": true,
"autocomplete": true,
"bibliofile": "biblio.bib",
"cite_by": "apalike",
"current_citInitial": 1,
"eqLabelWithNumbers": true,
"eqNumInitial": 1,
"hotkeys": {
"equation": "Ctrl-E",
"itemize": "Ctrl-I"
},
"labels_anchors": false,
"latex_user_defs": false,
"report_style_numbering": false,
"user_envs_cfg": false
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment