Created
June 20, 2017 07:14
-
-
Save alantian/77f150d756854b719031c31004e87b60 to your computer and use it in GitHub Desktop.
convert files to UTF-8, recursively.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"metadata": { | |
"ExecuteTime": { | |
"end_time": "2017-06-20T06:08:16.054908Z", | |
"start_time": "2017-06-20T06:08:15.995673Z" | |
}, | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"import chardet\n", | |
"import os\n", | |
"import re\n", | |
"\n", | |
"from joblib import Parallel, delayed" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 12, | |
"metadata": { | |
"ExecuteTime": { | |
"end_time": "2017-06-20T06:12:27.970634Z", | |
"start_time": "2017-06-20T06:12:27.962524Z" | |
}, | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"def read_with_encoding_guessing(filename):\n", | |
" rawdata = open(filename, 'rb').read()\n", | |
" charset_guessing = chardet.detect(rawdata)\n", | |
" encoding = charset_guessing['encoding']\n", | |
" try:\n", | |
" content = open(filename, encoding=encoding).read()\n", | |
" return {'content':content, 'success': True}\n", | |
" except UnicodeDecodeError:\n", | |
" return {'success': False}" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 15, | |
"metadata": { | |
"ExecuteTime": { | |
"end_time": "2017-06-20T06:12:46.062942Z", | |
"start_time": "2017-06-20T06:12:46.009853Z" | |
}, | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"input_dir = 'text-data/Text/Plain/'\n", | |
"output_dir = 'text-data-utf8/'\n", | |
"os.makedirs(output_dir, exist_ok=True)\n", | |
"\n", | |
"filepath_list = []\n", | |
"for dir_path, _, filename_list in os.walk(input_dir):\n", | |
" for filename in filename_list:\n", | |
" if not filename.startswith('.'):\n", | |
" filepath = os.path.join(dir_path, filename)\n", | |
" filepath_list.append(filepath)\n", | |
" \n", | |
"def go(filepath):\n", | |
" decoded_data = read_with_encoding_guessing(filepath)\n", | |
" if decoded_data['success']:\n", | |
" content = decoded_data['content']\n", | |
" output_filepath = os.path.join(output_dir, filepath.replace(input_dir, '').replace('/', '-'))\n", | |
" with open(output_filepath, 'w') as fout:\n", | |
" print(content, file=fout)\n", | |
" else:\n", | |
" pass" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 16, | |
"metadata": { | |
"ExecuteTime": { | |
"end_time": "2017-06-20T06:23:52.804702Z", | |
"start_time": "2017-06-20T06:12:46.650828Z" | |
}, | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"[Parallel(n_jobs=-1)]: Batch computation too fast (0.0280s.) Setting batch_size=14.\n", | |
"[Parallel(n_jobs=-1)]: Done 2 tasks | elapsed: 0.2s\n", | |
"[Parallel(n_jobs=-1)]: Done 9 tasks | elapsed: 2.8s\n", | |
"[Parallel(n_jobs=-1)]: Done 16 tasks | elapsed: 5.8s\n", | |
"[Parallel(n_jobs=-1)]: Batch computation too slow (6.9398s.) Setting batch_size=7.\n", | |
"[Parallel(n_jobs=-1)]: Done 142 tasks | elapsed: 23.8s\n", | |
"[Parallel(n_jobs=-1)]: Batch computation too slow (24.3416s.) Setting batch_size=3.\n", | |
"[Parallel(n_jobs=-1)]: Done 247 tasks | elapsed: 35.8s\n", | |
"[Parallel(n_jobs=-1)]: Batch computation too slow (10.6953s.) Setting batch_size=1.\n", | |
"[Parallel(n_jobs=-1)]: Done 323 tasks | elapsed: 44.7s\n", | |
"[Parallel(n_jobs=-1)]: Done 348 tasks | elapsed: 48.0s\n", | |
"[Parallel(n_jobs=-1)]: Done 367 tasks | elapsed: 52.9s\n", | |
"[Parallel(n_jobs=-1)]: Done 386 tasks | elapsed: 54.1s\n", | |
"[Parallel(n_jobs=-1)]: Done 401 tasks | elapsed: 57.0s\n", | |
"[Parallel(n_jobs=-1)]: Done 416 tasks | elapsed: 59.5s\n", | |
"[Parallel(n_jobs=-1)]: Done 433 tasks | elapsed: 1.0min\n", | |
"[Parallel(n_jobs=-1)]: Done 450 tasks | elapsed: 1.1min\n", | |
"[Parallel(n_jobs=-1)]: Done 469 tasks | elapsed: 1.1min\n", | |
"[Parallel(n_jobs=-1)]: Done 488 tasks | elapsed: 1.1min\n", | |
"[Parallel(n_jobs=-1)]: Done 509 tasks | elapsed: 1.2min\n", | |
"[Parallel(n_jobs=-1)]: Done 530 tasks | elapsed: 1.3min\n", | |
"[Parallel(n_jobs=-1)]: Done 553 tasks | elapsed: 1.4min\n", | |
"[Parallel(n_jobs=-1)]: Done 576 tasks | elapsed: 1.5min\n", | |
"[Parallel(n_jobs=-1)]: Done 601 tasks | elapsed: 1.6min\n", | |
"[Parallel(n_jobs=-1)]: Done 626 tasks | elapsed: 1.6min\n", | |
"[Parallel(n_jobs=-1)]: Done 653 tasks | elapsed: 1.7min\n", | |
"[Parallel(n_jobs=-1)]: Done 680 tasks | elapsed: 1.7min\n", | |
"[Parallel(n_jobs=-1)]: Done 709 tasks | elapsed: 1.8min\n", | |
"[Parallel(n_jobs=-1)]: Done 738 tasks | elapsed: 1.8min\n", | |
"[Parallel(n_jobs=-1)]: Done 769 tasks | elapsed: 1.9min\n", | |
"[Parallel(n_jobs=-1)]: Done 800 tasks | elapsed: 1.9min\n", | |
"[Parallel(n_jobs=-1)]: Batch computation too fast (0.1813s.) Setting batch_size=2.\n", | |
"[Parallel(n_jobs=-1)]: Done 842 tasks | elapsed: 2.0min\n", | |
"[Parallel(n_jobs=-1)]: Done 907 tasks | elapsed: 2.0min\n", | |
"[Parallel(n_jobs=-1)]: Batch computation too slow (2.0899s.) Setting batch_size=1.\n", | |
"[Parallel(n_jobs=-1)]: Done 963 tasks | elapsed: 2.1min\n", | |
"[Parallel(n_jobs=-1)]: Done 1000 tasks | elapsed: 2.1min\n", | |
"[Parallel(n_jobs=-1)]: Done 1037 tasks | elapsed: 2.1min\n", | |
"[Parallel(n_jobs=-1)]: Done 1074 tasks | elapsed: 2.2min\n", | |
"[Parallel(n_jobs=-1)]: Done 1113 tasks | elapsed: 2.2min\n", | |
"[Parallel(n_jobs=-1)]: Done 1152 tasks | elapsed: 2.3min\n", | |
"[Parallel(n_jobs=-1)]: Done 1193 tasks | elapsed: 2.3min\n", | |
"[Parallel(n_jobs=-1)]: Done 1234 tasks | elapsed: 2.4min\n", | |
"[Parallel(n_jobs=-1)]: Done 1277 tasks | elapsed: 2.5min\n", | |
"[Parallel(n_jobs=-1)]: Done 1320 tasks | elapsed: 2.6min\n", | |
"[Parallel(n_jobs=-1)]: Done 1365 tasks | elapsed: 2.7min\n", | |
"[Parallel(n_jobs=-1)]: Done 1410 tasks | elapsed: 2.8min\n", | |
"[Parallel(n_jobs=-1)]: Done 1457 tasks | elapsed: 2.9min\n", | |
"[Parallel(n_jobs=-1)]: Done 1504 tasks | elapsed: 2.9min\n", | |
"[Parallel(n_jobs=-1)]: Done 1553 tasks | elapsed: 3.1min\n", | |
"[Parallel(n_jobs=-1)]: Done 1602 tasks | elapsed: 3.4min\n", | |
"[Parallel(n_jobs=-1)]: Done 1653 tasks | elapsed: 3.6min\n", | |
"[Parallel(n_jobs=-1)]: Done 1704 tasks | elapsed: 3.6min\n", | |
"[Parallel(n_jobs=-1)]: Done 1757 tasks | elapsed: 3.6min\n", | |
"[Parallel(n_jobs=-1)]: Done 1810 tasks | elapsed: 3.6min\n", | |
"[Parallel(n_jobs=-1)]: Done 1865 tasks | elapsed: 3.7min\n", | |
"[Parallel(n_jobs=-1)]: Done 1920 tasks | elapsed: 3.7min\n", | |
"[Parallel(n_jobs=-1)]: Batch computation too fast (0.1885s.) Setting batch_size=2.\n", | |
"[Parallel(n_jobs=-1)]: Done 1977 tasks | elapsed: 3.7min\n", | |
"[Parallel(n_jobs=-1)]: Done 2078 tasks | elapsed: 3.8min\n", | |
"[Parallel(n_jobs=-1)]: Done 2196 tasks | elapsed: 3.9min\n", | |
"[Parallel(n_jobs=-1)]: Done 2314 tasks | elapsed: 4.0min\n", | |
"[Parallel(n_jobs=-1)]: Done 2436 tasks | elapsed: 4.1min\n", | |
"[Parallel(n_jobs=-1)]: Batch computation too slow (2.0347s.) Setting batch_size=1.\n", | |
"[Parallel(n_jobs=-1)]: Done 2518 tasks | elapsed: 4.1min\n", | |
"[Parallel(n_jobs=-1)]: Done 2581 tasks | elapsed: 4.2min\n", | |
"[Parallel(n_jobs=-1)]: Done 2644 tasks | elapsed: 4.3min\n", | |
"[Parallel(n_jobs=-1)]: Done 2709 tasks | elapsed: 4.4min\n", | |
"[Parallel(n_jobs=-1)]: Done 2774 tasks | elapsed: 4.5min\n", | |
"[Parallel(n_jobs=-1)]: Done 2841 tasks | elapsed: 4.5min\n", | |
"[Parallel(n_jobs=-1)]: Batch computation too fast (0.1907s.) Setting batch_size=2.\n", | |
"[Parallel(n_jobs=-1)]: Done 2910 tasks | elapsed: 4.5min\n", | |
"[Parallel(n_jobs=-1)]: Batch computation too slow (2.0549s.) Setting batch_size=1.\n", | |
"[Parallel(n_jobs=-1)]: Done 3039 tasks | elapsed: 4.7min\n", | |
"[Parallel(n_jobs=-1)]: Done 3109 tasks | elapsed: 4.8min\n", | |
"[Parallel(n_jobs=-1)]: Done 3180 tasks | elapsed: 4.9min\n", | |
"[Parallel(n_jobs=-1)]: Done 3251 tasks | elapsed: 5.0min\n", | |
"[Parallel(n_jobs=-1)]: Done 3324 tasks | elapsed: 5.2min\n", | |
"[Parallel(n_jobs=-1)]: Done 3397 tasks | elapsed: 5.2min\n", | |
"[Parallel(n_jobs=-1)]: Done 3472 tasks | elapsed: 5.3min\n", | |
"[Parallel(n_jobs=-1)]: Done 3547 tasks | elapsed: 5.3min\n", | |
"[Parallel(n_jobs=-1)]: Done 3624 tasks | elapsed: 5.4min\n", | |
"[Parallel(n_jobs=-1)]: Done 3701 tasks | elapsed: 5.4min\n", | |
"[Parallel(n_jobs=-1)]: Done 3780 tasks | elapsed: 5.5min\n", | |
"[Parallel(n_jobs=-1)]: Done 3859 tasks | elapsed: 5.7min\n", | |
"[Parallel(n_jobs=-1)]: Done 3940 tasks | elapsed: 5.8min\n", | |
"[Parallel(n_jobs=-1)]: Done 4021 tasks | elapsed: 5.8min\n", | |
"[Parallel(n_jobs=-1)]: Done 4104 tasks | elapsed: 5.9min\n", | |
"[Parallel(n_jobs=-1)]: Done 4187 tasks | elapsed: 6.1min\n", | |
"[Parallel(n_jobs=-1)]: Done 4272 tasks | elapsed: 6.1min\n", | |
"[Parallel(n_jobs=-1)]: Done 4357 tasks | elapsed: 6.3min\n", | |
"[Parallel(n_jobs=-1)]: Done 4444 tasks | elapsed: 6.4min\n", | |
"[Parallel(n_jobs=-1)]: Done 4531 tasks | elapsed: 6.5min\n", | |
"[Parallel(n_jobs=-1)]: Done 4620 tasks | elapsed: 6.6min\n", | |
"[Parallel(n_jobs=-1)]: Done 4709 tasks | elapsed: 6.6min\n", | |
"[Parallel(n_jobs=-1)]: Done 4800 tasks | elapsed: 6.7min\n", | |
"[Parallel(n_jobs=-1)]: Done 4891 tasks | elapsed: 6.9min\n", | |
"[Parallel(n_jobs=-1)]: Done 4984 tasks | elapsed: 7.1min\n", | |
"[Parallel(n_jobs=-1)]: Done 5077 tasks | elapsed: 7.3min\n", | |
"[Parallel(n_jobs=-1)]: Done 5172 tasks | elapsed: 7.5min\n", | |
"[Parallel(n_jobs=-1)]: Done 5267 tasks | elapsed: 7.6min\n", | |
"[Parallel(n_jobs=-1)]: Done 5364 tasks | elapsed: 7.7min\n", | |
"[Parallel(n_jobs=-1)]: Done 5461 tasks | elapsed: 7.8min\n", | |
"[Parallel(n_jobs=-1)]: Done 5560 tasks | elapsed: 8.1min\n", | |
"[Parallel(n_jobs=-1)]: Done 5659 tasks | elapsed: 8.4min\n", | |
"[Parallel(n_jobs=-1)]: Done 5760 tasks | elapsed: 8.7min\n", | |
"[Parallel(n_jobs=-1)]: Done 5861 tasks | elapsed: 8.9min\n", | |
"[Parallel(n_jobs=-1)]: Batch computation too fast (0.1834s.) Setting batch_size=2.\n", | |
"[Parallel(n_jobs=-1)]: Batch computation too slow (2.1279s.) Setting batch_size=1.\n", | |
"[Parallel(n_jobs=-1)]: Done 5986 tasks | elapsed: 9.0min\n", | |
"[Parallel(n_jobs=-1)]: Done 6089 tasks | elapsed: 9.2min\n", | |
"[Parallel(n_jobs=-1)]: Done 6194 tasks | elapsed: 9.4min\n", | |
"[Parallel(n_jobs=-1)]: Done 6299 tasks | elapsed: 9.5min\n", | |
"[Parallel(n_jobs=-1)]: Done 6406 tasks | elapsed: 9.7min\n", | |
"[Parallel(n_jobs=-1)]: Done 6513 tasks | elapsed: 9.9min\n", | |
"[Parallel(n_jobs=-1)]: Batch computation too fast (0.1914s.) Setting batch_size=2.\n", | |
"[Parallel(n_jobs=-1)]: Batch computation too slow (2.0710s.) Setting batch_size=1.\n", | |
"[Parallel(n_jobs=-1)]: Done 6645 tasks | elapsed: 10.0min\n", | |
"[Parallel(n_jobs=-1)]: Done 6755 tasks | elapsed: 10.1min\n", | |
"[Parallel(n_jobs=-1)]: Done 6866 tasks | elapsed: 10.2min\n", | |
"[Parallel(n_jobs=-1)]: Done 6977 tasks | elapsed: 10.2min\n", | |
"[Parallel(n_jobs=-1)]: Batch computation too fast (0.1766s.) Setting batch_size=2.\n", | |
"[Parallel(n_jobs=-1)]: Batch computation too slow (2.3176s.) Setting batch_size=1.\n", | |
"[Parallel(n_jobs=-1)]: Done 7121 tasks | elapsed: 10.6min\n", | |
"[Parallel(n_jobs=-1)]: Done 7190 out of 7190 | elapsed: 11.1min finished\n" | |
] | |
}, | |
{ | |
"data": { | |
"text/plain": [ | |
"[None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" None,\n", | |
" ...]" | |
] | |
}, | |
"execution_count": 16, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"Parallel(n_jobs=-1, verbose=10)(delayed(go)(filepath) for filepath in filepath_list)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 20, | |
"metadata": { | |
"ExecuteTime": { | |
"end_time": "2017-06-20T06:47:50.951197Z", | |
"start_time": "2017-06-20T06:47:27.629215Z" | |
}, | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"! 7z -y -bsp0 -bso0 a text-data-utf8.7z text-data-utf8" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3 ML", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.6.1" | |
}, | |
"latex_envs": { | |
"LaTeX_envs_menu_present": true, | |
"autocomplete": true, | |
"bibliofile": "biblio.bib", | |
"cite_by": "apalike", | |
"current_citInitial": 1, | |
"eqLabelWithNumbers": true, | |
"eqNumInitial": 1, | |
"hotkeys": { | |
"equation": "Ctrl-E", | |
"itemize": "Ctrl-I" | |
}, | |
"labels_anchors": false, | |
"latex_user_defs": false, | |
"report_style_numbering": false, | |
"user_envs_cfg": false | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 2 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment