Created
May 30, 2017 09:41
-
-
Save Poorvak/58801734364035716a39c80906f68615 to your computer and use it in GitHub Desktop.
Jupyter notebook for dataset cleaning task
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"'''\n", | |
"Imports for the cleaner code.\n", | |
"'''\n", | |
"import re\n", | |
"import itertools\n", | |
"import numpy as np\n", | |
"\n", | |
"from collections import Counter" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"def clean_str(string, *args, **kwargs):\n", | |
"    \"\"\"\n", | |
"    Clean String method: runs a fixed sequence of regex rewrites over the string.\n", | |
"\n", | |
"    Original taken from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py\n", | |
"\n", | |
"    Every character other than ASCII letters, digits, parentheses, comma,\n", | |
"    exclamation mark, question mark, apostrophe and backtick becomes a space.\n", | |
"    The sequences 's, 've, n't, 're, 'd and 'll get a space inserted in front\n", | |
"    of them, the punctuation , ! ( ) ? is padded with a space on both sides,\n", | |
"    runs of whitespace collapse to one space, and the result is stripped and\n", | |
"    lower-cased.\n", | |
"\n", | |
"    :param string:\n", | |
"        Non-empty str (or unicode on Python 2) that has to be cleaned.\n", | |
"    :param args:\n", | |
"        Accepted but not used.\n", | |
"    :param kwargs:\n", | |
"        Accepted but not used.\n", | |
"\n", | |
"    :return string:\n", | |
"        The cleaned string, stripped and lower-cased.\n", | |
"\n", | |
"    :raises AssertionError:\n", | |
"        If string is empty or otherwise falsy.\n", | |
"    :raises TypeError:\n", | |
"        If string is not a str/unicode instance.\n", | |
"    \"\"\"\n", | |
"    # Explicit raise rather than an assert statement, so the check is not\n", | |
"    # stripped when Python runs with -O.\n", | |
"    if not string:\n", | |
"        raise AssertionError(\"Empty string variable not allowed\")\n", | |
"\n", | |
"    # basestring exists only on Python 2; fall back to str on Python 3.\n", | |
"    try:\n", | |
"        string_types = basestring\n", | |
"    except NameError:\n", | |
"        string_types = str\n", | |
"    if not isinstance(string, string_types):\n", | |
"        # TypeError subclasses Exception, so callers catching Exception still work.\n", | |
"        raise TypeError(\"string argument must be of type string|unicode\")\n", | |
"\n", | |
"    # (pattern, replacement) pairs, applied in order. Replacements are written\n", | |
"    # without backslashes: re.sub copies an unknown escape such as a\n", | |
"    # backslash followed by a parenthesis into the output verbatim.\n", | |
"    rewrites = (\n", | |
"        (r\"[^A-Za-z0-9(),!?\\'\\`]\", \" \"),\n", | |
"        (r\"\\'s\", \" 's\"),\n", | |
"        (r\"\\'ve\", \" 've\"),\n", | |
"        (r\"n\\'t\", \" n't\"),\n", | |
"        (r\"\\'re\", \" 're\"),\n", | |
"        (r\"\\'d\", \" 'd\"),\n", | |
"        (r\"\\'ll\", \" 'll\"),\n", | |
"        (r\",\", \" , \"),\n", | |
"        (r\"!\", \" ! \"),\n", | |
"        (r\"\\(\", \" ( \"),\n", | |
"        (r\"\\)\", \" ) \"),\n", | |
"        (r\"\\?\", \" ? \"),\n", | |
"        (r\"\\s{2,}\", \" \"),\n", | |
"    )\n", | |
"    for pattern, replacement in rewrites:\n", | |
"        string = re.sub(pattern, replacement, string)\n", | |
"    return string.strip().lower()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 5, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"'this is sample text 1 b hi b need to remove brackets'" | |
] | |
}, | |
"execution_count": 5, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 2", | |
"language": "python", | |
"name": "python2" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 2 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython2", | |
"version": "2.7.13" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 2 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment