Skip to content

Instantly share code, notes, and snippets.

@Poorvak
Created May 30, 2017 09:41
Show Gist options
  • Save Poorvak/58801734364035716a39c80906f68615 to your computer and use it in GitHub Desktop.
Save Poorvak/58801734364035716a39c80906f68615 to your computer and use it in GitHub Desktop.
Jupyter notebook for dataset cleaning task
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"'''\n",
"Imports for the cleaner code.\n",
"'''\n",
"import re\n",
"import itertools\n",
"import numpy as np\n",
"\n",
"from collections import Counter"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"def clean_str(string, *args, **kwargs):\n",
" \"\"\"\n",
" Clean String method runs all the regexes to the string.\n",
" \n",
" Original taken from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py\n",
" :param string:\n",
" Datatype basestring that has to be cleaned.\n",
" \n",
" :return string:\n",
" Datatype basestring that is cleaned\n",
" \"\"\"\n",
" '''\n",
" Check for empty string.\n",
" '''\n",
" assert string, \"Empty string variable not allowed\"\n",
" '''\n",
" Checking variable string for datatype.\n",
" '''\n",
" if not isinstance(string, basestring):\n",
" raise Exception(\"string argument must be of type string|unicode\")\n",
"\n",
" string = re.sub(r\"[^A-Za-z0-9(),!?\\'\\`]\", \" \", string)\n",
" string = re.sub(r\"\\'s\", \" \\'s\", string)\n",
" string = re.sub(r\"\\'ve\", \" \\'ve\", string)\n",
" string = re.sub(r\"n\\'t\", \" n\\'t\", string)\n",
" string = re.sub(r\"\\'re\", \" \\'re\", string)\n",
" string = re.sub(r\"\\'d\", \" \\'d\", string)\n",
" string = re.sub(r\"\\'ll\", \" \\'ll\", string)\n",
" string = re.sub(r\",\", \" , \", string)\n",
" string = re.sub(r\"!\", \" ! \", string)\n",
" string = re.sub(r\"\\(\", \" \\( \", string)\n",
" string = re.sub(r\"\\)\", \" \\) \", string)\n",
" string = re.sub(r\"\\?\", \" \\? \", string)\n",
" string = re.sub(r\"\\s{2,}\", \" \", string)\n",
" return string.strip().lower()"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'this is sample text 1 b hi b need to remove brackets'"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 2",
"language": "python",
"name": "python2"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.13"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment