Created
April 21, 2015 12:44
-
-
Save xccds/6dfd67737f53aa40f50a to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### 用spark进行数据挖掘\n", | |
"\n", | |
"- 本例使用spark的python接口,对titanic数据做了一个完整的尝试\n", | |
"- 首先用计算质数的例子说明:即使在单机上,spark也能利用多核并行处理来提高计算效率\n", | |
"- 之后读入数据集,并对数据进行预处理\n", | |
" - 步骤1:对名字进行了处理,用正则取出四种常见title\n", | |
" - 步骤2:基于title,对年龄进行了缺失值处理\n", | |
" - 步骤3:将类别变量均转为0-1变量\n", | |
"- 数据合并整理成spark.mllib需要的格式\n", | |
"- 使用线性模型(逻辑回归)建模,并在测试集上计算错误率\n", | |
"- 本例代码参考了《machine learning with spark》一书" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"from pyspark import SparkContext\n", | |
"sc = SparkContext( 'local[4]')" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"- 算质数的例子" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"def isprime(n):\n", | |
"    \"\"\"Return True if abs(int(n)) is a prime number, otherwise False.\"\"\"\n", | |
"    # work on the non-negative integer magnitude of the argument\n", | |
"    value = abs(int(n))\n", | |
"    # 2 is the only even prime\n", | |
"    if value == 2:\n", | |
"        return True\n", | |
"    # 0, 1 and every other even number are not prime\n", | |
"    if value < 2 or not value & 1:\n", | |
"        return False\n", | |
"    # trial division by odd candidates up to the square root is sufficient\n", | |
"    limit = int(value**0.5) + 1\n", | |
"    return all(value % divisor != 0 for divisor in range(3, limit, 2))" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"78498\n", | |
"78498\n", | |
"78498\n", | |
"78498\n", | |
"1 loops, best of 3: 4.81 s per loop\n" | |
] | |
} | |
], | |
"source": [ | |
"%%timeit\n", | |
"import numpy as np\n", | |
"nums = xrange(1000000)\n", | |
"print np.sum([1 for x in nums if isprime(x)])" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"metadata": { | |
"collapsed": false, | |
"scrolled": true | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"78498\n", | |
"78498\n", | |
"78498\n", | |
"78498\n", | |
"1 loops, best of 3: 2.71 s per loop\n" | |
] | |
} | |
], | |
"source": [ | |
"%%timeit\n", | |
"nums = sc.parallelize(xrange(1000000))\n", | |
"print nums.filter(isprime).count()" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"- titanic例子,先读入变量名" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"vname = !head -1 titanic.csv\n", | |
"vname = vname[0].split(',')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"u'0,3,\"Braund, Mr. Owen Harris\",male,22,1,0,A/5 21171,7.25,,S'" | |
] | |
}, | |
"execution_count": 3, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"#!sed 1d titanic.csv > titanic_noheader.csv\n", | |
"raw = sc.textFile('titanic_noheader.csv')\n", | |
"raw.first() # 原始数据" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"- 数据预处理" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"# Handle the title: first pull the quoted name field out of a raw line\n", | |
"def extract_name(x):\n", | |
"    \"\"\"Return the text between the first and the last double quote in x.\"\"\"\n", | |
"    import re\n", | |
"    quoted = re.search(\"\\\"(.*)\\\"\", x)\n", | |
"    return quoted.group(1)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 5, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"[u'Braund, Mr. Owen Harris',\n", | |
" u'Cumings, Mrs. John Bradley (Florence Briggs Thayer)',\n", | |
" u'Heikkinen, Miss. Laina',\n", | |
" u'Futrelle, Mrs. Jacques Heath (Lily May Peel)']" | |
] | |
}, | |
"execution_count": 5, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"names = raw.map(extract_name)\n", | |
"names.take(4)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 6, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"import re\n", | |
"title = names.map(lambda x: re.search(r\", (.*?)\\. \", x).group(1))" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 7, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"[(u'Mr', 517),\n", | |
" (u'Miss', 182),\n", | |
" (u'Mrs', 125),\n", | |
" (u'Master', 40),\n", | |
" (u'Dr', 7),\n", | |
" (u'Rev', 6),\n", | |
" (u'Major', 2),\n", | |
" (u'Mlle', 2),\n", | |
" (u'Col', 2),\n", | |
" (u'Sir', 1),\n", | |
" (u'the Countess', 1),\n", | |
" (u'Don', 1),\n", | |
" (u'Capt', 1),\n", | |
" (u'Lady', 1),\n", | |
" (u'Jonkheer', 1),\n", | |
" (u'Ms', 1),\n", | |
" (u'Mme', 1)]" | |
] | |
}, | |
"execution_count": 7, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"sorted(title.countByValue().iteritems(),key=lambda (k,v): v,reverse=True)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 8, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"[u'Mr', u'Miss', u'Mrs', u'Master']" | |
] | |
}, | |
"execution_count": 8, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"top_title = [x[0] for x in sorted(title.countByValue().iteritems(),key=lambda (k,v): v,reverse=True)[:4]]\n", | |
"top_title" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 9, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"def assign_title(x):\n", | |
"    \"\"\"Keep x when it is listed in top_title; otherwise collapse it to u'other'.\"\"\"\n", | |
"    return x if x in top_title else u'other'" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 10, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"[u'Mr', u'Mrs', u'Miss', u'Mrs']" | |
] | |
}, | |
"execution_count": 10, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"title_less = title.map(assign_title)\n", | |
"title_less.take(4)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 11, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"# Handle the remaining fields\n", | |
"def split_rest(x):\n", | |
"    \"\"\"Remove the quoted name field (with its trailing comma) from x and split the rest on commas.\"\"\"\n", | |
"    import re\n", | |
"    without_name = re.sub(\"\\\"(.*)\\\",\", '', x)\n", | |
"    fields = without_name.split(',')\n", | |
"    return fields" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 12, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"[u'0', u'3', u'male', u'22', u'1', u'0', u'A/5 21171', u'7.25', u'', u'S']" | |
] | |
}, | |
"execution_count": 12, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"df = raw.map(split_rest)\n", | |
"df.first()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 13, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"# 观察数据\n", | |
"vname.remove('name')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 14, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"0th variable:survived distinct value: 2\n", | |
"1th variable:pclass distinct value: 3\n", | |
"2th variable:sex distinct value: 2\n", | |
"3th variable:age distinct value: 89\n", | |
"4th variable:sibsp distinct value: 7\n", | |
"5th variable:parch distinct value: 7\n", | |
"6th variable:ticket distinct value: 681\n", | |
"7th variable:fare distinct value: 248\n", | |
"8th variable:cabin distinct value: 148\n", | |
"9th variable:embarked distinct value: 4\n" | |
] | |
} | |
], | |
"source": [ | |
"# 取值个数\n", | |
"m = len(df.first())\n", | |
"for i in range(m):\n", | |
" print '%dth variable:%s distinct value: %s' %(i, vname[i],df.map(lambda row: row[i]).distinct().count())" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 15, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"0th variable:survived miss value: 0\n", | |
"1th variable:pclass miss value: 0\n", | |
"2th variable:sex miss value: 0\n", | |
"3th variable:age miss value: 177\n", | |
"4th variable:sibsp miss value: 0\n", | |
"5th variable:parch miss value: 0\n", | |
"6th variable:ticket miss value: 0\n", | |
"7th variable:fare miss value: 0\n", | |
"8th variable:cabin miss value: 687\n", | |
"9th variable:embarked miss value: 2\n" | |
] | |
} | |
], | |
"source": [ | |
"# 缺失个数\n", | |
"for i in range(m):\n", | |
" print '%dth variable:%s miss value: %s' %(i, vname[i],df.map(lambda row: row[i]=='').sum())" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 16, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"# 处理年龄缺失\n", | |
"age = df.map(lambda x: x[3])" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 17, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"title_age = title.zip(age)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 18, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"title_age = title_age.mapValues(lambda x: float(x) if x!='' else -1)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 19, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"import numpy as np" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 20, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"def miss_mean(data):\n", | |
"    \"\"\"Return the mean of data, skipping the -1 placeholder used for missing values.\"\"\"\n", | |
"    observed = []\n", | |
"    for value in data:\n", | |
"        if value != -1:\n", | |
"            observed.append(value)\n", | |
"    return np.mean(observed)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 21, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"# Mean observed age per title. The grouped values are iterated directly by miss_mean\n", | |
"# instead of reaching into the internal .data list of the grouped-values object.\n", | |
"age_dict = title_age.groupByKey().mapValues(miss_mean).collectAsMap()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 22, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"{u'Capt': 70.0,\n", | |
" u'Col': 58.0,\n", | |
" u'Don': 40.0,\n", | |
" u'Dr': 42.0,\n", | |
" u'Jonkheer': 38.0,\n", | |
" u'Lady': 48.0,\n", | |
" u'Major': 48.5,\n", | |
" u'Master': 4.5741666666666667,\n", | |
" u'Miss': 21.773972602739725,\n", | |
" u'Mlle': 24.0,\n", | |
" u'Mme': 24.0,\n", | |
" u'Mr': 32.368090452261306,\n", | |
" u'Mrs': 35.898148148148145,\n", | |
" u'Ms': 28.0,\n", | |
" u'Rev': 43.166666666666664,\n", | |
" u'Sir': 49.0,\n", | |
" u'the Countess': 33.0}" | |
] | |
}, | |
"execution_count": 22, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"age_dict" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 23, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"def age_func((title,age)):\n", | |
"    \"\"\"Return (title, age), replacing the -1 placeholder with age_dict[title].\"\"\"\n", | |
"    if age != -1:\n", | |
"        return (title, age)\n", | |
"    # missing age: fall back to the per-title value stored in age_dict\n", | |
"    return (title, age_dict[title])" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 24, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"[22.0, 38.0, 26.0, 35.0]" | |
] | |
}, | |
"execution_count": 24, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"title_age = title_age.map(age_func)\n", | |
"age_imputed = title_age.values()\n", | |
"age_imputed.take(4)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 25, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"defaultdict(<type 'int'>, {u'Q': 77, u'': 2, u'S': 644, u'C': 168})" | |
] | |
}, | |
"execution_count": 25, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# 处理 embarked缺失\n", | |
"df.map(lambda record: record[9]).countByValue()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 26, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"def embarked_func(record):\n", | |
"    \"\"\"Return field 9 of record, substituting u'S' when that field is empty.\"\"\"\n", | |
"    port = record[9]\n", | |
"    return port if port != '' else u'S'" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 27, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"embarked= df.map(embarked_func)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 28, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"# 将四个类别变量转为0-1二元变量" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 29, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"{u'Master': 1, u'Miss': 0, u'Mr': 3, u'Mrs': 4, u'other': 2}" | |
] | |
}, | |
"execution_count": 29, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"title_dict = title_less.distinct().zipWithIndex().collectAsMap()\n", | |
"title_dict" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 30, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"def create_vector(term, term_dict):\n", | |
"    \"\"\"One-hot encode term as a list of len(term_dict) zeros with a 1 at index term_dict[term].\"\"\"\n", | |
"    encoded = [0 for _ in range(len(term_dict))]\n", | |
"    position = term_dict[term]\n", | |
"    encoded[position] = 1\n", | |
"    return encoded" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 31, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"[0, 1, 0, 0, 0]" | |
] | |
}, | |
"execution_count": 31, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"create_vector(u'Master',title_dict)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 32, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"[[0, 0, 0, 1, 0], [0, 0, 0, 0, 1], [1, 0, 0, 0, 0], [0, 0, 0, 0, 1]]" | |
] | |
}, | |
"execution_count": 32, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"title_ind = title_less.map(lambda x: create_vector(x,title_dict))\n", | |
"title_ind.take(4)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 33, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"{u'1': 0, u'2': 2, u'3': 1}" | |
] | |
}, | |
"execution_count": 33, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"pclass_dict = df.map(lambda x: x[1]).distinct().zipWithIndex().collectAsMap()\n", | |
"pclass_dict" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 34, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"[[0, 1, 0], [1, 0, 0], [0, 1, 0], [1, 0, 0]]" | |
] | |
}, | |
"execution_count": 34, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"pclass_ind = df.map(lambda x: create_vector(x[1],pclass_dict))\n", | |
"pclass_ind.take(4)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 35, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"{u'C': 2, u'Q': 0, u'S': 1}" | |
] | |
}, | |
"execution_count": 35, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"embarked_dict = embarked.distinct().zipWithIndex().collectAsMap()\n", | |
"embarked_dict" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 36, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"[[0, 1, 0], [0, 0, 1], [0, 1, 0], [0, 1, 0]]" | |
] | |
}, | |
"execution_count": 36, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"embarked_ind = embarked.map(lambda x: create_vector(x,embarked_dict))\n", | |
"embarked_ind.take(4)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 37, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"gender_ind = df.map(lambda x: 1 if x[2]==u'male' else 0)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 38, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"[(0, [0, 1, 0, 7.25]),\n", | |
" (1, [1, 1, 0, 71.2833]),\n", | |
" (2, [1, 0, 0, 7.925]),\n", | |
" (3, [1, 1, 0, 53.1])]" | |
] | |
}, | |
"execution_count": 38, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# 合并数据\n", | |
"restdf = df.map(lambda x: [int(x[0]),int(x[4]), int(x[5]), float(x[7])]).zipWithIndex().map(lambda (v,k): (k,v))\n", | |
"restdf.take(4)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 39, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"[(0, [0, 0, 0, 1, 0]),\n", | |
" (1, [0, 0, 0, 0, 1]),\n", | |
" (2, [1, 0, 0, 0, 0]),\n", | |
" (3, [0, 0, 0, 0, 1])]" | |
] | |
}, | |
"execution_count": 39, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"title_ind = title_ind.zipWithIndex().map(lambda (v,k): (k,v))\n", | |
"title_ind.take(4)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 40, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"[(0, [0, 1, 0]), (1, [1, 0, 0]), (2, [0, 1, 0]), (3, [1, 0, 0])]" | |
] | |
}, | |
"execution_count": 40, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"pclass_ind = pclass_ind.zipWithIndex().map(lambda (v,k): (k,v))\n", | |
"pclass_ind.take(4)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 41, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"[(0, [0, 1, 0]), (1, [0, 0, 1]), (2, [0, 1, 0]), (3, [0, 1, 0])]" | |
] | |
}, | |
"execution_count": 41, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"embarked_ind = embarked_ind.zipWithIndex().map(lambda (v,k): (k,v))\n", | |
"embarked_ind.take(4)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 42, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"[(0, [1]), (1, [0]), (2, [0]), (3, [0])]" | |
] | |
}, | |
"execution_count": 42, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"gender_ind = gender_ind.zipWithIndex().map(lambda (v,k): (k,[v]))\n", | |
"gender_ind.take(4)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 43, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"[(0, [22.0]), (1, [38.0]), (2, [26.0]), (3, [35.0])]" | |
] | |
}, | |
"execution_count": 43, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"age_imputed = age_imputed.zipWithIndex().map(lambda (v,k): (k,[v]))\n", | |
"age_imputed.take(4)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 44, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"# Merge the feature blocks by row key.\n", | |
"# List concatenation is not commutative, so it must not be used as a reduceByKey function:\n", | |
"# the merge order is not guaranteed and the columns (including the label at position 0)\n", | |
"# could be permuted. join() yields (left, right) pairs, which fixes the column order.\n", | |
"feature_blocks = [embarked_ind, age_imputed, gender_ind, title_ind, pclass_ind]\n", | |
"finaldf = restdf\n", | |
"for block in feature_blocks:\n", | |
"    finaldf = finaldf.join(block).mapValues(lambda pair: pair[0] + pair[1])" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 45, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"[(0, [0, 1, 0, 7.25, 0, 1, 0, 22.0, 1, 0, 0, 0, 1, 0, 0, 1, 0]),\n", | |
" (384,\n", | |
" [0, 0, 0, 7.8958, 0, 1, 0, 32.368090452261306, 1, 0, 0, 0, 1, 0, 0, 1, 0]),\n", | |
" (132, [0, 1, 0, 14.5, 0, 1, 0, 47.0, 0, 0, 0, 0, 0, 1, 0, 1, 0]),\n", | |
" (588, [0, 0, 0, 8.05, 0, 1, 0, 22.0, 1, 0, 0, 0, 1, 0, 0, 1, 0])]" | |
] | |
}, | |
"execution_count": 45, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"finaldf.take(4)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 46, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"# Prepare the data in the format required for modelling\n", | |
"from pyspark.mllib.classification import LogisticRegressionWithSGD\n", | |
"from pyspark.mllib.regression import LabeledPoint\n", | |
"def parsePoint(line):\n", | |
"    \"\"\"Build a LabeledPoint from a (key, values) pair: values[0] is the label, values[1:] the features.\"\"\"\n", | |
"    values = line[1]\n", | |
"    return LabeledPoint(values[0], values[1:])" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 47, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"modeldata = finaldf.map(parsePoint)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 48, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"LabeledPoint(0.0, [1.0,0.0,7.25,0.0,1.0,0.0,22.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0])" | |
] | |
}, | |
"execution_count": 48, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"modeldata.first()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 49, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"# Split the data; a fixed seed keeps the train/test partition reproducible across runs\n", | |
"train, test = modeldata.randomSplit([0.75,0.25], seed=42)\n", | |
"# Fit the model\n", | |
"model = LogisticRegressionWithSGD.train(train,iterations =1000,regType='l2')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 50, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Test Error = 0.308056872038\n" | |
] | |
} | |
], | |
"source": [ | |
"# Evaluate on the held-out test split (the rate below is a test error, not a training error)\n", | |
"labelsAndPreds = test.map(lambda p: (p.label, model.predict(p.features)))\n", | |
"testErr = labelsAndPreds.filter(lambda (v, p): v != p).count() / float(test.count())\n", | |
"print(\"Test Error = \" + str(testErr))" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 2", | |
"language": "python", | |
"name": "python2" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 2 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython2", | |
"version": "2.7.9" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 0 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment