david30907d · August 30, 2017 13:15
diff --git a/spark.ipynb b/spark.ipynb
diff --git a/以前寫過的spark.py b/以前寫過的spark.py
 import json
 # Load the ride records and split every line on commas (plain split, so
 # quoted fields that contain commas are not supported).
 t = sc.textFile('Rides_0310.csv')
 split_rows = t.map(lambda line: line.split(','))

 # The first split line supplies the column names.
 header = split_rows.first()

 # Drop every row that has a field equal to 'Id' -- presumably this is how
 # the header row is removed; confirm against the CSV's column names.
 data = split_rows.filter(lambda fields: 'Id' not in fields)

 # One dict per ride, keyed by column name.
 dataDict = data.map(lambda fields: dict(zip(header, fields)))

 # Each ride yields two (stop, time) events: one boarding, one alighting.
 stop_and_time = dataDict.flatMap(
     lambda ride: (
         (ride['BoardStop'], ride['BoardTime']),
         (ride['AlightStop'], ride['AlightTime']),
     )
 )

 def groupDate(x):
     """Count events per time bucket for every key in a partition.

     Args:
         x: iterable of ``(key, times)`` pairs, as passed by
             ``groupByKey().mapPartitions(...)``; ``times`` is an iterable
             of strings such as ``'2015-05-06 07:00:00.000'``.

     Returns:
         A list of ``(key, counts)`` tuples in input order, where ``counts``
         is a plain dict mapping each bucket string to the number of times
         that fell into it.
     """
     # Local import keeps the function self-contained.
     from collections import Counter

     def bucket(timestamp):
         # Cut at the last ':' and drop one more character, e.g.
         # '2015-05-06 07:00:00.000' -> '2015-05-06 07:0' (ten-minute bucket).
         cut = timestamp.rfind(':')
         if cut == -1:
             # No ':' at all: rfind's -1 would otherwise be used as a slice
             # bound and silently chop trailing characters off the value.
             return timestamp
         return timestamp[:cut][:-1]

     return [
         (key, dict(Counter(bucket(timestamp) for timestamp in times)))
         for key, times in x
     ]

 # Group the (stop, time) events by stop, count them per time bucket, and
 # bring the resulting (stop, counts) pairs back to the driver.
 stop = stop_and_time.groupByKey().mapPartitions(groupDate).collect()

 # Write one JSON object keyed by stop; the context manager guarantees the
 # file is flushed and closed even if serialisation fails.
 with open('zhou.json', 'w') as out_file:
     json.dump(dict(stop), out_file)
diff --git a/決策數.ipynb b/決策數.ipynb
	import json
	# Load the ride records and split every line on commas (plain split, so
	# quoted fields that contain commas are not supported).
	t = sc.textFile('Rides_0310.csv')
	split_rows = t.map(lambda line: line.split(','))

	# The first split line supplies the column names.
	header = split_rows.first()

	# Drop every row that has a field equal to 'Id' -- presumably this is how
	# the header row is removed; confirm against the CSV's column names.
	data = split_rows.filter(lambda fields: 'Id' not in fields)

	# One dict per ride, keyed by column name.
	dataDict = data.map(lambda fields: dict(zip(header, fields)))

	# Each ride yields two (stop, time) events: one boarding, one alighting.
	stop_and_time = dataDict.flatMap(
	    lambda ride: (
	        (ride['BoardStop'], ride['BoardTime']),
	        (ride['AlightStop'], ride['AlightTime']),
	    )
	)

	def groupDate(x):
	    """Count events per time bucket for every key in a partition.

	    Args:
	        x: iterable of ``(key, times)`` pairs, as passed by
	            ``groupByKey().mapPartitions(...)``; ``times`` is an iterable
	            of strings such as ``'2015-05-06 07:00:00.000'``.

	    Returns:
	        A list of ``(key, counts)`` tuples in input order, where ``counts``
	        is a plain dict mapping each bucket string to the number of times
	        that fell into it.
	    """
	    # Local import keeps the function self-contained.
	    from collections import Counter

	    def bucket(timestamp):
	        # Cut at the last ':' and drop one more character, e.g.
	        # '2015-05-06 07:00:00.000' -> '2015-05-06 07:0' (ten-minute bucket).
	        cut = timestamp.rfind(':')
	        if cut == -1:
	            # No ':' at all: rfind's -1 would otherwise be used as a slice
	            # bound and silently chop trailing characters off the value.
	            return timestamp
	        return timestamp[:cut][:-1]

	    return [
	        (key, dict(Counter(bucket(timestamp) for timestamp in times)))
	        for key, times in x
	    ]

	# Group the (stop, time) events by stop, count them per time bucket, and
	# bring the resulting (stop, counts) pairs back to the driver.
	stop = stop_and_time.groupByKey().mapPartitions(groupDate).collect()

	# Write one JSON object keyed by stop; the context manager guarantees the
	# file is flushed and closed even if serialisation fails.
	with open('zhou.json', 'w') as out_file:
	    json.dump(dict(stop), out_file)