Skip to content

Instantly share code, notes, and snippets.

View bryanyang0528's full-sized avatar
🎯
Focusing

Bryan Yang bryanyang0528

🎯
Focusing
View GitHub Profile
// Load README.md through the shell's SparkContext as an RDD of text lines.
val text = sc.textFile("README.md")
// Break every line on single spaces and pair each resulting token with a count of 1.
// Consecutive spaces yield empty tokens, which is why ("",1) shows up in the sample below.
val textPairsRDD = text.flatMap(line => line.split(" ")).map(word => (word, 1))
// Peek at the first ten (token, 1) pairs.
textPairsRDD.take(10)
//res2: Array[(String, Int)] = Array((#,1), (Apache,1), (Spark,1), ("",1),
//(Spark,1), (is,1), (a,1), (fast,1), (and,1), (general,1))
@bryanyang0528
bryanyang0528 / sum.py
Created November 17, 2014 16:50
sum.py
def sumx(x):
    """Return the sum of all elements of ``x``.

    Args:
        x: An iterable of numbers, e.g. a list of ints.

    Returns:
        The accumulated total of every element; 0 when ``x`` is empty.
    """
    total = 0
    for i in x:
        total = total + i
    return total


sumx([1,2,3,4,5])
#15
# Iteration count; it is defined here but not used within this excerpt.
num.iterations <- 1000
# Download South African heart disease data
# (comma-separated, with a header row; the first column supplies the row names).
# header=TRUE is spelled out in full: head=T relied on partial argument matching
# and on T, which is an ordinary variable that can be reassigned.
sa.heart <- read.table("http://www-stat.stanford.edu/~tibs/ElemStatLearn/datasets/SAheart.data", sep=",", header=TRUE, row.names=1)
# Predictors: the age and ldl columns.
x <- sa.heart[,c("age", "ldl")]
# Response: the chd column.
y <- sa.heart$chd
# Scatter plot of the two predictors; the fill colour of each point is chosen by
# the factor level of y (first level red, second level green).
plot(x, pch=21, bg=c("red","green")[factor(y)])
# Function to standardize input values
$python kmean.py [input_filename] [output_filename] [numbers_of_group]
#! /usr/bin/env python
# -*- coding: utf-8 -*-
# The two lines above are the shebang line and the file-encoding declaration.
#----below is Bryan's code----
# Calculates the minimal distance between every bill and the center of a group
#
import sys
import numpy as np
import scipy
# Read the random-submission CSV through the SparkContext `sc`; the second
# argument (1) is presumably the number of parts to split the file into.
raw_sub = sc.textFile("/Users/bryanyang/Documents/Data/Ads Prediction/random_submission.csv",1)
raw_sub.cache() ## indicates that the data should be loaded into memory
raw_sub.count() ## as before, the data is only loaded once a compute command is executed
# Read the same CSV again and count its records, this time without calling cache() first.
raw_sub = sc.textFile("/Users/bryanyang/Documents/Data/Ads Prediction/random_submission.csv",1)
raw_sub.count()
In [5]:
# Read the movie ratings file, give the RDD a display name, and mark it for caching.
raw_ratings = sc.textFile('/Users/bryanyang/Documents/Data/Movie Rating/ratings.dat',10) ## split into 10 parts
# The name set here is what appears at the start of the RDD's printed representation.
raw_ratings.setName("raw ratings 10")
raw_ratings.cache()
Out[5]:
raw ratings 10 MappedRDD[6] at textFile at NativeMethodAccessorImpl.java:-2
In [6]:
# Count the records in the ratings RDD and report the total.
entries = raw_ratings.count()
# Python 2 print statement with %-formatting.
print "%s entries in ratings" %entries
In [3]:
# Read the movie ratings file without a second argument to textFile,
# then name the RDD and mark it for caching.
raw_ratings = sc.textFile('/Users/bryanyang/Documents/Data/Movie Rating/ratings.dat')
raw_ratings.setName("raw ratings")
raw_ratings.cache()
Out[3]:
raw ratings MappedRDD[3] at textFile at NativeMethodAccessorImpl.java:-2
In [4]:
# Count the records in the ratings RDD and keep the total in `entries`.
entries = raw_ratings.count()
In [3]:
# Read the movie ratings file with 10 as the second argument and name the RDD.
raw_ratings = sc.textFile('/Users/bryanyang/Documents/Data/Movie Rating/ratings.dat',10)
raw_ratings.setName("raw ratings")
# Inspect the Python type of the object; the transcript reports pyspark.rdd.RDD.
type(raw_ratings)
Out[3]:
pyspark.rdd.RDD