#!/public/spark-0.9.1/bin/pyspark
import os
import sys

# Set the path for the Spark installation
# (this is the path where you have built Spark using sbt/sbt assembly)
os.environ['SPARK_HOME'] = "/public/spark-0.9.1"
# os.environ['SPARK_HOME'] = "/home/jie/d2/spark-0.9.1"

# Append to PYTHONPATH so that pyspark can be found
sys.path.append("/public/spark-0.9.1/python")
# sys.path.append("/home/jie/d2/spark-0.9.1/python")

# Now we are ready to import the Spark modules
try:
    from pyspark import SparkContext
    from pyspark import SparkConf
except ImportError as e:
    print("Error importing Spark Modules", e)
    sys.exit(1)

import numpy as np
from sklearn.cross_validation import train_test_split, Bootstrap
from sklearn.datasets import make_classification
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn import datasets, svm, pipeline
from sklearn.kernel_approximation import RBFSampler
from sklearn.linear_model import SGDClassifier

if __name__ == '__main__':
    conf = SparkConf()
    conf.setMaster("spark://172.18.109.87:7077")
    # conf.setMaster("local")
    conf.setAppName("spark_svm")
    conf.set("spark.executor.memory", "12g")
    sc = SparkContext(conf=conf)

    X, y = make_classification(n_samples=10000, n_features=30, n_classes=2)
    X_train, X_test, y_train, y_test = train_test_split(X, y)
    samples = sc.parallelize(Bootstrap(y.size))

    feature_map_fourier = RBFSampler(gamma=.2, random_state=1)
    fourier_approx_svm = pipeline.Pipeline([("feature_map", feature_map_fourier),
                                            ("svm", SGDClassifier())])
    fourier_approx_svm.set_params(feature_map__n_components=700)

    results = samples.map(lambda (index, _):
                          fourier_approx_svm.fit(X[index], y[index]).score(X_test, y_test)) \
                     .reduce(lambda x, y: x + y)
    final_results = results / len(Bootstrap(y.size))
    print(final_results)
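For anyone who wants to sanity-check what the map/reduce step above computes without a cluster, a minimal local sketch of the same bootstrap-averaged accuracy could look like the following. It reuses the X, y, X_test, y_test and fourier_approx_svm objects defined in the script (so it would sit inside the __main__ block); this is only an illustration of the averaging, not part of the original gist:

# Local, non-Spark equivalent of the samples.map(...).reduce(...) step above
scores = [fourier_approx_svm.fit(X[train_idx], y[train_idx]).score(X_test, y_test)
          for train_idx, _ in Bootstrap(y.size)]
print(sum(scores) / len(scores))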
@riturajtiwari, your solution is the best so far. Thanks for sharing.
By the way, I am not able to connect to my remote server.
The code is:
import os
import sys

try:
    from pyspark import SparkContext
    from pyspark import SparkConf
    print("Pyspark sucess")
except ImportError as e:
    print("Error importing Spark Modules", e)
    sys.exit(1)

try:
    # if __name__ == '__main__':
    conf = SparkConf()
    conf.setMaster("spark://10.228.200.251:7077")
    print("connection suceeded with Master")
except:
    print("Connection not established")
    sys.exit(1)
When I run this code, I get the following error:
Pyspark sucess
The system cannot find the path specified.
Connection not established
Process finished with exit code 1
Any pointers to resolve this would be appreciated.
Thank you
@matsya: The "system cannot find the path specified" error usually occurs when the 'SPARK_HOME' environment variable is not set properly before SparkConf() is instantiated. You should set it to the location of the Spark installation on your remote server:
os.environ['SPARK_HOME'] = '/usr/spark_installation'
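For completeness, a minimal sketch of that setup, mirroring what the gist itself does at the top (the install path is just a placeholder; point it at your own Spark directory):

import os
import sys

os.environ['SPARK_HOME'] = '/usr/spark_installation'              # placeholder path
sys.path.append(os.path.join(os.environ['SPARK_HOME'], 'python'))  # so pyspark can be imported

from pyspark import SparkConf, SparkContext

conf = SparkConf().setMaster("spark://10.228.200.251:7077").setAppName("spark_svm")
sc = SparkContext(conf=conf)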
Hi folks.
I am able to import SparkContext, but when I try to instantiate it as 'sc', I get the following error:
WindowsError: [Error 2] The system cannot find the file specified. It seems to have something to do with the subprocess.py file in the new environment's lib directory.
Sorry if what I am asking is too trivial; I am not a computer scientist.
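In case it helps: on Windows that error usually means something pyspark tries to launch through subprocess (the scripts under SPARK_HOME's bin directory, or java itself) cannot be found. A rough sanity check, assuming that is the cause:

import os
spark_home = os.environ.get('SPARK_HOME')
print(spark_home)                                              # should not be None
print(os.path.exists(os.path.join(spark_home or '', 'bin')))   # should be True
# java must also be on the PATH, since SparkContext starts a JVM under the hood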
I have a better way:
This should set you up to run and debug.
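(The commenter's code is not shown above; one common setup that matches this description is findspark, which locates the Spark installation and fixes up sys.path so the manual SPARK_HOME bookkeeping becomes unnecessary. A minimal sketch, assuming findspark is installed and not necessarily what was originally posted:)

import findspark
findspark.init()   # or findspark.init('/path/to/spark') if SPARK_HOME is not set

from pyspark import SparkConf, SparkContext

conf = SparkConf().setMaster("local[2]").setAppName("spark_svm_debug")
sc = SparkContext(conf=conf)
print(sc.parallelize(range(10)).sum())   # quick smoke test; should print 45
sc.stop()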