### Tested with:
### - Spark 2.0.0 pre-built for Hadoop 2.7
### - Mac OS X 10.11
### - Python 3.5.2
### Use S3 within PySpark with minimal hassle.
| import ipywidgets as widgets | |
| import os | |
| from pathlib import Path | |
# Working directory captured once at import time, as a Path for convenient joins.
cwd = Path.cwd()

# Sentinel section labels used to visually separate folders from files
# in the widget listing.
FOLDERLABEL = '-------FOLDERS-------'
FILESLABEL = '-------FILES-------'
| def get_folder_contents(folder): |
| #!/bin/sh | |
| # pull the listing | |
| wget http://mirror.centos.org/centos/6/sclo/x86_64/rh/devtoolset-3/ | |
| # yank the hrefs | |
| egrep -o 'href=\"(.*\.rpm)"' index.html | cut -d \" -f2 > rpm.list | |
| # download them | |
| for f in `cat rpm.list`; |
| from sklearn.datasets import load_boston | |
| import pandas as pd | |
| import xgboost as xgb | |
| from tensorboard_logger import configure, log_value | |
| from sklearn.cross_validation import train_test_split | |
def logspy(env):
    """XGBoost training callback: forward the first evaluation metric to TensorBoard.

    Parameters
    ----------
    env : xgboost callback environment
        Provides ``evaluation_result_list`` (list of ``(name, value)`` pairs)
        and the current boosting ``iteration``.
    """
    # evaluation_result_list[0] is (metric_name, metric_value) for the first
    # watched dataset; log its value under the tag "train" at this iteration.
    log_value("train", env.evaluation_result_list[0][1], step=env.iteration)
| ######################################## | |
| ## Title: Spark MLlib Classification Data Prep Script | |
| ## Language: PySpark | |
| ## Author: Colby T. Ford, Ph.D. | |
| ######################################## | |
| from pyspark.ml import Pipeline | |
| from pyspark.ml.feature import OneHotEncoder, OneHotEncoderEstimator, StringIndexer, VectorAssembler | |
| label = "dependentvar" |
| import numpy as np | |
| from sklearn.datasets import load_digits | |
| from sklearn.metrics import roc_auc_score | |
| import lightgbm as lgbm | |
| if __name__ == '__main__': | |
| np.random.seed(4242) | |
| d = load_digits() |
| """ | |
| 1. Store the image and create an instance of ImageFileProcessHistory with status NOT_STARTED | |
| ImageFileProcessHistory | |
| - local_path | |
| - file_name | |
| - status # enum: NOT_STARTED, STARTED, SUCCEEDED, FAILED | |
| - bytes_processed | |
| - bytes_total | |
| """ |
| # http://docs.python-requests.org/en/master/api/ | |
| import requests | |
| class RequestsApi: | |
| def __init__(self, base_url, **kwargs): | |
| self.base_url = base_url | |
| self.session = requests.Session() | |
| for arg in kwargs: | |
| if isinstance(kwargs[arg], dict): | |
| kwargs[arg] = self.__deep_merge(getattr(self.session, arg), kwargs[arg]) |
| """ | |
| Script for comparing Logistic Regression and associated evaluation metrics on the imbalanced Media 6 Degrees dataset from the Doing Data Science book. You'll need to download a copy of the dataset from the GitHub repo: https://github.com/oreillymedia/doing_data_science. | |
| Copyright 2016 Ronald J. Nowling | |
| Licensed under the Apache License, Version 2.0 (the "License"); | |
| you may not use this file except in compliance with the License. | |
| You may obtain a copy of the License at | |
| http://www.apache.org/licenses/LICENSE-2.0 |
| rdd = sc.parallelize( | |
| [ | |
| (0., 1.), | |
| (0., 0.), | |
| (0., 0.), | |
| (1., 1.), | |
| (1.,0.), | |
| (1.,0.), | |
| (1.,1.), | |
| (1.,1.) |