### Tested with:
- Spark 2.0.0 pre-built for Hadoop 2.7
- Mac OS X 10.11
- Python 3.5.2

Use S3 within PySpark with minimal hassle.
# Install R and the headers needed to build common R packages from source.
sudo apt update
sudo apt install gdebi libxml2-dev libssl-dev libcurl4-openssl-dev libopenblas-dev r-base r-base-dev

# Install RStudio from the official .deb (gdebi resolves its dependencies).
cd ~/Downloads
wget https://download1.rstudio.org/desktop/bionic/amd64/rstudio-1.2.5001-amd64.deb
sudo gdebi rstudio-1.2.5001-amd64.deb

# Force the GTK Qt style so RStudio matches the desktop theme.
printf '\nexport QT_STYLE_OVERRIDE=gtk\n' | sudo tee -a ~/.profile
# Widgets and path helpers for a simple notebook file browser.
import ipywidgets as widgets
import os
from pathlib import Path

# Start browsing from the current working directory.
cwd = Path.cwd()

# Section headings rendered between the folder and file parts of a listing.
FOLDERLABEL = '-------FOLDERS-------'
FILESLABEL = '-------FILES-------'
def get_folder_contents(folder): |
#!/bin/sh
# Pull the directory listing of the devtoolset-3 RPM repository.
# NOTE(review): mirror.centos.org no longer serves CentOS 6 — may need vault.centos.org; confirm.
wget http://mirror.centos.org/centos/6/sclo/x86_64/rh/devtoolset-3/
# Yank the .rpm hrefs out of the saved index page into rpm.list.
egrep -o 'href=\"(.*\.rpm)"' index.html | cut -d \" -f2 > rpm.list
for f in `cat rpm.list`; |
# NOTE(review): load_boston was removed in scikit-learn 1.2 — confirm the pinned version.
from sklearn.datasets import load_boston
import pandas as pd
import xgboost as xgb
from tensorboard_logger import configure, log_value
# sklearn.cross_validation was removed in scikit-learn 0.20;
# train_test_split now lives in sklearn.model_selection.
from sklearn.model_selection import train_test_split
def logspy(env):
    """XGBoost callback: log the first evaluation metric to TensorBoard each round.

    env is the callback environment XGBoost passes to callbacks;
    evaluation_result_list holds (name, value) pairs and iteration is the
    current boosting round.
    """
    # [0] is the first watched dataset, [1] is its metric value.
    log_value("train", env.evaluation_result_list[0][1], step=env.iteration)
########################################
## Title: Spark MLlib Classification Data Prep Script
## Language: PySpark
## Author: Colby T. Ford, Ph.D.
########################################
from pyspark.ml import Pipeline | |
from pyspark.ml.feature import OneHotEncoder, OneHotEncoderEstimator, StringIndexer, VectorAssembler | |
# Name of the column holding the dependent (target) variable.
label = "dependentvar"
import numpy as np | |
from sklearn.datasets import load_digits | |
from sklearn.metrics import roc_auc_score | |
import lightgbm as lgbm | |
if __name__ == '__main__':
    # Fixed seed for reproducible runs.
    np.random.seed(4242)
    # 8x8 handwritten-digit dataset used for the demo below.
    d = load_digits()
"""
1. Store the image and create an instance of ImageFileProcessHistory with status NOT_STARTED

ImageFileProcessHistory
  - local_path
  - file_name
  - status  # enum: NOT_STARTED, STARTED, SUCCEEDED, FAILED
  - bytes_processed
  - bytes_total
"""
# http://docs.python-requests.org/en/master/api/ | |
import requests | |
class RequestsApi: | |
def __init__(self, base_url, **kwargs): | |
self.base_url = base_url | |
self.session = requests.Session() | |
for arg in kwargs: | |
if isinstance(kwargs[arg], dict): | |
kwargs[arg] = self.__deep_merge(getattr(self.session, arg), kwargs[arg]) |
""" | |
Script for comparing Logistic Regression and associated evaluation metrics on the imbalanced Media 6 Degrees dataset from the Doing Data Science book. You'll need to download a copy of the dataset from the GitHub repo: https://github.com/oreillymedia/doing_data_science .
Copyright 2016 Ronald J. Nowling | |
Licensed under the Apache License, Version 2.0 (the "License"); | |
you may not use this file except in compliance with the License. | |
You may obtain a copy of the License at | |
http://www.apache.org/licenses/LICENSE-2.0 |