import numpy as np

# Shuffle the rows of df (assumes a default RangeIndex, so labels match positions)
np.random.seed(5)
l = list(df.index)
np.random.shuffle(l)
df = df.iloc[l]
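The same shuffle can also be done with pandas directly; the line below is an alternative sketch, not part of the original snippet:

# Shuffle all rows with sample() and rebuild a contiguous index
df = df.sample(frac=1, random_state=5).reset_index(drop=True)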
# Move the OUTPUT_LABEL column to the front of the DataFrame
cols = list(train_df)
cols.insert(0, cols.pop(cols.index('OUTPUT_LABEL')))
train_df = train_df.loc[:, cols]
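For context (an assumption based on the S3/SageMaker snippets that follow): SageMaker's built-in algorithms expect the target in the first column of the training data, which is why OUTPUT_LABEL is moved to the front. A minimal illustration with a made-up frame:

import pandas as pd

train_df = pd.DataFrame({'age': [40, 52], 'bmi': [27.1, 31.4], 'OUTPUT_LABEL': [0, 1]})
cols = list(train_df)
cols.insert(0, cols.pop(cols.index('OUTPUT_LABEL')))
train_df = train_df.loc[:, cols]
print(list(train_df))  # ['OUTPUT_LABEL', 'age', 'bmi']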
import boto3

# Writing to and reading from S3 is straightforward:
# - files are referred to as "objects" in S3
# - a file name is referred to as a "key" in S3
# Objects stored in S3 are automatically replicated across at least three
# Availability Zones in the region where the bucket was created.
# http://boto3.readthedocs.io/en/latest/guide/s3.html
def write_to_s3(filename, bucket, key):
    with open(filename, 'rb') as f:  # Read in binary mode
        return boto3.Session().resource('s3').Bucket(bucket).Object(key).upload_fileobj(f)
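A usage sketch for the helper above; the local file, bucket, and key names are placeholders, not from the original:

write_to_s3('train.recordio', 'my-sagemaker-bucket', 'linear-learner/train/train.recordio')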
import sagemaker.amazon.common as smac

def write_recordio_file(filename, x, y=None):
    # Serialize the numpy arrays to SageMaker's protobuf recordIO format
    with open(filename, 'wb') as f:
        smac.write_numpy_to_dense_tensor(f, x, y)

def read_recordio_file(filename, recordsToPrint=10):
    # Read the file back and print the first few records
    with open(filename, 'rb') as f:
        records = smac.read_records(f)
        for i, r in enumerate(records):
            if i >= recordsToPrint:
                break
            print(r)
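A hedged end-to-end sketch combining the two helpers with write_to_s3 from the previous snippet; the arrays, file name, bucket, and key are placeholders:

import numpy as np

X_train = np.random.rand(100, 5).astype('float32')        # placeholder features
y_train = np.random.randint(0, 2, 100).astype('float32')  # placeholder labels

write_recordio_file('train.recordio', X_train, y_train)
read_recordio_file('train.recordio', recordsToPrint=2)
write_to_s3('train.recordio', 'my-sagemaker-bucket', 'train/train.recordio')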
from sklearn.model_selection import cross_val_score

# Compute the array containing the 10-fold CV MSEs
MSE_CV_scores = -cross_val_score(dt, X_train, y_train, cv=10,
                                 scoring='neg_mean_squared_error',
                                 n_jobs=-1)
# Compute the 10-fold CV RMSE
RMSE_CV = (MSE_CV_scores.mean())**(1/2)
# Print RMSE_CV
print('CV RMSE: {:.2f}'.format(RMSE_CV))
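A common follow-up, sketched here under the assumption that dt is a scikit-learn regressor such as a DecisionTreeRegressor: compare the CV RMSE with the training-set RMSE to check for over- or under-fitting.

from sklearn.metrics import mean_squared_error as MSE

# Fit dt to the training set and compute the training-set RMSE
dt.fit(X_train, y_train)
y_pred_train = dt.predict(X_train)
RMSE_train = MSE(y_train, y_pred_train)**(1/2)
print('Train RMSE: {:.2f}'.format(RMSE_train))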
----
Starting
----
# Import xgboost
import xgboost as xgb
# Create arrays for the features and the target: X, y
X, y = churn_data.iloc[:, :-1], churn_data.iloc[:, -1]
# Create the training and test sets
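The snippet stops at the comment above; a plausible completion is sketched below (the split fraction and random seed are assumptions, not from the original):

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)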
DataLink: https://archive.ics.uci.edu/ml/datasets/chronic_kidney_disease
# Import necessary modules
from sklearn_pandas import DataFrameMapper
from sklearn_pandas import CategoricalImputer
# Check number of nulls in each feature column
nulls_per_column = X.isnull().sum()
print(nulls_per_column)
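DataFrameMapper and CategoricalImputer are imported but not used in this fragment; below is a sketch of how they are typically combined, with SimpleImputer standing in for the numeric side (the column selection is illustrative, not from the kidney-disease data):

from sklearn.impute import SimpleImputer

numeric_columns = X.select_dtypes(include=['int64', 'float64']).columns
categorical_columns = X.select_dtypes(exclude=['int64', 'float64']).columns

# Median-impute numeric columns, mode-impute categorical columns
numeric_imputation_mapper = DataFrameMapper(
    [([col], SimpleImputer(strategy='median')) for col in numeric_columns],
    input_df=True, df_out=True)
categorical_imputation_mapper = DataFrameMapper(
    [(col, CategoricalImputer()) for col in categorical_columns],
    input_df=True, df_out=True)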
---
Missing data - rows
---
# Check how many values are missing in the category_desc column
print(volunteer["category_desc"].isnull().sum())
# Subset the volunteer dataset to rows where category_desc is not null
volunteer_subset = volunteer[volunteer["category_desc"].notnull()]
# Print out the shape of the subset
print(volunteer_subset.shape)
----
Selecting specific data types
----
# Create a subset of only the numeric columns
so_numeric_df = so_survey_df.select_dtypes(include=['int', 'float'])
# Print the column names contained in so_numeric_df
print(so_numeric_df.columns)
---
One-hot encoding and dummy variables
---
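The code under this header is cut off; a sketch of the usual pandas approach follows (the Country column is illustrative, not confirmed by the original):

import pandas as pd

# One-hot encoding: one indicator column per category
one_hot_df = pd.get_dummies(so_survey_df, columns=['Country'], prefix='OH')
# Dummy variables: drop the first category to avoid a redundant column
dummy_df = pd.get_dummies(so_survey_df, columns=['Country'], drop_first=True, prefix='DM')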
----
Data - https://www.kaggle.com/c/demand-forecasting-kernels-only/data
DEMAND FORECASTING CHALLENGE
----
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
# Read the train data
train = pd.read_csv('train.csv')
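RandomForestRegressor is imported but not used in this fragment; a sketch of the usual next step follows, assuming the store/item/sales columns of the Kaggle competition's train.csv:

# Use the store and item ids as simple features and sales as the target
X = train[['store', 'item']]
y = train['sales']

rf = RandomForestRegressor(n_estimators=100, random_state=123, n_jobs=-1)
rf.fit(X, y)
print(rf.predict(X[:5]))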