Created
July 1, 2016 07:08
-
-
Save iCHAIT/066e9295caf06d97afe4e4482e933858 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Install the required packages only when they are missing, so that
# re-running the script does not reinstall them every time.
if (!requireNamespace("data.table", quietly = TRUE)) {
  install.packages("data.table")
}
if (!requireNamespace("ggplot2", quietly = TRUE)) {
  install.packages("ggplot2")
}

# Load the libraries used throughout the script.
library(data.table)
library(ggplot2)
# Two example vectors: `x` is written out element by element (numeric),
# `y` is built from an integer range.
x <- c(0, 1, 2, 3, 4)
y <- 1:5
# Example of a user-defined function.
# Returns the square root of `x`; `x` defaults to 2 when omitted.
square_root <- function(x = 2) {
  sqrt(x)
}
# Build a small data.table with a character column (`name`) and a numeric
# column (`age`).
dt <- data.table(
  name = c("Himanshu", "Abhishek"),
  age = c(22, 23)
)
# Viewing a data table
# Reading and writing tables, removing row names
# Load the `diamonds` data set shipped with ggplot2 as a data.table.
# attach() is deliberately not used: it would put columns such as `cut` and
# `table` on the search path, and nothing below needs the attached names
# (columns are always accessed through `diamonds$...` or inside `diamonds[...]`).
diamonds <- data.table(ggplot2::diamonds)
# Inspect the table: column names, number of rows, number of columns, and
# both dimensions at once.
colnames(diamonds)
nrow(diamonds)
ncol(diamonds)
dim(diamonds)

# Rename two columns by reference, show the new names, then restore the
# original names so the rest of the script can keep using `carat` and `cut`.
setnames(diamonds, old = c("carat", "cut"), new = c("Carat", "Cut"))
colnames(diamonds)
setnames(diamonds, old = c("Carat", "Cut"), new = c("carat", "cut"))
# Distinct values of the `cut` column, followed by how many there are.
unique_cuts <- unique(diamonds[["cut"]])
print(unique_cuts)
len_unique_cuts <- length(unique_cuts)
print(len_unique_cuts)
# Find the unique combinations of cut and clarity.
# unique(..., by = ) keeps the first row of every (cut, clarity) pair.
# The variable name is kept for compatibility even though the key is
# cut/clarity rather than cut/color.
unique_cuts_and_colors <- unique(diamonds, by = c("cut", "clarity"))

# Show the de-duplicated key columns. `color` is not part of the key, so
# displaying it here would only show the color of whichever row came first.
unique_cuts_and_colors[, c("cut", "clarity"), with = FALSE]

# Clarity values seen for the Ideal cut; `color` is the value from the first
# row of each pair and is shown only as an example.
unique_cuts_and_colors[
  cut == "Ideal",
  c("cut", "color", "clarity"),
  with = FALSE
]
# Filtering: keep only the rows whose cut is "Ideal" and whose color is "E",
# then display them.
IdealCut_EColor_diamonds <- diamonds[cut == "Ideal" & color == "E"]
print(IdealCut_EColor_diamonds)
# Grouping: for every cut, count the rows (`Total_instance`) and compute the
# mean price (`mean_price`). The cut with the highest mean price is derived
# from this table further down.
grouping_on_cut <- diamonds[,
  .(Total_instance = .N, mean_price = mean(price)),
  by = "cut"
]
# Defining a new column.
# `:=` adds `price_category` to `diamonds` itself (by reference); rows that
# do not match the filter are left as NA.
# NOTE(review): the 3457.42 threshold is hard-coded; presumably it was taken
# from the mean price of the Ideal cut - confirm.
diamonds[cut == "Ideal" & price < 3457.42, price_category := "Cheap"]

# `price_cat` is the whole updated table, not just the matching rows.
price_cat <- diamonds
View(price_cat)
# One way to find the cut with the highest mean price: take the maximum of
# the per-cut means, then look up which cut(s) carry exactly that value.
max_mean_price <- grouping_on_cut[, max(mean_price)]
cut_highest_price <- grouping_on_cut[
  mean_price == max_mean_price,
  unique(cut)
]
# Exercise
# 1. Find the count of instances and the mean depth of every cut and color
#    combination.
# 2. Find and print the cut and color combination with the least mean depth,
#    together with its value.
# 3. Find the combination with the highest mean depth among the Premium cut.
grouping_on_cut_color <- diamonds[,
  list(Total_instance = .N, mean_depth = mean(depth)),
  by = c("cut", "color")
]

# Combination with the smallest mean depth; `mean_depth` is included in the
# output because the exercise asks for the value as well.
min_depth <- min(grouping_on_cut_color$mean_depth)
grouping_on_cut_color[
  mean_depth == min_depth,
  c("cut", "color", "mean_depth"),
  with = FALSE
]

# Highest mean depth within the Premium cut. The lookup is restricted to
# Premium rows too, so a different cut that happens to share the same mean
# depth cannot show up in the answer.
max_depth_Premium <- max(grouping_on_cut_color[cut == "Premium"]$mean_depth)
grouping_on_cut_color[
  cut == "Premium" & mean_depth == max_depth_Premium,
  c("cut", "color", "mean_depth"),
  with = FALSE
]

# Number of Ideal-cut diamonds priced below 350.
nrow(diamonds[cut == "Ideal" & price < 350])
# Merging in a data.table
# Two lookup tables keyed on `cut`: one labels every cut with a quality code,
# the other assigns a price type to the first four cuts only.
cuts <- unique(diamonds$cut)
cutQuality <- data.table(
  cut = cuts,
  quality = c("Q1", "Q2", "Q3", "Q4", "Q5")
)
cutPriceType <- data.table(
  cut = cuts[1:4],
  priceType = c("P1", "P2", "P3", "P4")
)

# Left join: every row of `diamonds` is kept.
diamondsWithQuality <- merge(diamonds, cutQuality, by = "cut", all.x = TRUE)
# Drop the `price_category` column that was added to `diamonds` earlier in
# the script.
diamondsWithQuality[, price_category := NULL]

# Left join against the partial lookup; rows whose cut has no price type get
# NA in `priceType` and are collected in `naPriceType`.
diamondsWithPrice <- merge(diamonds, cutPriceType, by = "cut", all.x = TRUE)
naPriceType <- diamondsWithPrice[is.na(priceType)]

# Inner join (merge default): only rows whose cut exists in `cutPriceType`.
diamondsWithPriceOnlyExisting <- merge(diamonds, cutPriceType, by = "cut")
# Cartesian join - create every combination of cut and color.
# Both tables get a constant helper column `key`, so joining on it pairs
# every cut with every color.
cutTable <- data.table(cut = unique(diamonds$cut))
cutTable[, key := 1]

colorTable <- data.table(color = unique(diamonds$color))
colorTable[, key := 1]

# allow.cartesian = TRUE permits the many-to-many result of this join.
cutColorCombo <- merge(
  cutTable,
  colorTable,
  by = "key",
  allow.cartesian = TRUE
)

# Deleting a column: the helper key is no longer needed.
cutColorCombo[, key := NULL]
# Exercise 3
# Does the data set contain all possible combinations of the cut, color and
# clarity types?
# If not, list the combinations that are missing from the diamonds set.
# Which combination(s) has/have the most records?
# Which one(s) has/have the least? Print them in your own way.

# One single-column table per attribute, each with a constant join key.
cutTable <- data.table(cut = unique(diamonds$cut))
cutTable[, key := 1]
colorTable <- data.table(color = unique(diamonds$color))
colorTable[, key := 1]
clarityTable <- data.table(clarity = unique(diamonds$clarity))
clarityTable[, key := 1]

# Chain two cartesian joins to enumerate every cut x color x clarity triple.
cutColorTable <- merge(
  cutTable,
  colorTable,
  by = "key",
  allow.cartesian = TRUE
)
cutColorClarityTable <- merge(
  cutColorTable,
  clarityTable,
  by = "key",
  allow.cartesian = TRUE
)

# One row per triple that actually occurs in the data, flagged with
# `present = 1`.
diamonds1 <- unique(diamonds, by = c("cut", "color", "clarity"))
diamonds1[, present := 1]

# Left join from all possible triples; triples absent from the data end up
# with `present` equal to NA.
final <- merge(
  cutColorClarityTable,
  diamonds1,
  by = c("cut", "color", "clarity"),
  all.x = TRUE
)
View(final)
View(final[is.na(present)])
# Convert the built-in `iris` data set to a data.table (this creates a
# variable named `iris` that shadows the built-in one), then inspect it:
# open it in the viewer, print per-column summaries and show its structure.
iris <- data.table(iris)
View(iris)
summary(iris)
str(iris)
# Plotting
# qplot() is deprecated in current ggplot2 releases, so the plots are built
# with ggplot() directly. The geoms are the ones qplot() picks by default:
# points when both x and y are given, a histogram when only x is given.

# Scatter plot of petal length against sepal length, coloured by species.
ggplot(iris, aes(x = Sepal.Length, y = Petal.Length, colour = Species)) +
  geom_point()

# Histogram of sepal length, outlined by species.
ggplot(iris, aes(x = Sepal.Length, colour = Species)) +
  geom_histogram()
# Modelling
# Work on a copy so that `iris` itself is not modified, and keep only the
# two columns used by the model.
FeatureSetTrain <- copy(iris)[
  ,
  c("Sepal.Length", "Petal.Length"),
  with = FALSE
]

# First 100 rows; kept from the original script, not used further below.
train <- FeatureSetTrain[seq_len(100)]

# Observed response values, saved before the table is converted.
Target <- FeatureSetTrain$Sepal.Length
FeatureSetTrain <- data.frame(FeatureSetTrain)

# Simple linear regression of sepal length on petal length.
fit1 <- lm(Sepal.Length ~ Petal.Length, data = FeatureSetTrain)
print(coef(fit1))
# First coefficient of the fit (the intercept) wrapped in a data.table.
coeff <- data.table(coef(fit1)[[1]])

# Predictions on the same rows the model was fitted on.
res <- predict(fit1, newdata = FeatureSetTrain)
inp <- data.table(Sepal.Length = Target)
res <- data.table(Sepal.Length.Pred = res)

# Observed minus predicted values.
View(inp - res)

# Observed and predicted values side by side, then appended to the full
# iris table.
fin_res <- cbind(inp, res)
View(fin_res)
FinalRes <- cbind(iris, fin_res)
View(FinalRes)
# MAE by Mean and ME by Mean analysis
# Exercise 4
# Calculate the Mean Error, Mean Absolute Error and Mean Sepal Length for each species type
# Then calculate Mean Error by Mean and Mean Absolute Error by Mean for each species type
# A 30 x 3 matrix whose columns are normal samples with means 0, 2 and 5.
m <- cbind(rnorm(30, 0), rnorm(30, 2), rnorm(30, 5))
View(m)

# See the help page of apply(): margin 1 means rows, margin 2 means columns.
# Here: count the negative entries in each column.
apply(m, 2, function(column) sum(column < 0))

# sapply and lapply
# sapply simplifies the result to a vector where possible.
sapply(1:3, function(value) value^2)
# lapply is very similar but always returns a list rather than a vector.
lapply(1:3, function(value) value^2)
# Cube of every sepal length. Arithmetic in R is vectorised, so no
# per-element lapply() is needed.
iris$Sepal.Length^3

# Add a column `xyz` holding the squared sepal length of each row.
# The previous per-element loop assigned a single scalar to the whole column
# on every iteration, which left the square of the last sepal length in every
# row; a single vectorised `:=` assigns each row its own value.
iris[, xyz := Sepal.Length^2]
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment