Created
April 23, 2014 08:31
-
-
Save ilovejs/11207109 to your computer and use it in GitHub Desktop.
Machine learning in r
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
> m <- matrix(c('a', 'b', 'c', 'd'), ncol = 2) | |
> save(x, y, z, file = "mydata.RData") | |
> load("mydata.RData") | |
# Read data in from CSV files | |
> pt_data <- read.csv("pt_data.csv", stringsAsFactors = FALSE) | |
> mydata <- read.csv("mydata.csv", stringsAsFactors = FALSE, | |
header = FALSE) | |
# Write a data frame out to a CSV file | |
> write.csv(pt_data, file = "pt_data.csv") | |
> install.packages("RODBC") | |
> library(RODBC) | |
> mydb <- odbcConnect("my_dsn") | |
> mydb <- odbcConnect("my_dsn", uid = "my_username", | |
pwd = "my_password") | |
> patient_query <- "select * from patient_data where alive = 1" | |
> patient_data <- sqlQuery(channel = mydb, query = patient_query, | |
stringsAsFactors = FALSE) | |
> odbcClose(mydb) | |
# Print out the structure of an object | |
> str(usedcars) | |
# Summarize numeric variables | |
> summary(usedcars$year) | |
> summary(usedcars[c("price", "mileage")]) | |
# In statistics, the average is also known as the mean | |
> (36000 + 44000 + 56000) / 3 | |
[1] 45333.33 | |
> mean(c(36000, 44000, 56000)) | |
[1] 45333.33 | |
> median(c(36000, 44000, 56000)) | |
[1] 44000 | |
# 1. Minimum (Min.) | |
# 2. First quartile, or Q1 (1st Qu.) | |
# 3. Median, or Q2 (Median) | |
# 4. Third quartile, or Q3 (3rd Qu.) | |
# 5. Maximum (Max.) | |
# The span between the minimum and maximum value is known as the range. | |
> range(usedcars$price) | |
[1] 3800 21992 | |
> diff(range(usedcars$price)) | |
[1] 18192 | |
# interquartile range (IQR) - The difference between Q1 and Q3 | |
# is of particular interest because it itself is a simple measure of spread. | |
> IQR(usedcars$price) | |
[1] 3909.5 | |
> quantile(usedcars$price) | |
0% 25% 50% 75% 100% | |
3800.0 10995.0 13591.5 14904.5 21992.0 | |
# If we specify an additional probs parameter using a vector denoting cut points, we | |
# can obtain arbitrary quantiles, such as the 1st and 99th percentiles: | |
> quantile(usedcars$price, probs = c(0.01, 0.99)) | |
1% 99% | |
5428.69 20505.00 | |
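# quantile() supports several algorithms for computing quantiles (the type argument); | |
# if a precisely defined quantile is needed, read the documentation using the ?quantile command. | |
# seq() generates evenly spaced cut points, here every 20 percent (quintiles): | |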
> quantile(usedcars$price, seq(from = 0, to = 1, by = 0.20)) | |
0% 20% 40% 60% 80% 100% | |
3800.0 10759.4 12993.8 13992.0 14999.0 21992.0 | |
#Plot | |
> boxplot(usedcars$price, main="Boxplot of Used Car Prices", | |
ylab="Price ($)") | |
> hist(usedcars$price, main = "Histogram of Used Car Prices", | |
xlab = "Price ($)") | |
# By convention, the whiskers of a boxplot are only allowed to extend | |
# to a minimum or maximum of 1.5 times the IQR below Q1 or above Q3. | |
# The IQR for the price variable was 3909.5, with Q1 of 10995.0 and Q3 of 14904.5. | |
# An outlier is therefore any value that is | |
# less than 10995.0 - 1.5 * 3909.5 = 5130.75 or greater than 14904.5 + 1.5 * 3909.5 = 20768.75. | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment