Last active
June 12, 2017 08:11
-
-
Save yunho0130/e2e034e6cf0698576ebc to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# 2016-03-28 Yunho Maeng | |
# Assignment 1 : Wine Quality | |
# If these packages are not installed yet, uncomment and run the lines below.
# install.packages("ggplot2"); | |
# install.packages("dplyr"); | |
# install.packages("gridExtra") | |
# install.packages("GGally") | |
# install.packages("reshape2") | |
# install.packages("doBy") | |
# graph | |
library(ggplot2); | |
library(dplyr); | |
library (gridExtra); | |
library(reshape2); | |
library(doBy); | |
# 1. Load the red-wine and white-wine data into R.
# Both files are read as ';'-separated CSV files with a header row.
# NOTE(review): the absolute paths below are specific to the original
# author's machine; adjust them before running the script elsewhere.
redWineData <- read.csv(
  file = "/Users/Yunho/VM Ware/R/winequality/winequality-red.csv",
  header = TRUE,
  sep = ";"
)
whiteWineData <- read.csv(
  file = "/Users/Yunho/VM Ware/R/winequality/winequality-white.csv",
  header = TRUE,
  sep = ";"
)

# Add a column named "type" that marks each row as "Red" or "White", then
# stack both data sets into a single data frame called totalWineData.
# The column is named directly in mutate() instead of being renamed by its
# position afterwards, so the result no longer depends on the input files
# having exactly twelve columns.
redTemp2 <- mutate(redWineData, type = "Red")
whiteTemp2 <- mutate(whiteWineData, type = "White")
totalWineData <- rbind(redTemp2, whiteTemp2)
# 2. For every variable except quality, look for the variables that best
#    show how red wine and white wine differ.
summaryOfRedWine <- summary(redWineData)
summaryOfWhiteWine <- summary(whiteWineData)

# Components compared between the two wine types (every column but quality).
wineComponents <- c(
  "fixed.acidity", "volatile.acidity", "citric.acid", "residual.sugar",
  "chlorides", "free.sulfur.dioxide", "total.sulfur.dioxide", "density",
  "pH", "sulphates", "alcohol"
)

# Run one two-sample t-test (red vs. white) per component. Results are
# printed explicitly because values computed inside a loop are not
# auto-printed; this also makes them visible when the script is source()d.
for (component in wineComponents) {
  componentTest <- t.test(
    redWineData[[component]],
    whiteWineData[[component]]
  )
  # Restore the "data:" label that a direct `$` call would have produced,
  # so each printed result still names the component it belongs to.
  componentTest$data.name <- paste0(
    "redWineData$", component, " and whiteWineData$", component
  )
  print(componentTest)
}

# The t-tests report a difference for every variable, which makes it hard
# to pick the variables that show the difference "well", so the comparison
# is redone from the summary statistics below.
print(summaryOfRedWine)
print(summaryOfWhiteWine)

# The general statistics of each group were inspected and the "noticeable"
# differences were identified through the mean values.
# Answer: volatile.acidity, residual.sugar, free.sulfur.dioxide,
#         total.sulfur.dioxide, sulphates
# 3. The quality variable of red wine takes values from 3 to 8. Add a
#    qualityGroup variable with the categories Low (3, 4), Mid (5, 6) and
#    High (7, 8). Then look for the variables that best show how the wine
#    components differ between quality groups.

# Exact value -> label mapping for the red-wine quality grades.
# A lookup table is used instead of chained gsub() calls: gsub() replaces
# substrings (a grade such as 13 would turn into "1Low"), and each of the
# earlier gsub() assignments was overwritten by the next one anyway.
redQualityLabels <- c(
  "3" = "Low", "4" = "Low",
  "5" = "Mid", "6" = "Mid",
  "7" = "High", "8" = "High"
)

# Copy the data and append qualityGroup as a character column.
# Grades that are not in the table become NA instead of passing through.
redWineWithGrade <- redWineData
redWineWithGrade$qualityGroup <- unname(
  redQualityLabels[as.character(redWineWithGrade$quality)]
)

# Show how many wines fall into each group (replaces the former dump of
# the entire data frame to the console).
print(table(redWineWithGrade$qualityGroup, useNA = "ifany"))

# Component differences by group: summaryBy() is called with its default
# summary function for every component, grouped by qualityGroup.
groupByRedWineSummary <- summaryBy(
  fixed.acidity + volatile.acidity + citric.acid + residual.sugar +
    chlorides + free.sulfur.dioxide + total.sulfur.dioxide + density +
    pH + sulphates + alcohol ~ qualityGroup,
  redWineWithGrade
)

# The summarised data is small, so export it as CSV for use in Excel.
write.csv(
  groupByRedWineSummary,
  "/Users/Yunho/VM Ware/R/winequality/groupByQualitySummary-red.csv",
  row.names = TRUE
)

# Answer (as recorded by the author): the lower the total.sulfur.dioxide,
# alcohol and fixed.acidity values, the higher the wine quality.
# NOTE(review): confirm this conclusion against the exported summary.
# 4. The quality variable of white wine takes values from 3 to 9. Add a
#    qualityGroup variable with the categories Low (3, 4), Mid (5, 6) and
#    High (7, 8, 9). Then look for the variables that best show how the
#    components of High red wine and High white wine differ.

# Exact value -> label mapping for the white-wine quality grades.
# A lookup table is used instead of chained gsub() calls: gsub() replaces
# substrings (a grade such as 13 would turn into "1Low"), and each of the
# earlier gsub() assignments was overwritten by the next one anyway.
whiteQualityLabels <- c(
  "3" = "Low", "4" = "Low",
  "5" = "Mid", "6" = "Mid",
  "7" = "High", "8" = "High", "9" = "High"
)

# Copy the data and append qualityGroup as a character column.
# Grades that are not in the table become NA instead of passing through.
whiteWineWithGrade <- whiteWineData
whiteWineWithGrade$qualityGroup <- unname(
  whiteQualityLabels[as.character(whiteWineWithGrade$quality)]
)

# Show how many wines fall into each group (replaces the former dumps of
# the entire data frame to the console).
print(table(whiteWineWithGrade$qualityGroup, useNA = "ifany"))

# Component differences by group: summaryBy() is called with its default
# summary function for every component, grouped by qualityGroup.
groupByWhiteWineSummary <- summaryBy(
  fixed.acidity + volatile.acidity + citric.acid + residual.sugar +
    chlorides + free.sulfur.dioxide + total.sulfur.dioxide + density +
    pH + sulphates + alcohol ~ qualityGroup,
  whiteWineWithGrade
)
print(groupByWhiteWineSummary)

# The summarised data is small, so export it as CSV for use in Excel.
write.csv(
  groupByWhiteWineSummary,
  "/Users/Yunho/VM Ware/R/winequality/groupByQualitySummary-white.csv",
  row.names = TRUE
)

# Answer (as recorded by the author): the components that best show the
# difference between high-grade red wine and high-grade white wine are
# total.sulfur.dioxide.mean and free.sulfur.dioxide.mean.
## Failed reference code (kept for reference only; not executed)
# ,pH,density | |
# | |
# ggplot( | |
# aes(x=alcohol, y=mean(alcohol)), | |
# data = totalWineData)+ | |
# geom_bar(aes(color=type),stat='summary',fun.y=mean)+ | |
# ggtitle('Redwine & Whitewine comparison') | |
# ggplot( | |
# aes(x=density, sulphates), | |
# data = totalWineData) | |
# + | |
# geom_bar(aes(color=type),stat='summary',fun.y=mean)+ | |
# ggtitle('Redwine & Whitewine comparison2') | |
# | |
# q1<-ggplot(aes(x=pH), | |
# data = subset(totalWineData,type %in% c("White")))+ | |
# geom_histogram(color =I('black'),fill = I('#999999'))+ | |
# ggtitle('pH distribution for White wine')+ | |
# data = subset(totalWineData,type %in% c("Red"))+ | |
# geom_histogram(color =I('black'),fill = I('#999999'))+ | |
# ggtitle('pH distribution for White wine') | |
# | |
# Tried subtracting one data set from the other to see the differences,
# but it failed because the two data sets differ in size.
# betweenRedandWihteWine = redWineData - whiteWineData | |
# q2<-ggplot(aes(x=free.sulfur.dioxide), | |
# data = subset(totalWineData,type %in% c("White")))+ | |
# geom_histogram(color =I('black'),fill = I('#099009'))+ | |
# ggtitle('Free SO2 distribution for White wine') | |
# q3<-ggplot(aes(x=total.sulfur.dioxide), | |
# data = subset(totalWineData,type %in% c("White")))+ | |
# geom_histogram(color =I('black'),fill = I('#099009'))+ | |
# ggtitle('Total SO2 distribution for White wine') | |
# q4<-ggplot(aes(x=alcohol), | |
# data = subset(totalWineData,type %in% c("White")))+ | |
# geom_histogram(color =I('black'),fill = I('#099009'))+ | |
# ggtitle('Alcohol distribution for White wine') | |
# | |
# # # grid.arrange(q1,q2,q3,q4,ncol=2) | |
# export PDF | |
# install.packages("knitr"); | |
# install.packages("framed") | |
# library(framed) | |
# knitr::stitch('/Users/Yunho/VM Ware/R/160328_HW1_wineQuality_yunhomaeng.R') | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment