ARCHY 299/499 lithics lab: snippets of R code for basic exploration of our lithic data
# R script for plotting of basic lithic data from Gua Mo'o hono, Sulawesi | |
#### get data | |
# this block will connect to the internet to access our google sheet | |
# then download it directly into a dataframe ready to work on | |
# If this is the first time, a little preparation is necessary: | |
# In google sheet: File -> Publish to web -> publish all sheets; | |
# Get link to published data -> CSV, all sheets, copy-paste link here... | |
# make a list of the links to all our sheets... | |
gid <- 0:7 | |
goog_sheets <- lapply(gid, function(i) paste0("https://docs.google.com/spreadsheet/pub?key=0As7CmPqGXTzldHA0d2YtNHNQOWUydHF6cEt1eGVNZnc&single=true&gid=",i,"&output=csv")) | |
# access data from our links... | |
install.packages("RCurl") # only need to do this once per computer | |
require(RCurl) | |
options(RCurlOptions = list(capath = system.file("CurlSSL", "cacert.pem", package = "RCurl"), ssl.verifypeer = FALSE)) | |
myCSVs <- lapply(goog_sheets, function(i) getURL(i)) | |
data_sheets <- lapply(myCSVs, function(i) read.csv(textConnection(i), stringsAsFactors = FALSE)) | |
# We now have a list object containing our data sheets, let's access | |
# each item of the list so we can work with them | |
basic_data <- data_sheets[[1]] # first sheet is basic data of counts per spit | |
complete_flake_mass <- data_sheets[[2]] # and so on... | |
broken_flake_mass <- data_sheets[[3]] | |
core_mass <- data_sheets[[4]] | |
retouch_GIUR <- data_sheets[[5]] | |
retouch_dims <- data_sheets[[6]] | |
retouch_notches <- data_sheets[[7]] | |
##### Let's look at the basic data and make a stacked bar plot | |
# subset just the main counts | |
basic_data <- na.omit(basic_data[, 2:7]) | |
# change column names to be more general | |
names(basic_data) <- c("Spit", "non.artefacts", "cores", "complete.flakes", "broken.flakes", "debris") | |
# reshape the data from wide to long | |
library(reshape2) | |
basic_data_m <- melt(basic_data, id.var = "Spit") | |
# plot | |
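# (in the plot below, stat = "identity" tells geom_bar to use the value
# column as the bar heights rather than counting rows, and fill stacks
# the artefact classes within each spit)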
library(ggplot2) | |
ggplot(basic_data_m, aes(factor(Spit), value, fill = factor(variable))) + | |
geom_bar( stat="identity") + | |
xlab("Excavtion unit") + | |
ylab("count") + | |
labs(fill = "artefact class") | |
# let's remove non-artefact and debris | |
basic_data_m <- basic_data_m[ !(basic_data_m[["variable"]] %in% c("debris", "non.artefacts")), ] | |
# plot just the remaining types | |
ggplot(basic_data_m, aes(factor(Spit), value, fill = factor(variable))) + | |
geom_bar( stat="identity") + | |
xlab("Excavtion unit") + | |
ylab("count") + | |
labs(fill = "artefact class") | |
# Are there any major changes in, say, the discard rates | |
# of complete flakes? In addition to visual inspection
# of the plot, we can use a statistical method | |
# called change point analysis | |
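# (change point analysis estimates where in an ordered sequence of values,
# here counts by spit, the underlying distribution shifts, and tests whether
# each candidate shift is statistically significant)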
install.packages("ecp") | |
library(ecp) | |
x1 <- as.matrix(basic_data[["complete.flakes"]]) | |
y <- e.divisive(X=x1,sig.lvl=0.05,R=499,eps=1e-3,half=1000,k=NULL,min.size=3,alpha=1) | |
y$estimates; y$cluster.number | |
# suggests a change in discard rates at spit 14 | |
# a Bayesian approach to change point analysis
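# install.packages("bcp") # only need to do this once per computer
# (bcp returns, for each position in the sequence, the posterior probability
# that a change point occurs there; below we flag positions where that
# probability exceeds a chosen threshold)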
library(bcp) | |
bcp1 <- bcp(x1, w0 = 0.2, p0 = 0.2, burnin = 50, mcmc = 500, return.mcmc = FALSE) | |
plot(bcp1) | |
which(bcp1$posterior.prob > 0.6) | |
# indicates a few more spits (16, 18, etc.) where a change in discard rates has occurred
# how about cores? | |
x1 <- as.matrix(basic_data[["cores"]]) | |
bcp1 <- bcp(x1, w0 = 0.2, p0 = 0.2, burnin = 50, mcmc = 500, return.mcmc = FALSE) | |
plot(bcp1) | |
which(bcp1$posterior.prob > 0.8) | |
# yes, at around spit 18
#### Now let's look at the mass data | |
# let's explore the complete flakes... | |
# make a table of Tukey's five number | |
# summary (minimum, lower-hinge, median, upper-hinge, maximum) | |
cf <- do.call(data.frame, aggregate(data = complete_flake_mass, CFLA..g. ~ factor(Spit), fivenum)) | |
names(cf) <- c("Spit", "minimum", "lower-hinge", "median", "upper-hinge", "maximum") | |
# view the table | |
cf | |
# the median for most spits is less than 1 g | |
# let's plot to visualise that table | |
# each dot is a flake | |
ggplot(complete_flake_mass, aes(factor(Spit), CFLA..g.)) + | |
geom_point() + | |
ylab("mass (g)") + | |
xlab("Spit") | |
# if we take the log values of the mass we can | |
# see the spread better and detect trends better | |
# adding a smoother line helps too | |
ggplot(complete_flake_mass, aes(factor(Spit), log(CFLA..g.))) + | |
geom_point() + | |
ylab("mass (g)") + | |
xlab("Spit") + | |
geom_smooth(aes(group = 1)) | |
# seems like flakes in spits 1-10 | |
# are heavier than the others | |
# plot summary stats of those data | |
# we'll trim the y-axis to minimise the effect | |
# of outliers | |
ggplot(complete_flake_mass, aes(factor(Spit), CFLA..g.)) + | |
geom_boxplot() + # geom_violin() is nice too | |
ylab("mass (g)") + | |
xlab("Spit") + | |
ylim(0,2) # this adjusts the y-axis limits | |
# looks like spits 6-9 are bigger than | |
# other spits | |
# So the exploratory data analysis has suggested
# that flakes in the upper spits might be bigger than those in the
# lower spits. We can test this with ANOVA if the
# variance of each group (Spit) is the same.
# Let's check that: | |
complete_flake_mass$Spit <- as.factor(complete_flake_mass$Spit) | |
bartlett.test(CFLA..g. ~ Spit, data = complete_flake_mass[-(1:4),]) | |
# p < 0.05, so the variances are not the same and we should not
# use ANOVA because one of its assumptions has been violated.
# Instead we use a K-sample permutation test, which copes well with unequal variances
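# (a permutation test builds its null distribution by repeatedly reshuffling
# the group labels, so it is much less dependent on the equal-variance and
# normality assumptions of ANOVA)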
library(coin) | |
oneway_test(as.numeric(CFLA..g.) ~ Spit, data = complete_flake_mass, distribution=approximate(B=10000)) | |
# with a p value greater than 0.05 we have an indication that | |
# there are probably no significant non-random differences in | |
# complete flake masses between spits | |
# consider spread of complete flake masses by spit | |
cf <- na.omit(aggregate(data = complete_flake_mass, CFLA..g. ~ factor(Spit), sd)) | |
names(cf) <- c("Spit", "sd") | |
# view the table | |
cf | |
# plot it | |
ggplot(cf, aes(Spit, sd)) + | |
geom_point() | |
# seems like spit 19 has a very high spread of masses relative to the others | |
##### repeat for core mass
# make a table of Tukey's five number | |
# summary (minimum, lower-hinge, median, upper-hinge, maximum) | |
co <- do.call(data.frame, aggregate(data = core_mass, Core..g. ~ factor(Spit), fivenum)) | |
names(co) <- c("Spit", "minimum", "lower-hinge", "median", "upper-hinge", "maximum") | |
# view the table | |
co | |
# the median for most spits is ~3 g | |
# let's plot to visualise that table | |
# each dot is a core
ggplot(core_mass, aes(factor(Spit), Core..g.)) + | |
geom_point() + | |
ylab("mass (g)") + | |
xlab("Spit") | |
# if we take the log values of the mass we can | |
# see the spread better and detect trends better | |
# adding a smoother line helps too | |
ggplot(core_mass, aes(factor(Spit), log(Core..g.))) + | |
geom_point() + | |
ylab("mass (g)") + | |
xlab("Spit") + | |
geom_smooth(aes(group = 1)) | |
# seems like cores in spits 27-28
# are heavier than the others | |
# plot summary stats of those data | |
# we'll trim the y-axis to minimise the effect | |
# of outliers | |
ggplot(core_mass, aes(factor(Spit), Core..g.)) + | |
geom_boxplot() + # geom_violin() is nice too | |
ylab("mass (g)") + | |
xlab("Spit") + | |
ylim(0,5) # this adjusts the y-axis limits | |
# So the exploratory data analysis has suggested
# that core mass might differ between the upper and
# lower spits. We can test this with ANOVA if the
# variance of each group (Spit) is the same.
# Let's check that: | |
core_mass$Spit <- as.factor(core_mass$Spit) | |
bartlett.test(Core..g. ~ Spit, data = core_mass[-c(1:8,25),]) | |
# p < 0.05, so the variances are not the same and we should not
# use ANOVA because one of its assumptions has been violated.
# Instead we use a K-sample permutation test, which copes well with unequal variances
library(coin) | |
oneway_test(as.numeric(Core..g.) ~ Spit, data = core_mass, distribution=approximate(B=10000)) | |
# with a p value greater than 0.05 we have an indication that | |
# there are probably no significant non-random differences in | |
# core masses between spits
# consider spread of core masses by spit
co <- na.omit(aggregate(data = core_mass, Core..g. ~ factor(Spit), sd)) | |
names(co) <- c("Spit", "sd") | |
# view the table | |
co | |
# plot it | |
ggplot(co, aes(Spit, sd)) + | |
geom_point() | |
# seems like spits 9 and 29 have a very high spread of masses relative to the others | |
# investigate changes in ratios of complete flakes and broken flakes | |
ratio <- na.omit(with(basic_data, Number.of.complete.flakes/Number.of.broken.flakes)) | |
ratio <- data.frame(Spit = basic_data$Spit[1:32], ratio = ratio) | |
library(ggplot2) | |
ggplot(ratio, aes(Spit, ratio)) + | |
geom_point() + | |
geom_smooth() + | |
ylab("ratio of complete flakes to broken flakes") | |
# correlation | |
with(ratio, cor.test(Spit, ratio)) | |
# investigate changes in ratios of complete flakes and cores | |
ratio <- na.omit(with(basic_data, Number.of.complete.flakes/Number.of.cores)) | |
ratio <- data.frame(Spit = basic_data$Spit[1:31], ratio = ratio) | |
library(ggplot2) | |
ggplot(ratio, aes(Spit, ratio)) + | |
geom_bar(stat="identity") + | |
geom_smooth() + | |
ylab("ratio of complete flakes to cores") | |
# correlation | |
with(ratio, cor.test(Spit, ratio)) | |
################################################################## | |
# | |
# GIUR data, reformatted... | |
require(RCurl) | |
options(RCurlOptions = list(capath = system.file("CurlSSL", "cacert.pem", package = "RCurl"), ssl.verifypeer = FALSE)) | |
GIUR <- "https://docs.google.com/spreadsheet/pub?key=0As7CmPqGXTzldHA0d2YtNHNQOWUydHF6cEt1eGVNZnc&single=true&gid=6&output=csv" | |
GIUR <- read.csv(textConnection(getURL(GIUR)), stringsAsFactors = FALSE) | |
# just get newly formatted GIUR data | |
GIURd <- GIUR[1:22,-1] | |
# extract T and t | |
Tval <- GIURd[seq(1, length(GIURd), by=2)] | |
tval <- GIURd[seq(2, length(GIURd), by=2)] | |
# give matching column names (not necessary)
# names(Tval) <- as.numeric(unlist(regmatches(names(Tval), gregexpr('\\(?[0-9,.]+', names(Tval) )))) | |
# names(tval) <- as.numeric(unlist(regmatches(names(tval), gregexpr('\\(?[0-9,.]+', names(tval) )))) | |
# calculate GIUR per zone, per artefact | |
# see here for details: http://www.sciencedirect.com.offcampus.lib.washington.edu/science/article/pii/S0305440309000284 | |
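# (roughly, the GIUR for a zone is t/T: the height of the retouch scar, t,
# divided by the thickness of the flake, T, at that zone, so each zone scores
# between 0, no retouch, and 1, retouch running the full thickness of the edge)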
GIURz <- sapply(1:10, function(i) as.numeric(tval[,i]) / as.numeric(Tval[,i]) ) | |
# calculate total GIUR for each artefact | |
GIURz[is.na(GIURz)] <- 0 # replace NAs with zeros | |
# look at distribution across zones | |
library(ggplot2); library(reshape2) | |
mlt <- melt(GIURz) | |
ggplot(mlt, aes(Var2, value))+ | |
geom_bar(stat="identity") | |
# zones 4 and 5 get the most retouching... | |
# identical to above, sanity check | |
mlt_agg <- aggregate(value ~ Var2, mlt[,2:3], sum) | |
ggplot(mlt_agg, aes(Var2, value))+ | |
geom_bar(stat="identity") | |
# summarise by spit | |
GIURt <- rowSums(GIURz)/10 | |
# get spit numbers back in there | |
spit <- as.numeric(unlist(regmatches(GIUR[1:22,1], gregexpr('\\(?[0-9,.]+', GIUR[1:22,1] )))) | |
GIURT <- data.frame(spit = spit, GIUR = GIURt) | |
# plot | |
library(ggplot2) | |
ggplot(GIURT, aes(spit, GIUR)) + | |
geom_point() + | |
geom_smooth() + | |
ylim(0,1) | |
# another kind of plot... | |
ggplot(GIURT, aes(as.factor(spit), GIUR)) + | |
geom_boxplot() + | |
ylim(0,1) | |
# now get average GIUR per spit | |
GIURav <- aggregate(GIUR ~ spit, GIURT, mean) | |
# plot | |
ggplot(GIURav, aes(spit, GIUR)) + | |
geom_point() + | |
geom_smooth() + | |
ylim(0,1) | |
############################################################# | |
# metrics of retouched flakes | |
require(RCurl) | |
options(RCurlOptions = list(capath = system.file("CurlSSL", "cacert.pem", package = "RCurl"), ssl.verifypeer = FALSE)) | |
met <- "https://docs.google.com/spreadsheet/pub?key=0As7CmPqGXTzldHA0d2YtNHNQOWUydHF6cEt1eGVNZnc&single=true&gid=5&output=csv" | |
met <- read.csv(textConnection(getURL(met)), stringsAsFactors = FALSE) | |
met[is.na(met)] <- 0 # replace NAs with zeros | |
met <- met[,1:5]; met[met=="n/a" | met==""] <- 0; met$Max..Dimension.mm. <- as.numeric(met$Max..Dimension.mm.) | |
library(data.table) | |
met <- data.table(met) | |
# EL = length + maximum width + maximum dimension | |
# EL/M, M = mass in g, we'll call it EL_M because slashes are special characters in R | |
# http://www.academia.edu/235080/A_method_for_estimating_edge_length_from_flake_dimensions_use_and_implications_for_technological_change_in_the_MSA | |
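# (EL is a rough proxy for the amount of usable edge on a flake; dividing by
# mass gives edge length per gram of stone, so a higher EL/M suggests more
# economical use of the raw material)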
met$EL <- with(met, Length..mm. + Max..Dimension.mm. + Max..Width..mm.) | |
met$EL_M <- with(met, EL / Mass..g.) | |
# average mass, length, etc per spit | |
met_av <- met[,lapply(.SD,mean),by="SPIT"] | |
# look for correlations between GIUR and size | |
merg <- merge( GIURav, met_av, by.x = "spit", by.y = "SPIT") | |
with(merg, cor.test( GIUR, Mass..g.)) | |
with(merg, cor.test( GIUR, EL_M)) | |
# look for correlation between GIUR and discard rates | |
cf <- na.omit(basic_data[,c("Spit", "Number.of.complete.flakes")]) | |
mergcf <- merge( merg, cf, by.x = "spit", by.y = "Spit") | |
with(mergcf, cor.test( GIUR, Number.of.complete.flakes)) | |
# Correlation is one of the most commonly
# used statistical techniques for investigating
# relationships in data. A correlation is a
# single number that describes the degree | |
# of relationship between two variables, | |
# which we can label X and Y, just for | |
# example. Correlations can suggest possible | |
# causal relationships but the statistical | |
# test alone is not sufficient to demonstrate | |
# the relationship; you also need to explain it
# with some logic and details of the likely | |
# causal connection. The answer to the | |
# question of how much are the two variables | |
# correlated comes from two numbers: | |
# | |
# r, or the coefficient of correlation. r | |
# takes on a value in the range from -1 to 1. | |
# This measure will show whether the correlation | |
# between variables is strong (close to 1 or -1), | |
# weak or non-existent (zero or nearly zero) | |
# and indicate the direction of this correlation. | |
# A positive value for r means that larger | |
# X values go along with larger Y values while | |
# a negative r value means when X values get | |
# larger, the Y values get smaller. | |
# p, or the probability value. This value
# tells you the probability of obtaining an r-value
# at least as extreme as yours in a collection of random
# data in which there is no real relationship. A
# p-value of 0.05 indicates a 5% chance that a
# correlation this strong would turn up in such a
# random dataset, so a small p-value gives us some
# confidence that we are seeing a real relationship
# in the data rather than noise.
# Scholarly convention tends to hold p values | |
# of less than 0.05 as indicators of a | |
# statistically significant relationship. Above
# 0.05 is not significant. | |
# | |
# Note that the size of the p-value in your | |
# correlation analysis says nothing about the | |
# strength of the relationship between your | |
# two variables - it is possible to have a | |
# highly significant result (very small p-value, | |
# e.g. p = 0.001) for a minuscule effect (i.e.
# an r value close to zero) that might not | |
# be of any real-world importance or archaeological | |
# meaning. | |
# | |
# You need to use your common sense | |
# when interpreting your correlation data; don't | |
# just mindlessly copy and paste them and get | |
# excited when r is big or p-values | |
# are small. Ensure that you're only | |
# getting excited about data that make | |
# sense in the real world. | |
# | |
# We will calculate the correlation coefficient
# to show on the plot and get the probability | |
# value of the null hypothesis (that the | |
# correlation is due to random | |
# processes that we don't care about). This value will | |
# help us test the hypothesis that | |
# the overall slope of the linear | |
# regression is 0 (i.e. there is no
# relationship between the two variables). | |
# A line with slope 0 is horizontal, | |
# which means that Y does not depend on X at all. | |
# | |
# Remember that a correlation is considered to be interesting | |
# only when the p value is less than 0.05. If it's greater | |
# than 0.05, then the correlation is most likely due to | |
# chance and not suggestive of anything interesting. | |
# | |
# see for something to cite in your written work: | |
# http://uwashington.worldcat.org.offcampus.lib.washington.edu/oclc/701053523 |
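# To see how r and p behave, here is a small illustrative sketch on simulated
# data (the variables x, y_related and y_noise are made up for this demo, not
# from our spreadsheets): a genuinely related pair gives a large |r| and a
# small p, while pure noise gives an r near zero and a large p.
set.seed(1)
x <- rnorm(30)
y_related <- 2 * x + rnorm(30)  # y depends on x
y_noise <- rnorm(30)            # y unrelated to x
cor.test(x, y_related)          # large positive r, very small p
cor.test(x, y_noise)            # r near zero, p usually well above 0.05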
# R script for plotting of basic lithic data from Talimbue, Sulawesi | |
#### get data | |
# this block will connect to the internet to access our google sheet | |
# then download it directly into a dataframe ready to work on | |
# If this is the first time, a little preparation is necessary: | |
# In google sheet: File -> Publish to web -> publish all sheets; | |
# Get link to published data -> CSV, all sheets, copy-paste link here... | |
# make a list of the links to all our sheets... | |
gid <- c(0,3,6,7) | |
goog_sheets <- lapply(gid, function(i) paste0("https://docs.google.com/spreadsheet/pub?key=0As7CmPqGXTzldFFfcXYtNURINWtlOGtFbFhwX3hXTGc&single=true&gid=",i,"&output=csv")) | |
# access data from our links... | |
# install.packages("RCurl") # only need to do this once per computer | |
require(RCurl) | |
options(RCurlOptions = list(capath = system.file("CurlSSL", "cacert.pem", package = "RCurl"), ssl.verifypeer = FALSE)) | |
myCSVs <- lapply(goog_sheets, function(i) getURL(i)) | |
data_sheets <- lapply(myCSVs, function(i) read.csv(textConnection(i), stringsAsFactors = FALSE)) | |
# We now have a list object containing our data sheets, let's access | |
# each item of the list so we can work with them | |
basic_data <- data_sheets[[1]] # first sheet is basic data of counts per spit | |
complete_flakes <- data_sheets[[2]] # and so on... | |
spit_depths <- data_sheets[[3]] | |
interpolated_ages <- data.frame(lapply(data_sheets[[4]], as.numeric)) | |
#### Before we get into the lithic data, let's look at the radiocarbon dates | |
# so we can determine the ages of interesting changes in the lithic sequence | |
# first run all the lines at the end of this file in the 'end matter' | |
# to load the depth-age data | |
# round depths to ensure a match with depths in the other table | |
spit_depths$depth_m <- as.numeric(round(spit_depths$depth_m, 2) - 1.61) | |
spit_depths$spit <- as.numeric(spit_depths$spit) | |
# merge the dates, spits and depths together | |
library(data.table) | |
dt1<-data.table(spit_depths, key="depth_m") | |
dt2<-data.table(interpolated_ages, key="depth_m") | |
# join | |
joined.dt1.dt.2 <- dt1[dt2] | |
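# (with both tables keyed on depth_m, dt1[dt2] performs a keyed join: for each
# row of dt2 it looks up the row of dt1 with the matching depth, attaching the
# spit numbers to the interpolated ages)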
# remove spits that are NA | |
joined.dt1.dt.2 <- joined.dt1.dt.2[!is.na(joined.dt1.dt.2$spit),] | |
# merge dates with artefact data | |
basic_data$spit <- as.numeric(basic_data$Spit) | |
basic_data <- data.table(basic_data, key = "spit") | |
setkeyv(joined.dt1.dt.2, "spit")
basic_data <- basic_data[joined.dt1.dt.2] | |
# keep only spits for which we have data | |
basic_data <- basic_data[!is.na(basic_data$Spit),] | |
# merge with complete flake data | |
complete_flakes$spit <- as.numeric(complete_flakes$Spit) | |
complete_flakes <- data.table(complete_flakes, key = "spit") | |
setkeyv(joined.dt1.dt.2, "spit")
complete_flakes <- complete_flakes[joined.dt1.dt.2, allow.cartesian=TRUE] | |
# keep only spits for which we have data | |
complete_flakes <- complete_flakes[!is.na(complete_flakes$Spit),] | |
#### Let's look at the basic data and make a stacked bar plot | |
# subset just the main counts | |
basic_data1 <- basic_data[, names(basic_data) %in% c('Spit', 'Broken.flake', 'complete.flake', 'Core', 'Retouched.piece'), with = FALSE]
# change column names to be more general | |
setnames(basic_data1, c("Spit", "broken.flakes", "complete.flakes", "cores", "retouched.pieces")) | |
# put it in order by spit | |
basic_data1$Spit <- factor(basic_data1$Spit, levels=basic_data1$Spit) | |
# reshape the data from wide to long | |
library(reshape2) | |
basic_data_m <- melt(basic_data1, id.var = "Spit") | |
# plot of overall discard rates per spit | |
library(ggplot2) | |
ggplot(basic_data_m, aes(factor(Spit), value, fill = factor(variable))) + | |
geom_bar( stat="identity") + | |
xlab("Excavtion unit") + | |
ylab("count") + | |
labs(fill = "artefact class") + | |
theme_minimal() | |
# plot again, this time by age instead of spit | |
# subset just the main counts | |
basic_data2 <- basic_data[, names(basic_data) %in% c('cal_age_k_BP', 'Broken.flake', 'complete.flake', 'Core', 'Retouched.piece'), with = FALSE]
# change column names to be more general | |
setnames(basic_data2, c("broken.flakes", "complete.flakes", "cores", | |
"retouched.pieces", "cal_age_k_BP")) | |
# put it in order by spit | |
#basic_data1$Spit <- factor(basic_data1$Spit, levels=basic_data1$Spit) | |
# reshape the data from wide to long | |
library(reshape2) | |
basic_data_m <- melt(basic_data2, id.var = "cal_age_k_BP") | |
# plot | |
library(ggplot2) | |
ggplot(basic_data_m, aes(cal_age_k_BP, value, fill = factor(variable))) + | |
geom_bar( stat="identity") + | |
xlab("x 1000 cal years BP") + | |
ylab("count") + | |
labs(fill = "artefact class") + | |
theme_minimal() | |
# Are there any major changes in, say, the discard rates | |
# of complete flakes? In addition to visual inspection
# of the plot, we can use a statistical method | |
# called change point analysis | |
# install.packages("ecp") | |
library(ecp) | |
# let's look at the total number of artefacts per spit | |
totals <- aggregate(value ~ cal_age_k_BP, basic_data_m, FUN=sum) | |
x1 <- as.matrix(totals) | |
class(x1) <- "numeric" | |
y <- e.divisive(X=x1,sig.lvl=0.05,R=499,eps=1e-3,half=1000,k=NULL,min.size=3,alpha=1) | |
y$estimates # what row has a significant change? Ignore if it's just the first and last | |
# a bayesian approach to change point analysis - just another way | |
library(bcp) | |
bcp1 <- bcp(x1[,2], w0 = 0.2, p0 = 0.2, burnin = 50, mcmc = 5000, return.mcmc = FALSE) | |
plot(bcp1) | |
legacyplot(bcp1) | |
summary(bcp1) | |
which(bcp1$posterior.prob > 0.6) | |
# pretty flat... | |
#### Now let's look at some of the individual flake variables | |
## start with mass | |
# make a table of Tukey's five number | |
# summary (minimum, lower-hinge, median, upper-hinge, maximum) | |
cf <- do.call(data.frame, aggregate(data = complete_flakes, Mass..g. ~ cal_age_k_BP, fivenum)) | |
names(cf) <- c("cal_age_k_BP", "minimum", "lower-hinge", "median", "upper-hinge", "maximum") | |
# view the table | |
cf | |
# the median for most spits is less than 1 g | |
# let's make a plot to visualise that table | |
# each dot is a flake | |
ggplot(complete_flakes, aes(cal_age_k_BP, Mass..g.)) + | |
geom_point() + | |
ylab("mass (g)") + | |
xlab("x 1000 cal years BP") + | |
theme_minimal() | |
# if we take the log values of the mass we can | |
# see the spread better and detect trends better | |
# adding a smoother line helps too | |
ggplot(complete_flakes, aes(cal_age_k_BP, log(Mass..g.))) + | |
geom_point() + | |
ylab("log mass (g)") + | |
xlab("x 1000 cal years BP") + | |
geom_smooth(aes(group = 1)) + | |
theme_minimal() | |
# seems like flakes in spits 1-10 | |
# are lighter than the others | |
# So the exploratory data analysis has suggested
# that flake mass might differ between the upper and
# lower spits. We can test this with ANOVA if the
# variance of each group (Spit) is the same.
# Let's check that: | |
complete_flakes$Spit <- as.factor(complete_flakes$Spit) | |
bartlett.test(Mass..g. ~ Spit, data = complete_flakes) | |
# p < 0.05, so the variances are not the same and we should not
# use ANOVA because one of its assumptions has been violated.
# Instead we use a K-sample permutation test, which copes well with unequal variances
library(coin) | |
oneway_test(as.numeric(Mass..g.) ~ Spit, data = complete_flakes, distribution=approximate(B=10000)) | |
# with a p value greater than 0.05 we have an indication that | |
# there are probably no significant non-random differences in | |
# complete flake masses between spits | |
# consider spread of complete flake masses by spit | |
cf <- na.omit(aggregate(data = complete_flakes, Mass..g. ~ factor(Spit), sd)) | |
names(cf) <- c("Spit", "sd") | |
# view the table | |
cf | |
# plot it | |
ggplot(cf, aes(Spit, sd)) + | |
geom_point() | |
# pretty random, no reason to believe there are significant changes in variation | |
# of flake mass over time | |
# Now look into some of the categorical variables... | |
## raw material | |
# clean data | |
complete_flakes$Raw.Material <- gsub("\\s", "", complete_flakes$Raw.Material) | |
# get rid of NAs | |
complete_flakes <- complete_flakes[!is.na(complete_flakes$Spit),] | |
# plot | |
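# (position = "fill" rescales each stacked bar to sum to 1, so the bars show
# the proportion of each raw material within a spit rather than raw counts)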
ggplot(complete_flakes, aes(x = as.factor(Spit), fill = Raw.Material)) + | |
geom_bar(position = "fill") + | |
theme_minimal() + | |
xlab("Spit") + | |
ylab("Proportion of spit") + | |
scale_fill_discrete(name="Raw material") | |
# looks like there's an interesting pattern with red vs black chert, let's check the
# timing of that | |
ggplot(complete_flakes, aes(x = cal_age_k_BP, fill = Raw.Material)) + | |
geom_bar(position = "fill") + | |
theme_minimal() + | |
xlab("x 1000 years cal BP") + | |
ylab("Proportion of Excavation Unit") + | |
scale_fill_discrete(name="Raw material") | |
# seems like from 4-6 ky BP there was a preference for red chert, and then | |
# after that black chert | |
## Amount of cortex per flake | |
# clean data | |
complete_flakes$cortex <- as.numeric(gsub("%", "", complete_flakes$X..Cortex..increments.of.10..)) | |
# plot % cortex per spit | |
ggplot(complete_flakes, aes(x = as.factor(Spit), y = cortex)) + | |
geom_point(position = 'jitter') + # spread the overlapping points apart a little | |
theme_minimal() + | |
geom_smooth(aes(group = 1)) + | |
xlab("Spit") + | |
ylab("Percent of dorsal surface with cortex") | |
# plot with log scale | |
ggplot(complete_flakes, aes(x = as.factor(Spit), y = log(cortex))) + | |
geom_point(position = 'jitter') + # spread the overlapping points apart a little | |
theme_minimal() + | |
# geom_smooth(aes(group = 1)) + | |
xlab("Spit") + | |
ylab("Percent of dorsal surface with cortex") | |
# box and whisker plot | |
ggplot(complete_flakes, aes(x = as.factor(Spit), y = (cortex))) + | |
geom_boxplot() + # box and whisker summary per spit
theme_minimal() + | |
# geom_smooth(aes(group = 1)) + | |
xlab("Spit") + | |
ylab("Percent of dorsal surface with cortex") | |
# doesn't look like any interesting variation here... | |
# plot with years on x-axis | |
ggplot(complete_flakes, aes(x = cal_age_k_BP, y = (cortex))) + | |
geom_point(position = 'jitter') + # spread the overlapping points apart a little | |
theme_minimal() + | |
# geom_smooth(aes(group = 1)) + | |
xlab("x 1000 years cal BP") + | |
ylab("Percent of dorsal surface with cortex") | |
# seems like during 1-2 k cal BP there might be a drop in cortex: most flakes
# have zero % here, but then there are not many flakes in this period anyway.
## Now look at platform types | |
# clean and simplify data | |
complete_flakes$plattype <- complete_flakes$Platform.Type..plain..multiple..faceted..focalised..cortical..crushed. | |
# check how many types we have | |
unique(complete_flakes$plattype) | |
# we have multiple entries for plain, multiple, focalised and crushed... | |
# make all lower case and remove stray spaces | |
complete_flakes$plattype <- gsub("\\s", "", tolower(complete_flakes$plattype)) | |
# fix typos | |
complete_flakes$plattype <- ifelse(complete_flakes$plattype == 'muliple', | |
'multiple', | |
ifelse(complete_flakes$plattype == 'focalised', | |
'focalized', complete_flakes$plattype)) | |
# remove empty fields | |
complete_flakes <- complete_flakes[!(complete_flakes$plattype == ""),] | |
# now that we've got rid of duplicates we can plot
# plot | |
ggplot(complete_flakes, aes(x = as.factor(Spit), fill = plattype)) + | |
geom_bar(position = "fill") + | |
theme_minimal() + | |
xlab("Spit") + | |
ylab("Proportion of spit") + | |
scale_fill_discrete(name="Platform type") | |
ggplot(complete_flakes, aes(x = cal_age_k_BP, fill = plattype)) + | |
geom_bar(position = "fill") + | |
theme_minimal() + | |
xlab("x 1000 years cal BP") + | |
ylab("Proportion of Excavation Unit") + | |
scale_fill_discrete(name="Platform type") | |
# not a lot going on | |
## overhang removal | |
# clean and simplify data | |
complete_flakes$ohr <- complete_flakes$Platform.Preparation..Overhang. | |
unique(complete_flakes$ohr) | |
complete_flakes$ohr <- tolower(complete_flakes$ohr) | |
# remove ambiguous data | |
complete_flakes <- complete_flakes[!complete_flakes$ohr =="oh"] | |
# plot | |
ggplot(complete_flakes, aes(x = as.factor(Spit), fill = ohr)) + | |
geom_bar(position = "fill") + | |
theme_minimal() + | |
xlab("Spit") + | |
ylab("Proportion of spit") + | |
scale_fill_discrete(name="Overhang removal") | |
ggplot(complete_flakes, aes(x = cal_age_k_BP, fill = ohr)) + | |
geom_bar(position = "fill") + | |
theme_minimal() + | |
xlab("x 1000 years cal BP") + | |
ylab("Proportion of Excavation Unit") + | |
scale_fill_discrete(name="Overhang removal") | |
# very consistently no overhang removal | |
# not a lot going on | |
############################################################################### | |
# Notes to self | |
# spit_depths from F:\My Documents\My UW\Research\1308 Sulawesi\spit sheets\Spit depths from spit sheets.xlsx | |
# interpolated_ages from F:\My Documents\My UW\Research\1308 Sulawesi\Dates\Talimbue_interpolated_calibrated_ages.csv |