ARCHY 299/499 lithics lab: snippets of R code for basic exploration of our lithic data
# R script for plotting of basic lithic data from Gua Mo'o hono, Sulawesi | |
#### get data | |
# this block will connect to the internet to access our google sheet | |
# then download it directly into a dataframe ready to work on | |
# If this is the first time, a little preparation is necessary: | |
# In google sheet: File -> Publish to web -> publish all sheets; | |
# Get link to published data -> CSV, all sheets, copy-paste link here... | |
# make a list of the links to all our sheets... | |
gid <- 0:7 | |
goog_sheets <- lapply(gid, function(i) paste0("https://docs.google.com/spreadsheet/pub?key=0As7CmPqGXTzldHA0d2YtNHNQOWUydHF6cEt1eGVNZnc&single=true&gid=",i,"&output=csv")) | |
# access data from our links... | |
install.packages("RCurl") # only need to do this once per computer | |
require(RCurl) | |
options(RCurlOptions = list(capath = system.file("CurlSSL", "cacert.pem", package = "RCurl"), ssl.verifypeer = FALSE)) | |
myCSVs <- lapply(goog_sheets, function(i) getURL(i)) | |
data_sheets <- lapply(myCSVs, function(i) read.csv(textConnection(i), stringsAsFactors = FALSE)) | |
# We now have a list object containing our data sheets, let's access | |
# each item of the list so we can work with them | |
basic_data <- data_sheets[[1]] # first sheet is basic data of counts per spit | |
complete_flake_mass <- data_sheets[[2]] # and so on... | |
broken_flake_mass <- data_sheets[[3]] | |
core_mass <- data_sheets[[4]] | |
retouch_GIUR <- data_sheets[[5]] | |
retouch_dims <- data_sheets[[6]] | |
retouch_notches <- data_sheets[[7]] | |
##### Let's look at the basic data and make a stacked bar plot | |
# subset just the main counts | |
basic_data <- na.omit(basic_data[, 2:7]) | |
# change column names to be more general | |
names(basic_data) <- c("Spit", "non.artefacts", "cores", "complete.flakes", "broken.flakes", "debris") | |
# reshape the data from wide to long | |
library(reshape2) | |
basic_data_m <- melt(basic_data, id.var = "Spit") | |
# plot | |
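# (in the plot below, stat = "identity" tells geom_bar to use the value
# column as the bar heights rather than counting rows, and fill stacks
# the artefact classes within each spit)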
library(ggplot2) | |
ggplot(basic_data_m, aes(factor(Spit), value, fill = factor(variable))) + | |
geom_bar( stat="identity") + | |
xlab("Excavtion unit") + | |
ylab("count") + | |
labs(fill = "artefact class") | |
# let's remove non-artefact and debris | |
basic_data_m <- basic_data_m[ !(basic_data_m[["variable"]] %in% c("debris", "non.artefacts")), ] | |
# plot just the remaining types | |
ggplot(basic_data_m, aes(factor(Spit), value, fill = factor(variable))) + | |
geom_bar( stat="identity") + | |
xlab("Excavtion unit") + | |
ylab("count") + | |
labs(fill = "artefact class") | |
# Are there any major changes in, say, the discard rates | |
# of complete flakes? In addition to visual inspection
# of the plot, we can use a statistical method | |
# called change point analysis | |
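# (change point analysis estimates where in an ordered sequence of values,
# here counts by spit, the underlying distribution shifts, and tests whether
# each candidate shift is statistically significant)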
install.packages("ecp") | |
library(ecp) | |
x1 <- as.matrix(basic_data[["complete.flakes"]]) | |
y <- e.divisive(X=x1,sig.lvl=0.05,R=499,eps=1e-3,half=1000,k=NULL,min.size=3,alpha=1) | |
y$estimates; y$cluster.number | |
# suggests a change in discard rates at spit 14 | |
# a Bayesian approach to change point analysis
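# install.packages("bcp") # only need to do this once per computer
# (bcp returns, for each position in the sequence, the posterior probability
# that a change point occurs there; below we flag positions where that
# probability exceeds a chosen threshold)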
library(bcp) | |
bcp1 <- bcp(x1, w0 = 0.2, p0 = 0.2, burnin = 50, mcmc = 500, return.mcmc = FALSE) | |
plot(bcp1) | |
which(bcp1$posterior.prob > 0.6) | |
# indicates a few more spits (16, 18, etc.) where a change in discard rates has occurred
# how about cores? | |
x1 <- as.matrix(basic_data[["cores"]]) | |
bcp1 <- bcp(x1, w0 = 0.2, p0 = 0.2, burnin = 50, mcmc = 500, return.mcmc = FALSE) | |
plot(bcp1) | |
which(bcp1$posterior.prob > 0.8) | |
# yes, at around spit 18
#### Now let's look at the mass data | |
# let's explore the complete flakes... | |
# make a table of Tukey's five number | |
# summary (minimum, lower-hinge, median, upper-hinge, maximum) | |
cf <- do.call(data.frame, aggregate(data = complete_flake_mass, CFLA..g. ~ factor(Spit), fivenum)) | |
names(cf) <- c("Spit", "minimum", "lower-hinge", "median", "upper-hinge", "maximum") | |
# view the table | |
cf | |
# the median for most spits is less than 1 g | |
# let's plot to visualise that table | |
# each dot is a flake | |
ggplot(complete_flake_mass, aes(factor(Spit), CFLA..g.)) + | |
geom_point() + | |
ylab("mass (g)") + | |
xlab("Spit") | |
# if we take the log values of the mass we can | |
# see the spread better and detect trends better | |
# adding a smoother line helps too | |
ggplot(complete_flake_mass, aes(factor(Spit), log(CFLA..g.))) + | |
geom_point() + | |
ylab("mass (g)") + | |
xlab("Spit") + | |
geom_smooth(aes(group = 1)) | |
# seems like flakes in spits 1-10 | |
# are heavier than the others | |
# plot summary stats of those data | |
# we'll trim the y-axis to minimise the effect | |
# of outliers | |
ggplot(complete_flake_mass, aes(factor(Spit), CFLA..g.)) + | |
geom_boxplot() + # geom_violin() is nice too | |
ylab("mass (g)") + | |
xlab("Spit") + | |
ylim(0,2) # this adjusts the y-axis limits | |
# looks like spits 6-9 are bigger than | |
# other spits | |
# So the exploratory data analysis has suggested
# that flakes in the upper spits might be bigger than those in the
# lower spits. We can test this with ANOVA if the
# variance of each group (Spit) is the same.
# Let's check that: | |
complete_flake_mass$Spit <- as.factor(complete_flake_mass$Spit) | |
bartlett.test(CFLA..g. ~ Spit, data = complete_flake_mass[-(1:4),]) | |
# p < 0.05, so the variances are not the same and we should not
# use ANOVA because one of its assumptions has been violated.
# Instead we use a K-sample permutation test, which copes well with unequal variances
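# (a permutation test builds its null distribution by repeatedly reshuffling
# the group labels, so it is much less dependent on the equal-variance and
# normality assumptions of ANOVA)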
library(coin) | |
oneway_test(as.numeric(CFLA..g.) ~ Spit, data = complete_flake_mass, distribution=approximate(B=10000)) | |
# with a p value greater than 0.05 we have an indication that | |
# there are probably no significant non-random differences in | |
# complete flake masses between spits | |
# consider spread of complete flake masses by spit | |
cf <- na.omit(aggregate(data = complete_flake_mass, CFLA..g. ~ factor(Spit), sd)) | |
names(cf) <- c("Spit", "sd") | |
# view the table | |
cf | |
# plot it | |
ggplot(cf, aes(Spit, sd)) + | |
geom_point() | |
# seems like spit 19 has a very high spread of masses relative to the others | |
##### repeat for core mass
# make a table of Tukey's five number | |
# summary (minimum, lower-hinge, median, upper-hinge, maximum) | |
co <- do.call(data.frame, aggregate(data = core_mass, Core..g. ~ factor(Spit), fivenum)) | |
names(co) <- c("Spit", "minimum", "lower-hinge", "median", "upper-hinge", "maximum") | |
# view the table | |
co | |
# the median for most spits is ~3 g | |
# let's plot to visualise that table | |
# each dot is a core
ggplot(core_mass, aes(factor(Spit), Core..g.)) + | |
geom_point() + | |
ylab("mass (g)") + | |
xlab("Spit") | |
# if we take the log values of the mass we can | |
# see the spread better and detect trends better | |
# adding a smoother line helps too | |
ggplot(core_mass, aes(factor(Spit), log(Core..g.))) + | |
geom_point() + | |
ylab("mass (g)") + | |
xlab("Spit") + | |
geom_smooth(aes(group = 1)) | |
# seems like cores in spits 27-28
# are heavier than the others | |
# plot summary stats of those data | |
# we'll trim the y-axis to minimise the effect | |
# of outliers | |
ggplot(core_mass, aes(factor(Spit), Core..g.)) + | |
geom_boxplot() + # geom_violin() is nice too | |
ylab("mass (g)") + | |
xlab("Spit") + | |
ylim(0,5) # this adjusts the y-axis limits | |
# So the exploratory data analysis has suggested
# that core mass might differ between the upper and
# lower spits. We can test this with ANOVA if the
# variance of each group (Spit) is the same.
# Let's check that: | |
core_mass$Spit <- as.factor(core_mass$Spit) | |
bartlett.test(Core..g. ~ Spit, data = core_mass[-c(1:8,25),]) | |
# p < 0.05, so the variances are not the same and we should not
# use ANOVA because one of its assumptions has been violated.
# Instead we use a K-sample permutation test, which copes well with unequal variances
library(coin) | |
oneway_test(as.numeric(Core..g.) ~ Spit, data = core_mass, distribution=approximate(B=10000)) | |
# with a p value greater than 0.05 we have an indication that | |
# there are probably no significant non-random differences in | |
# core masses between spits
# consider spread of core masses by spit
co <- na.omit(aggregate(data = core_mass, Core..g. ~ factor(Spit), sd)) | |
names(co) <- c("Spit", "sd") | |
# view the table | |
co | |
# plot it | |
ggplot(co, aes(Spit, sd)) + | |
geom_point() | |
# seems like spits 9 and 29 have a very high spread of masses relative to the others | |
# investigate changes in ratios of complete flakes and broken flakes | |
ratio <- na.omit(with(basic_data, Number.of.complete.flakes/Number.of.broken.flakes)) | |
ratio <- data.frame(Spit = basic_data$Spit[1:32], ratio = ratio) | |
library(ggplot2) | |
ggplot(ratio, aes(Spit, ratio)) + | |
geom_point() + | |
geom_smooth() + | |
ylab("ratio of complete flakes to broken flakes") | |
# correlation | |
with(ratio, cor.test(Spit, ratio)) | |
# investigate changes in ratios of complete flakes and cores | |
ratio <- na.omit(with(basic_data, Number.of.complete.flakes/Number.of.cores)) | |
ratio <- data.frame(Spit = basic_data$Spit[1:31], ratio = ratio) | |
library(ggplot2) | |
ggplot(ratio, aes(Spit, ratio)) + | |
geom_bar(stat="identity") + | |
geom_smooth() + | |
ylab("ratio of complete flakes to cores") | |
# correlation | |
with(ratio, cor.test(Spit, ratio)) | |
################################################################## | |
# | |
# GIUR data, reformatted... | |
require(RCurl) | |
options(RCurlOptions = list(capath = system.file("CurlSSL", "cacert.pem", package = "RCurl"), ssl.verifypeer = FALSE)) | |
GIUR <- "https://docs.google.com/spreadsheet/pub?key=0As7CmPqGXTzldHA0d2YtNHNQOWUydHF6cEt1eGVNZnc&single=true&gid=6&output=csv" | |
GIUR <- read.csv(textConnection(getURL(GIUR)), stringsAsFactors = FALSE) | |
# just get newly formatted GIUR data | |
GIURd <- GIUR[1:22,-1] | |
# extract T and t | |
Tval <- GIURd[seq(1, length(GIURd), by=2)] | |
tval <- GIURd[seq(2, length(GIURd), by=2)] | |
# give matching column names (not necessary)
# names(Tval) <- as.numeric(unlist(regmatches(names(Tval), gregexpr('\\(?[0-9,.]+', names(Tval) )))) | |
# names(tval) <- as.numeric(unlist(regmatches(names(tval), gregexpr('\\(?[0-9,.]+', names(tval) )))) | |
# calculate GIUR per zone, per artefact | |
# see here for details: http://www.sciencedirect.com.offcampus.lib.washington.edu/science/article/pii/S0305440309000284 | |
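# (roughly, the GIUR for a zone is t/T: the height of the retouch scar, t,
# divided by the thickness of the flake, T, at that zone, so each zone scores
# between 0, no retouch, and 1, retouch running the full thickness of the edge)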
GIURz <- sapply(1:10, function(i) as.numeric(tval[,i]) / as.numeric(Tval[,i]) ) | |
# calculate total GIUR for each artefact | |
GIURz[is.na(GIURz)] <- 0 # replace NAs with zeros | |
# look at distribution across zones | |
library(ggplot2); library(reshape2) | |
mlt <- melt(GIURz) | |
ggplot(mlt, aes(Var2, value))+ | |
geom_bar(stat="identity") | |
# zones 4 and 5 get the most retouching... | |
# identical to above, sanity check | |
mlt_agg <- aggregate(value ~ Var2, mlt[,2:3], sum) | |
ggplot(mlt_agg, aes(Var2, value))+ | |
geom_bar(stat="identity") | |
# summarise by spit | |
GIURt <- rowSums(GIURz)/10 | |
# get spit numbers back in there | |
spit <- as.numeric(unlist(regmatches(GIUR[1:22,1], gregexpr('\\(?[0-9,.]+', GIUR[1:22,1] )))) | |
GIURT <- data.frame(spit = spit, GIUR = GIURt) | |
# plot | |
library(ggplot2) | |
ggplot(GIURT, aes(spit, GIUR)) + | |
geom_point() + | |
geom_smooth() + | |
ylim(0,1) | |
# another kind of plot... | |
ggplot(GIURT, aes(as.factor(spit), GIUR)) + | |
geom_boxplot() + | |
ylim(0,1) | |
# now get average GIUR per spit | |
GIURav <- aggregate(GIUR ~ spit, GIURT, mean) | |
# plot | |
ggplot(GIURav, aes(spit, GIUR)) + | |
geom_point() + | |
geom_smooth() + | |
ylim(0,1) | |
############################################################# | |
# metrics of retouched flakes | |
require(RCurl) | |
options(RCurlOptions = list(capath = system.file("CurlSSL", "cacert.pem", package = "RCurl"), ssl.verifypeer = FALSE)) | |
met <- "https://docs.google.com/spreadsheet/pub?key=0As7CmPqGXTzldHA0d2YtNHNQOWUydHF6cEt1eGVNZnc&single=true&gid=5&output=csv" | |
met <- read.csv(textConnection(getURL(met)), stringsAsFactors = FALSE) | |
met[is.na(met)] <- 0 # replace NAs with zeros | |
met <- met[,1:5]; met[met=="n/a" | met==""] <- 0; met$Max..Dimension.mm. <- as.numeric(met$Max..Dimension.mm.) | |
library(data.table) | |
met <- data.table(met) | |
# EL = length + maximum width + maximum dimension | |
# EL/M, M = mass in g, we'll call it EL_M because slashes are special characters in R | |
# http://www.academia.edu/235080/A_method_for_estimating_edge_length_from_flake_dimensions_use_and_implications_for_technological_change_in_the_MSA | |
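# (EL is a rough proxy for the amount of usable edge on a flake; dividing by
# mass gives edge length per gram of stone, so a higher EL/M suggests more
# economical use of the raw material)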
met$EL <- with(met, Length..mm. + Max..Dimension.mm. + Max..Width..mm.) | |
met$EL_M <- with(met, EL / Mass..g.) | |
# average mass, length, etc per spit | |
met_av <- met[,lapply(.SD,mean),by="SPIT"] | |
# look for correlations between GIUR and size | |
merg <- merge( GIURav, met_av, by.x = "spit", by.y = "SPIT") | |
with(merg, cor.test( GIUR, Mass..g.)) | |
with(merg, cor.test( GIUR, EL_M)) | |
# look for correlation between GIUR and discard rates | |
cf <- na.omit(basic_data[,c("Spit", "Number.of.complete.flakes")]) | |
mergcf <- merge( merg, cf, by.x = "spit", by.y = "Spit") | |
with(mergcf, cor.test( GIUR, Number.of.complete.flakes)) | |
# Correlation is one of the most commonly
# used statistical techniques for investigating
# relationships in data. A correlation is a
# single number that describes the degree | |
# of relationship between two variables, | |
# which we can label X and Y, just for | |
# example. Correlations can suggest possible | |
# causal relationships but the statistical | |
# test alone is not sufficient to demonstrate | |
# the relationship; you also need to explain it
# with some logic and details of the likely | |
# causal connection. The answer to the | |
# question of how much are the two variables | |
# correlated comes from two numbers: | |
# | |
# r, or the coefficient of correlation. r | |
# takes on a value in the range from -1 to 1. | |
# This measure will show whether the correlation | |
# between variables is strong (close to 1 or -1), | |
# weak or non-existent (zero or nearly zero) | |
# and indicate the direction of this correlation. | |
# A positive value for r means that larger | |
# X values go along with larger Y values while | |
# a negative r value means when X values get | |
# larger, the Y values get smaller. | |
# p, or the probability value. This value
# tells you the probability of obtaining an r-value
# at least as extreme as yours in a collection of random
# data in which there is no real relationship. A
# p-value of 0.05 indicates a 5% chance that a
# correlation this strong would turn up in such a
# random dataset, so a small p-value gives us some
# confidence that we are seeing a real relationship
# in the data rather than noise.
# Scholarly convention tends to hold p values | |
# of less than 0.05 as indicators of a | |
# statistically significant relationship. Above
# 0.05 is not significant. | |
# | |
# Note that the size of the p-value in your | |
# correlation analysis says nothing about the | |
# strength of the relationship between your | |
# two variables - it is possible to have a | |
# highly significant result (very small p-value, | |
# e.g. p = 0.001) for a minuscule effect (i.e.
# an r value close to zero) that might not | |
# be of any real-world importance or archaeological | |
# meaning. | |
# | |
# You need to use your common sense | |
# when interpreting your correlation data; don't | |
# just mindlessly copy and paste them and get | |
# excited when r is big or p-values | |
# are small. Ensure that you're only | |
# getting excited about data that make | |
# sense in the real world. | |
# | |
# We will calculate the correlation coefficient
# to show on the plot and get the probability | |
# value of the null hypothesis (that the | |
# correlation is due to random | |
# processes that we don't care about). This value will | |
# help us test the hypothesis that | |
# the overall slope of the linear | |
# regression is 0 (i.e. there is no
# relationship between the two variables). | |
# A line with slope 0 is horizontal, | |
# which means that Y does not depend on X at all. | |
# | |
# Remember that a correlation is considered to be interesting | |
# only when the p value is less than 0.05. If it's greater | |
# than 0.05, then the correlation is most likely due to | |
# chance and not suggestive of anything interesting. | |
# | |
# see for something to cite in your written work: | |
# http://uwashington.worldcat.org.offcampus.lib.washington.edu/oclc/701053523 |
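# To see how r and p behave, here is a small illustrative sketch on simulated
# data (the variables x, y_related and y_noise are made up for this demo, not
# from our spreadsheets): a genuinely related pair gives a large |r| and a
# small p, while pure noise gives an r near zero and a large p.
set.seed(1)
x <- rnorm(30)
y_related <- 2 * x + rnorm(30)  # y depends on x
y_noise <- rnorm(30)            # y unrelated to x
cor.test(x, y_related)          # large positive r, very small p
cor.test(x, y_noise)            # r near zero, p usually well above 0.05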
# R script for plotting of basic lithic data from Talimbue, Sulawesi | |
#### get data | |
# this block will connect to the internet to access our google sheet | |
# then download it directly into a dataframe ready to work on | |
# If this is the first time, a little preparation is necessary: | |
# In google sheet: File -> Publish to web -> publish all sheets; | |
# Get link to published data -> CSV, all sheets, copy-paste link here... | |
# make a list of the links to all our sheets... | |
gid <- c(0,3,6,7) | |
goog_sheets <- lapply(gid, function(i) paste0("https://docs.google.com/spreadsheet/pub?key=0As7CmPqGXTzldFFfcXYtNURINWtlOGtFbFhwX3hXTGc&single=true&gid=",i,"&output=csv")) | |
# access data from our links... | |
# install.packages("RCurl") # only need to do this once per computer | |
require(RCurl) | |
options(RCurlOptions = list(capath = system.file("CurlSSL", "cacert.pem", package = "RCurl"), ssl.verifypeer = FALSE)) | |
myCSVs <- lapply(goog_sheets, function(i) getURL(i)) | |
data_sheets <- lapply(myCSVs, function(i) read.csv(textConnection(i), stringsAsFactors = FALSE)) | |
# We now have a list object containing our data sheets, let's access | |
# each item of the list so we can work with them | |
basic_data <- data_sheets[[1]] # first sheet is basic data of counts per spit | |
complete_flakes <- data_sheets[[2]] # and so on... | |
spit_depths <- data_sheets[[3]] | |
interpolated_ages <- data.frame(lapply(data_sheets[[4]], as.numeric)) | |
#### Before we get into the lithic data, let's look at the radiocarbon dates | |
# so we can determine the ages of interesting changes in the lithic sequence | |
# first run all the lines at the end of this file in the 'end matter' | |
# to load the depth-age data | |
# round depths to ensure a match with depths in the other table | |
spit_depths$depth_m <- as.numeric(round(spit_depths$depth_m, 2) - 1.61) | |
spit_depths$spit <- as.numeric(spit_depths$spit) | |
# merge the dates, spits and depths together | |
library(data.table) | |
dt1<-data.table(spit_depths, key="depth_m") | |
dt2<-data.table(interpolated_ages, key="depth_m") | |
# join | |
joined.dt1.dt.2 <- dt1[dt2] | |
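# (with both tables keyed on depth_m, dt1[dt2] performs a keyed join: for each
# row of dt2 it looks up the row of dt1 with the matching depth, attaching the
# spit numbers to the interpolated ages)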
# remove spits that are NA | |
joined.dt1.dt.2 <- joined.dt1.dt.2[!is.na(joined.dt1.dt.2$spit),] | |
# merge dates with artefact data | |
basic_data$spit <- as.numeric(basic_data$Spit) | |
basic_data <- data.table(basic_data, key = "spit") | |
setkeyv(joined.dt1.dt.2, "spit")
basic_data <- basic_data[joined.dt1.dt.2] | |
# keep only spits for which we have data | |
basic_data <- basic_data[!is.na(basic_data$Spit),] | |
# merge with complete flake data | |
complete_flakes$spit <- as.numeric(complete_flakes$Spit) | |
complete_flakes <- data.table(complete_flakes, key = "spit") | |
setkeyv(joined.dt1.dt.2, "spit")
complete_flakes <- complete_flakes[joined.dt1.dt.2, allow.cartesian=TRUE] | |
# keep only spits for which we have data | |
complete_flakes <- complete_flakes[!is.na(complete_flakes$Spit),] | |
#### Let's look at the basic data and make a stacked bar plot | |
# subset just the main counts | |
basic_data1 <- basic_data[, names(basic_data) %in% c('Spit', 'Broken.flake', 'complete.flake', 'Core', 'Retouched.piece'), with = FALSE]
# change column names to be more general | |
setnames(basic_data1, c("Spit", "broken.flakes", "complete.flakes", "cores", "retouched.pieces")) | |
# put it in order by spit | |
basic_data1$Spit <- factor(basic_data1$Spit, levels=basic_data1$Spit) | |
# reshape the data from wide to long | |
library(reshape2) | |
basic_data_m <- melt(basic_data1, id.var = "Spit") | |
# plot of overall discard rates per spit | |
library(ggplot2) | |
ggplot(basic_data_m, aes(factor(Spit), value, fill = factor(variable))) + | |
geom_bar( stat="identity") + | |
xlab("Excavtion unit") + | |
ylab("count") + | |
labs(fill = "artefact class") + | |
theme_minimal() | |
# plot again, this time by age instead of spit | |
# subset just the main counts | |
basic_data2 <- basic_data[, names(basic_data) %in% c('cal_age_k_BP', 'Broken.flake', 'complete.flake', 'Core', 'Retouched.piece'), with = FALSE]
# change column names to be more general | |
setnames(basic_data2, c("broken.flakes", "complete.flakes", "cores", | |
"retouched.pieces", "cal_age_k_BP")) | |
# put it in order by spit | |
#basic_data1$Spit <- factor(basic_data1$Spit, levels=basic_data1$Spit) | |
# reshape the data from wide to long | |
library(reshape2) | |
basic_data_m <- melt(basic_data2, id.var = "cal_age_k_BP") | |
# plot | |
library(ggplot2) | |
ggplot(basic_data_m, aes(cal_age_k_BP, value, fill = factor(variable))) + | |
geom_bar( stat="identity") + | |
xlab("x 1000 cal years BP") + | |
ylab("count") + | |
labs(fill = "artefact class") + | |
theme_minimal() | |
# Are there any major changes in, say, the discard rates | |
# of complete flakes? In addition to visual inspection
# of the plot, we can use a statistical method | |
# called change point analysis | |
# install.packages("ecp") | |
library(ecp) | |
# let's look at the total number of artefacts per spit | |
totals <- aggregate(value ~ cal_age_k_BP, basic_data_m, FUN=sum) | |
x1 <- as.matrix(totals) | |
class(x1) <- "numeric" | |
y <- e.divisive(X=x1,sig.lvl=0.05,R=499,eps=1e-3,half=1000,k=NULL,min.size=3,alpha=1) | |
y$estimates # what row has a significant change? Ignore if it's just the first and last | |
# a bayesian approach to change point analysis - just another way | |
library(bcp) | |
bcp1 <- bcp(x1[,2], w0 = 0.2, p0 = 0.2, burnin = 50, mcmc = 5000, return.mcmc = FALSE) | |
plot(bcp1) | |
legacyplot(bcp1) | |
summary(bcp1) | |
which(bcp1$posterior.prob > 0.6) | |
# pretty flat... | |
#### Now let's look at some of the individual flake variables | |
## start with mass | |
# make a table of Tukey's five number | |
# summary (minimum, lower-hinge, median, upper-hinge, maximum) | |
cf <- do.call(data.frame, aggregate(data = complete_flakes, Mass..g. ~ cal_age_k_BP, fivenum)) | |
names(cf) <- c("cal_age_k_BP", "minimum", "lower-hinge", "median", "upper-hinge", "maximum") | |
# view the table | |
cf | |
# the median for most spits is less than 1 g | |
# let's make a plot to visualise that table | |
# each dot is a flake | |
ggplot(complete_flakes, aes(cal_age_k_BP, Mass..g.)) + | |
geom_point() + | |
ylab("mass (g)") + | |
xlab("x 1000 cal years BP") + | |
theme_minimal() | |
# if we take the log values of the mass we can | |
# see the spread better and detect trends better | |
# adding a smoother line helps too | |
ggplot(complete_flakes, aes(cal_age_k_BP, log(Mass..g.))) + | |
geom_point() + | |
ylab("log mass (g)") + | |
xlab("x 1000 cal years BP") + | |
geom_smooth(aes(group = 1)) + | |
theme_minimal() | |
# seems like flakes in spits 1-10 | |
# are lighter than the others | |
# So the exploratory data analysis has suggested
# that flake mass might differ between the upper and
# lower spits. We can test this with ANOVA if the
# variance of each group (Spit) is the same.
# Let's check that: | |
complete_flakes$Spit <- as.factor(complete_flakes$Spit) | |
bartlett.test(Mass..g. ~ Spit, data = complete_flakes) | |
# p < 0.05, so the variances are not the same and we should not
# use ANOVA because one of its assumptions has been violated.
# Instead we use a K-sample permutation test, which copes well with unequal variances
library(coin) | |
oneway_test(as.numeric(Mass..g.) ~ Spit, data = complete_flakes, distribution=approximate(B=10000)) | |
# with a p value greater than 0.05 we have an indication that | |
# there are probably no significant non-random differences in | |
# complete flake masses between spits | |
# consider spread of complete flake masses by spit | |
cf <- na.omit(aggregate(data = complete_flakes, Mass..g. ~ factor(Spit), sd)) | |
names(cf) <- c("Spit", "sd") | |
# view the table | |
cf | |
# plot it | |
ggplot(cf, aes(Spit, sd)) + | |
geom_point() | |
# pretty random, no reason to believe there are significant changes in variation | |
# of flake mass over time | |
# Now look into some of the categorical variables... | |
## raw material | |
# clean data | |
complete_flakes$Raw.Material <- gsub("\\s", "", complete_flakes$Raw.Material) | |
# get rid of NAs | |
complete_flakes <- complete_flakes[!is.na(complete_flakes$Spit),] | |
# plot | |
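# (position = "fill" rescales each stacked bar to sum to 1, so the bars show
# the proportion of each raw material within a spit rather than raw counts)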
ggplot(complete_flakes, aes(x = as.factor(Spit), fill = Raw.Material)) + | |
geom_bar(position = "fill") + | |
theme_minimal() + | |
xlab("Spit") + | |
ylab("Proportion of spit") + | |
scale_fill_discrete(name="Raw material") | |
# looks like there's an interesting pattern with red vs black chert, let's check the
# timing of that | |
ggplot(complete_flakes, aes(x = cal_age_k_BP, fill = Raw.Material)) + | |
geom_bar(position = "fill") + | |
theme_minimal() + | |
xlab("x 1000 years cal BP") + | |
ylab("Proportion of Excavation Unit") + | |
scale_fill_discrete(name="Raw material") | |
# seems like from 4-6 ky BP there was a preference for red chert, and then | |
# after that black chert | |
## Amount of cortex per flake | |
# clean data | |
complete_flakes$cortex <- as.numeric(gsub("%", "", complete_flakes$X..Cortex..increments.of.10..)) | |
# plot % cortex per spit | |
ggplot(complete_flakes, aes(x = as.factor(Spit), y = cortex)) + | |
geom_point(position = 'jitter') + # spread the overlapping points apart a little | |
theme_minimal() + | |
geom_smooth(aes(group = 1)) + | |
xlab("Spit") + | |
ylab("Percent of dorsal surface with cortex") | |
# plot with log scale | |
ggplot(complete_flakes, aes(x = as.factor(Spit), y = log(cortex))) + | |
geom_point(position = 'jitter') + # spread the overlapping points apart a little | |
theme_minimal() + | |
# geom_smooth(aes(group = 1)) + | |
xlab("Spit") + | |
ylab("Percent of dorsal surface with cortex") | |
# box and whisker plot | |
ggplot(complete_flakes, aes(x = as.factor(Spit), y = (cortex))) + | |
geom_boxplot() + # box and whisker summary per spit
theme_minimal() + | |
# geom_smooth(aes(group = 1)) + | |
xlab("Spit") + | |
ylab("Percent of dorsal surface with cortex") | |
# doesn't look like any interesting variation here... | |
# plot with years on x-axis | |
ggplot(complete_flakes, aes(x = cal_age_k_BP, y = (cortex))) + | |
geom_point(position = 'jitter') + # spread the overlapping points apart a little | |
theme_minimal() + | |
# geom_smooth(aes(group = 1)) + | |
xlab("x 1000 years cal BP") + | |
ylab("Percent of dorsal surface with cortex") | |
# seems like during 1-2 k cal BP there might be a drop in cortex: most flakes
# have zero % here, but then there are not many flakes in this period anyway.
## Now look at platform types | |
# clean and simplify data | |
complete_flakes$plattype <- complete_flakes$Platform.Type..plain..multiple..faceted..focalised..cortical..crushed. | |
# check how many types we have | |
unique(complete_flakes$plattype) | |
# we have multiple entries for plain, multiple, focalised and crushed... | |
# make all lower case and remove stray spaces | |
complete_flakes$plattype <- gsub("\\s", "", tolower(complete_flakes$plattype)) | |
# fix typos | |
complete_flakes$plattype <- ifelse(complete_flakes$plattype == 'muliple', | |
'multiple', | |
ifelse(complete_flakes$plattype == 'focalised', | |
'focalized', complete_flakes$plattype)) | |
# remove empty fields | |
complete_flakes <- complete_flakes[!(complete_flakes$plattype == ""),] | |
# now that we've got rid of duplicates we can plot
# plot | |
ggplot(complete_flakes, aes(x = as.factor(Spit), fill = plattype)) + | |
geom_bar(position = "fill") + | |
theme_minimal() + | |
xlab("Spit") + | |
ylab("Proportion of spit") + | |
scale_fill_discrete(name="Platform type") | |
ggplot(complete_flakes, aes(x = cal_age_k_BP, fill = plattype)) + | |
geom_bar(position = "fill") + | |
theme_minimal() + | |
xlab("x 1000 years cal BP") + | |
ylab("Proportion of Excavation Unit") + | |
scale_fill_discrete(name="Platform type") | |
# not a lot going on | |
## overhang removal | |
# clean and simplify data | |
complete_flakes$ohr <- complete_flakes$Platform.Preparation..Overhang. | |
unique(complete_flakes$ohr) | |
complete_flakes$ohr <- tolower(complete_flakes$ohr) | |
# remove ambiguous data | |
complete_flakes <- complete_flakes[!complete_flakes$ohr =="oh"] | |
# plot | |
ggplot(complete_flakes, aes(x = as.factor(Spit), fill = ohr)) + | |
geom_bar(position = "fill") + | |
theme_minimal() + | |
xlab("Spit") + | |
ylab("Proportion of spit") + | |
scale_fill_discrete(name="Overhang removal") | |
ggplot(complete_flakes, aes(x = cal_age_k_BP, fill = ohr)) + | |
geom_bar(position = "fill") + | |
theme_minimal() + | |
xlab("x 1000 years cal BP") + | |
ylab("Proportion of Excavation Unit") + | |
scale_fill_discrete(name="Overhang removal") | |
# very consistently no overhang removal | |
# not a lot going on | |
############################################################################### | |
# Notes to self | |
# spit_depths from F:\My Documents\My UW\Research\1308 Sulawesi\spit sheets\Spit depths from spit sheets.xlsx | |
# interpolated_ages from F:\My Documents\My UW\Research\1308 Sulawesi\Dates\Talimbue_interpolated_calibrated_ages.csv |