ARCHY 299/499 lithics lab: snippets of R code for basic exploration of our lithic data
# R script for plotting basic lithic data from Gua Mo'o hono, Sulawesi
#### get data
# this block will connect to the internet to access our google sheet
# then download it directly into a dataframe ready to work on
# If this is the first time, a little preparation is necessary:
# In google sheet: File -> Publish to web -> publish all sheets;
# Get link to published data -> CSV, all sheets, copy-paste link here...
# make a list of the links to all our sheets...
gid <- 0:7
goog_sheets <- lapply(gid, function(i) paste0("https://docs.google.com/spreadsheet/pub?key=0As7CmPqGXTzldHA0d2YtNHNQOWUydHF6cEt1eGVNZnc&single=true&gid=",i,"&output=csv"))
# access data from our links...
install.packages("RCurl") # only need to do this once per computer
require(RCurl)
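# the next line points RCurl at its bundled CA certificate and relaxes
# peer verification so HTTPS requests to Google succeed on some setups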
options(RCurlOptions = list(capath = system.file("CurlSSL", "cacert.pem", package = "RCurl"), ssl.verifypeer = FALSE))
myCSVs <- lapply(goog_sheets, function(i) getURL(i))
data_sheets <- lapply(myCSVs, function(i) read.csv(textConnection(i), stringsAsFactors = FALSE))
# We now have a list object containing our data sheets, let's access
# each item of the list so we can work with them
basic_data <- data_sheets[[1]] # first sheet is basic data of counts per spit
complete_flake_mass <- data_sheets[[2]] # and so on...
broken_flake_mass <- data_sheets[[3]]
core_mass <- data_sheets[[4]]
retouch_GIUR <- data_sheets[[5]]
retouch_dims <- data_sheets[[6]]
retouch_notches <- data_sheets[[7]]
##### Let's look at the basic data and make a stacked bar plot
# subset just the main counts
basic_data <- na.omit(basic_data[, 2:7])
# change column names to be more general
names(basic_data) <- c("Spit", "non.artefacts", "cores", "complete.flakes", "broken.flakes", "debris")
# reshape the data from wide to long
library(reshape2)
basic_data_m <- melt(basic_data, id.var = "Spit")
# plot
library(ggplot2)
ggplot(basic_data_m, aes(factor(Spit), value, fill = factor(variable))) +
geom_bar( stat="identity") +
xlab("Excavtion unit") +
ylab("count") +
labs(fill = "artefact class")
# let's remove non-artefact and debris
basic_data_m <- basic_data_m[ !(basic_data_m[["variable"]] %in% c("debris", "non.artefacts")), ]
# plot just the remaining types
ggplot(basic_data_m, aes(factor(Spit), value, fill = factor(variable))) +
geom_bar( stat="identity") +
xlab("Excavtion unit") +
ylab("count") +
labs(fill = "artefact class")
# Are there any major changes in, say, the discard rates
# of complete flakes? In addition to visual inspection
# of the plot, we can use a statistical method
# called change point analysis
install.packages("ecp")
library(ecp)
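# e.divisive estimates change points by hierarchically dividing the series,
# testing each candidate division with a permutation test: sig.lvl is the
# significance threshold, R the number of permutations, and min.size the
# minimum number of observations between change points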
x1 <- as.matrix(basic_data[["complete.flakes"]])
y <- e.divisive(X=x1,sig.lvl=0.05,R=499,eps=1e-3,half=1000,k=NULL,min.size=3,alpha=1)
y$estimates; y$cluster.number
# suggests a change in discard rates at spit 14
# a bayesian approach to change point analysis
library(bcp)
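# bcp fits the Barry & Hartigan Bayesian change point model: w0 and p0 are
# priors on the signal-to-noise ratio and the probability of a change at each
# position, burnin/mcmc set the iteration counts, and the result's
# posterior.prob gives the posterior probability of a change at each row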
bcp1 <- bcp(x1, w0 = 0.2, p0 = 0.2, burnin = 50, mcmc = 500, return.mcmc = FALSE)
plot(bcp1)
which(bcp1$posterior.prob > 0.6)
# indicates a few more spits (16, 18, etc.) where change in discard rates has occurred
# how about cores?
x1 <- as.matrix(basic_data[["cores"]])
bcp1 <- bcp(x1, w0 = 0.2, p0 = 0.2, burnin = 50, mcmc = 500, return.mcmc = FALSE)
plot(bcp1)
which(bcp1$posterior.prob > 0.8)
# yes about spit 18
#### Now let's look at the mass data
# let's explore the complete flakes...
# make a table of Tukey's five number
# summary (minimum, lower-hinge, median, upper-hinge, maximum)
cf <- do.call(data.frame, aggregate(data = complete_flake_mass, CFLA..g. ~ factor(Spit), fivenum))
names(cf) <- c("Spit", "minimum", "lower-hinge", "median", "upper-hinge", "maximum")
# view the table
cf
# the median for most spits is less than 1 g
# let's plot to visualise that table
# each dot is a flake
ggplot(complete_flake_mass, aes(factor(Spit), CFLA..g.)) +
geom_point() +
ylab("mass (g)") +
xlab("Spit")
# if we take the log values of the mass we can
# see the spread better and detect trends better
# adding a smoother line helps too
ggplot(complete_flake_mass, aes(factor(Spit), log(CFLA..g.))) +
geom_point() +
ylab("mass (g)") +
xlab("Spit") +
geom_smooth(aes(group = 1))
# seems like flakes in spits 1-10
# are heavier than the others
# plot summary stats of those data
# we'll trim the y-axis to minimise the effect
# of outliers
ggplot(complete_flake_mass, aes(factor(Spit), CFLA..g.)) +
geom_boxplot() + # geom_violin() is nice too
ylab("mass (g)") +
xlab("Spit") +
ylim(0,2) # this adjusts the y-axis limits
# looks like spits 6-9 are bigger than
# other spits
# So the exploratory data analysis has suggested
# that the upper spits might be bigger than the
# lower spits. We can test this with ANOVA if the
# variance of each group (Spit) is the same.
# Let's check that:
complete_flake_mass$Spit <- as.factor(complete_flake_mass$Spit)
bartlett.test(CFLA..g. ~ Spit, data = complete_flake_mass[-(1:4),])
# p < 0.05, so varience is not the same, we should not
# use ANOVA because of its assumptions has been violated
# Instead we use K-sample permutation test, which is good for unequal variances
library(coin)
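# distribution = approximate(B = 10000) estimates the permutation p-value
# from 10000 Monte Carlo resamples rather than the full permutation distribution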
oneway_test(as.numeric(CFLA..g.) ~ Spit, data = complete_flake_mass, distribution=approximate(B=10000))
# with a p value greater than 0.05 we have an indication that
# there are probably no significant non-random differences in
# complete flake masses between spits
# consider spread of complete flake masses by spit
cf <- na.omit(aggregate(data = complete_flake_mass, CFLA..g. ~ factor(Spit), sd))
names(cf) <- c("Spit", "sd")
# view the table
cf
# plot it
ggplot(cf, aes(Spit, sd)) +
geom_point()
# seems like spit 19 has a very high spread of masses relative to the others
##### repeat for core mass
# make a table of Tukey's five number
# summary (minimum, lower-hinge, median, upper-hinge, maximum)
co <- do.call(data.frame, aggregate(data = core_mass, Core..g. ~ factor(Spit), fivenum))
names(co) <- c("Spit", "minimum", "lower-hinge", "median", "upper-hinge", "maximum")
# view the table
co
# the median for most spits is ~3 g
# let's plot to visualise that table
# each dot is a core
ggplot(core_mass, aes(factor(Spit), Core..g.)) +
geom_point() +
ylab("mass (g)") +
xlab("Spit")
# if we take the log values of the mass we can
# see the spread better and detect trends better
# adding a smoother line helps too
ggplot(core_mass, aes(factor(Spit), log(Core..g.))) +
geom_point() +
ylab("mass (g)") +
xlab("Spit") +
geom_smooth(aes(group = 1))
# seems like cores in spits 27-28
# are heavier than the others
# plot summary stats of those data
# we'll trim the y-axis to minimise the effect
# of outliers
ggplot(core_mass, aes(factor(Spit), Core..g.)) +
geom_boxplot() + # geom_violin() is nice too
ylab("mass (g)") +
xlab("Spit") +
ylim(0,5) # this adjusts the y-axis limits
# So the exploratory data analysis has suggested
# that the lower spits might be bigger than the
# upper spits. We can test this with ANOVA if the
# variance of each group (Spit) is the same.
# Let's check that:
core_mass$Spit <- as.factor(core_mass$Spit)
bartlett.test(Core..g. ~ Spit, data = core_mass[-c(1:8,25),])
# p < 0.05, so varience is not the same, we should not
# use ANOVA because of its assumptions has been violated
# Instead we use K-sample permutation test, which is good for unequal variances
library(coin)
oneway_test(as.numeric(Core..g.) ~ Spit, data = core_mass, distribution=approximate(B=10000))
# with a p value greater than 0.05 we have an indication that
# there are probably no significant non-random differences in
# core masses between spits
# consider spread of core masses by spit
co <- na.omit(aggregate(data = core_mass, Core..g. ~ factor(Spit), sd))
names(co) <- c("Spit", "sd")
# view the table
co
# plot it
ggplot(co, aes(Spit, sd)) +
geom_point()
# seems like spits 9 and 29 have a very high spread of masses relative to the others
# investigate changes in ratios of complete flakes and broken flakes
ratio <- na.omit(with(basic_data, Number.of.complete.flakes/Number.of.broken.flakes))
ratio <- data.frame(Spit = basic_data$Spit[1:32], ratio = ratio)
library(ggplot2)
ggplot(ratio, aes(Spit, ratio)) +
geom_point() +
geom_smooth() +
ylab("ratio of complete flakes to broken flakes")
# correlation
with(ratio, cor.test(Spit, ratio))
# investigate changes in ratios of complete flakes and cores
ratio <- na.omit(with(basic_data, Number.of.complete.flakes/Number.of.cores))
ratio <- data.frame(Spit = basic_data$Spit[1:31], ratio = ratio)
library(ggplot2)
ggplot(ratio, aes(Spit, ratio)) +
geom_bar(stat="identity") +
geom_smooth() +
ylab("ratio of complete flakes to cores")
# correlation
with(ratio, cor.test(Spit, ratio))
##################################################################
#
# GIUR data, reformatted...
require(RCurl)
options(RCurlOptions = list(capath = system.file("CurlSSL", "cacert.pem", package = "RCurl"), ssl.verifypeer = FALSE))
GIUR <- "https://docs.google.com/spreadsheet/pub?key=0As7CmPqGXTzldHA0d2YtNHNQOWUydHF6cEt1eGVNZnc&single=true&gid=6&output=csv"
GIUR <- read.csv(textConnection(getURL(GIUR)), stringsAsFactors = FALSE)
# just get newly formatted GIUR data
GIURd <- GIUR[1:22,-1]
# extract T and t
Tval <- GIURd[seq(1, length(GIURd), by=2)]
tval <- GIURd[seq(2, length(GIURd), by=2)]
# give matching column names (not necessary)
# names(Tval) <- as.numeric(unlist(regmatches(names(Tval), gregexpr('\\(?[0-9,.]+', names(Tval) ))))
# names(tval) <- as.numeric(unlist(regmatches(names(tval), gregexpr('\\(?[0-9,.]+', names(tval) ))))
# calculate GIUR per zone, per artefact
# see here for details: http://www.sciencedirect.com.offcampus.lib.washington.edu/science/article/pii/S0305440309000284
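# for each zone, t is the height of the retouch scar and T the thickness of
# the flake at that zone, so t/T runs from 0 (no retouch) to 1 (retouch that
# reaches the full thickness); the flake's GIUR is the mean across the zones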
GIURz <- sapply(1:10, function(i) as.numeric(tval[,i]) / as.numeric(Tval[,i]) )
# calculate total GIUR for each artefact
GIURz[is.na(GIURz)] <- 0 # replace NAs with zeros
# look at distribution across zones
library(ggplot2); library(reshape2)
mlt <- melt(GIURz)
ggplot(mlt, aes(Var2, value))+
geom_bar(stat="identity")
# zones 4 and 5 get the most retouching...
# identical to above, sanity check
mlt_agg <- aggregate(value ~ Var2, mlt[,2:3], sum)
ggplot(mlt_agg, aes(Var2, value))+
geom_bar(stat="identity")
# summarise by spit
GIURt <- rowSums(GIURz)/10
# get spit numbers back in there
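# the regular expression pulls the numeric part out of each spit label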
spit <- as.numeric(unlist(regmatches(GIUR[1:22,1], gregexpr('\\(?[0-9,.]+', GIUR[1:22,1] ))))
GIURT <- data.frame(spit = spit, GIUR = GIURt)
# plot
library(ggplot2)
ggplot(GIURT, aes(spit, GIUR)) +
geom_point() +
geom_smooth() +
ylim(0,1)
# another kind of plot...
ggplot(GIURT, aes(as.factor(spit), GIUR)) +
geom_boxplot() +
ylim(0,1)
# now get average GIUR per spit
GIURav <- aggregate(GIUR ~ spit, GIURT, mean)
# plot
ggplot(GIURav, aes(spit, GIUR)) +
geom_point() +
geom_smooth() +
ylim(0,1)
#############################################################
# metrics of retouched flakes
require(RCurl)
options(RCurlOptions = list(capath = system.file("CurlSSL", "cacert.pem", package = "RCurl"), ssl.verifypeer = FALSE))
met <- "https://docs.google.com/spreadsheet/pub?key=0As7CmPqGXTzldHA0d2YtNHNQOWUydHF6cEt1eGVNZnc&single=true&gid=5&output=csv"
met <- read.csv(textConnection(getURL(met)), stringsAsFactors = FALSE)
met[is.na(met)] <- 0 # replace NAs with zeros
met <- met[,1:5]; met[met=="n/a" | met==""] <- 0; met$Max..Dimension.mm. <- as.numeric(met$Max..Dimension.mm.)
library(data.table)
met <- data.table(met)
# EL = length + maximum width + maximum dimension
# EL/M, M = mass in g, we'll call it EL_M because slashes are special characters in R
# http://www.academia.edu/235080/A_method_for_estimating_edge_length_from_flake_dimensions_use_and_implications_for_technological_change_in_the_MSA
met$EL <- with(met, Length..mm. + Max..Dimension.mm. + Max..Width..mm.)
met$EL_M <- with(met, EL / Mass..g.)
# average mass, length, etc per spit
met_av <- met[,lapply(.SD,mean),by="SPIT"]
# look for correlations between GIUR and size
merg <- merge( GIURav, met_av, by.x = "spit", by.y = "SPIT")
with(merg, cor.test( GIUR, Mass..g.))
with(merg, cor.test( GIUR, EL_M))
# look for correlation between GIUR and discard rates
cf <- na.omit(basic_data[,c("Spit", "Number.of.complete.flakes")])
mergcf <- merge( merg, cf, by.x = "spit", by.y = "Spit")
with(mergcf, cor.test( GIUR, Number.of.complete.flakes))
# Correlation is the most commonly
# used statistical technique to investigate
# any kind of data. A correlation is a
# single number that describes the degree
# of relationship between two variables,
# which we can label X and Y, just for
# example. Correlations can suggest possible
# causal relationships but the statistical
# test alone is not sufficient to demonstrate
# the relationship; you also need to explain it
# with some logic and details of the likely
# causal connection. The answer to the
# question of how much are the two variables
# correlated comes from two numbers:
#
# r, or the coefficient of correlation. r
# takes on a value in the range from -1 to 1.
# This measure will show whether the correlation
# between variables is strong (close to 1 or -1),
# weak or non-existent (zero or nearly zero)
# and indicate the direction of this correlation.
# A positive value for r means that larger
# X values go along with larger Y values while
# a negative r value means when X values get
# larger, the Y values get smaller.
#
# p, or the probability value. The p-value
# tells you the probability of seeing an r-value
# at least this large in random data in which
# there is no real relationship. A p-value of
# 0.05 indicates a 5% chance that a correlation
# this strong would turn up in a purely random
# dataset, so a small p-value gives you some
# confidence that you are observing a real
# relationship in your data. Scholarly convention
# tends to hold p values of less than 0.05 as
# indicators of a statistically significant
# relationship. Above 0.05 is not significant.
#
# Note that the size of the p-value in your
# correlation analysis says nothing about the
# strength of the relationship between your
# two variables - it is possible to have a
# highly significant result (a very small p-value,
# e.g. p = 0.001) for a minuscule effect (i.e.
# an r value close to zero) that might not
# be of any real-world importance or archaeological
# meaning.
#
# You need to use your common sense
# when interpreting your correlation data; don't
# just mindlessly copy and paste them and get
# excited when r is big or p-values
# are small. Ensure that you're only
# getting excited about data that make
# sense in the real world.
#
# We will calculate the correlation coefficient
# to show on the plot and get the probability
# value of the null hypothesis (that the
# correlation is due to random
# processes that we don't care about). This value will
# help us test the hypothesis that
# the overall slope of the linear
# regression is 0 (i.e. there is no
# relationship between the two variables).
# A line with slope 0 is horizontal,
# which means that Y does not depend on X at all.
#
# Remember that a correlation is considered to be interesting
# only when the p value is less than 0.05. If it's greater
# than 0.05, then the correlation is most likely due to
# chance and not suggestive of anything interesting.
#
# see for something to cite in your written work:
# http://uwashington.worldcat.org.offcampus.lib.washington.edu/oclc/701053523
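#
# To make the distinction between r and p concrete, here is a small
# simulated example (synthetic data, not from our assemblage) showing
# that a very large sample can make a tiny correlation 'significant':
set.seed(42) # fix the random number generator so the result is reproducible
x_sim <- rnorm(10000)
y_sim <- 0.05 * x_sim + rnorm(10000) # a very weak true relationship
cor.test(x_sim, y_sim) # r comes out near 0.05, yet p is far below 0.05
#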
# R script for plotting basic lithic data from Talimbue, Sulawesi
#### get data
# this block will connect to the internet to access our google sheet
# then download it directly into a dataframe ready to work on
# If this is the first time, a little preparation is necessary:
# In google sheet: File -> Publish to web -> publish all sheets;
# Get link to published data -> CSV, all sheets, copy-paste link here...
# make a list of the links to all our sheets...
gid <- c(0,3,6,7)
goog_sheets <- lapply(gid, function(i) paste0("https://docs.google.com/spreadsheet/pub?key=0As7CmPqGXTzldFFfcXYtNURINWtlOGtFbFhwX3hXTGc&single=true&gid=",i,"&output=csv"))
# access data from our links...
# install.packages("RCurl") # only need to do this once per computer
require(RCurl)
options(RCurlOptions = list(capath = system.file("CurlSSL", "cacert.pem", package = "RCurl"), ssl.verifypeer = FALSE))
myCSVs <- lapply(goog_sheets, function(i) getURL(i))
data_sheets <- lapply(myCSVs, function(i) read.csv(textConnection(i), stringsAsFactors = FALSE))
# We now have a list object containing our data sheets, let's access
# each item of the list so we can work with them
basic_data <- data_sheets[[1]] # first sheet is basic data of counts per spit
complete_flakes <- data_sheets[[2]] # and so on...
spit_depths <- data_sheets[[3]]
interpolated_ages <- data.frame(lapply(data_sheets[[4]], as.numeric))
#### Before we get into the lithic data, let's look at the radiocarbon dates
# so we can determine the ages of interesting changes in the lithic sequence
# first run all the lines at the end of this file in the 'end matter'
# to load the depth-age data
# round depths to ensure a match with depths in the other table
spit_depths$depth_m <- as.numeric(round(spit_depths$depth_m, 2) - 1.61)
spit_depths$spit <- as.numeric(spit_depths$spit)
# merge the dates, spits and depths together
library(data.table)
dt1<-data.table(spit_depths, key="depth_m")
dt2<-data.table(interpolated_ages, key="depth_m")
# join
joined.dt1.dt.2 <- dt1[dt2]
# remove spits that are NA
joined.dt1.dt.2 <- joined.dt1.dt.2[!is.na(joined.dt1.dt.2$spit),]
# merge dates with artefact data
basic_data$spit <- as.numeric(basic_data$Spit)
basic_data <- data.table(basic_data, key = "spit")
setkey(joined.dt1.dt.2, key = "spit")
basic_data <- basic_data[joined.dt1.dt.2]
# keep only spits for which we have data
basic_data <- basic_data[!is.na(basic_data$Spit),]
# merge with complete flake data
complete_flakes$spit <- as.numeric(complete_flakes$Spit)
complete_flakes <- data.table(complete_flakes, key = "spit")
setkey(joined.dt1.dt.2, key = "spit")
complete_flakes <- complete_flakes[joined.dt1.dt.2, allow.cartesian=TRUE]
# keep only spits for which we have data
complete_flakes <- complete_flakes[!is.na(complete_flakes$Spit),]
#### Let's look at the basic data and make a stacked bar plot
# subset just the main counts
basic_data1 <- basic_data[, names(basic_data) %in% c('Spit', 'Broken.flake', 'complete.flake', 'Core', 'Retouched.piece'), ,with = FALSE]
# change column names to be more general
setnames(basic_data1, c("Spit", "broken.flakes", "complete.flakes", "cores", "retouched.pieces"))
# put it in order by spit
basic_data1$Spit <- factor(basic_data1$Spit, levels=basic_data1$Spit)
# reshape the data from wide to long
library(reshape2)
basic_data_m <- melt(basic_data1, id.var = "Spit")
# plot of overall discard rates per spit
library(ggplot2)
ggplot(basic_data_m, aes(factor(Spit), value, fill = factor(variable))) +
geom_bar( stat="identity") +
xlab("Excavtion unit") +
ylab("count") +
labs(fill = "artefact class") +
theme_minimal()
# plot again, this time by age instead of spit
# subset just the main counts
basic_data2 <- basic_data[, names(basic_data) %in% c('cal_age_k_BP', 'Broken.flake', 'complete.flake', 'Core', 'Retouched.piece'), ,with = FALSE]
# change column names to be more general
setnames(basic_data2, c("broken.flakes", "complete.flakes", "cores",
"retouched.pieces", "cal_age_k_BP"))
# (no need to reorder by spit here since we plot by age)
#basic_data1$Spit <- factor(basic_data1$Spit, levels=basic_data1$Spit)
# reshape the data from wide to long
library(reshape2)
basic_data_m <- melt(basic_data2, id.var = "cal_age_k_BP")
# plot
library(ggplot2)
ggplot(basic_data_m, aes(cal_age_k_BP, value, fill = factor(variable))) +
geom_bar( stat="identity") +
xlab("x 1000 cal years BP") +
ylab("count") +
labs(fill = "artefact class") +
theme_minimal()
# Are there any major changes in, say, the discard rates
# of complete flakes? In addition to visual inspection
# of the plot, we can use a statistical method
# called change point analysis
# install.packages("ecp")
library(ecp)
# let's look at the total number of artefacts per spit
totals <- aggregate(value ~ cal_age_k_BP, basic_data_m, FUN=sum)
x1 <- as.matrix(totals)
class(x1) <- "numeric"
y <- e.divisive(X=x1,sig.lvl=0.05,R=499,eps=1e-3,half=1000,k=NULL,min.size=3,alpha=1)
y$estimates # what row has a significant change? Ignore if it's just the first and last
# a bayesian approach to change point analysis - just another way
library(bcp)
bcp1 <- bcp(x1[,2], w0 = 0.2, p0 = 0.2, burnin = 50, mcmc = 5000, return.mcmc = FALSE)
plot(bcp1)
legacyplot(bcp1)
summary(bcp1)
which(bcp1$posterior.prob > 0.6)
# pretty flat...
#### Now let's look at some of the individual flake variables
## start with mass
# make a table of Tukey's five number
# summary (minimum, lower-hinge, median, upper-hinge, maximum)
cf <- do.call(data.frame, aggregate(data = complete_flakes, Mass..g. ~ cal_age_k_BP, fivenum))
names(cf) <- c("cal_age_k_BP", "minimum", "lower-hinge", "median", "upper-hinge", "maximum")
# view the table
cf
# the median for most spits is less than 1 g
# let's make a plot to visualise that table
# each dot is a flake
ggplot(complete_flakes, aes(cal_age_k_BP, Mass..g.)) +
geom_point() +
ylab("mass (g)") +
xlab("x 1000 cal years BP") +
theme_minimal()
# if we take the log values of the mass we can
# see the spread better and detect trends better
# adding a smoother line helps too
ggplot(complete_flakes, aes(cal_age_k_BP, log(Mass..g.))) +
geom_point() +
ylab("log mass (g)") +
xlab("x 1000 cal years BP") +
geom_smooth(aes(group = 1)) +
theme_minimal()
# seems like flakes in spits 1-10
# are lighter than the others
# So the exploratory data analysis has suggested
# that the upper spits might be smaller than the
# lower spits. We can test this with ANOVA if the
# variance of each group (Spit) is the same.
# Let's check that:
complete_flakes$Spit <- as.factor(complete_flakes$Spit)
bartlett.test(Mass..g. ~ Spit, data = complete_flakes)
# p < 0.05, so varience is not the same, we should not
# use ANOVA because of its assumptions has been violated
# Instead we use K-sample permutation test, which is good for unequal variances
library(coin)
oneway_test(as.numeric(Mass..g.) ~ Spit, data = complete_flakes, distribution=approximate(B=10000))
# with a p value greater than 0.05 we have an indication that
# there are probably no significant non-random differences in
# complete flake masses between spits
# consider spread of complete flake masses by spit
cf <- na.omit(aggregate(data = complete_flakes, Mass..g. ~ factor(Spit), sd))
names(cf) <- c("Spit", "sd")
# view the table
cf
# plot it
ggplot(cf, aes(Spit, sd)) +
geom_point()
# pretty random, no reason to believe there are significant changes in variation
# of flake mass over time
# Now look into some of the categorical variables...
## raw material
# clean data
complete_flakes$Raw.Material <- gsub("\\s", "", complete_flakes$Raw.Material)
# get rid of NAs
complete_flakes <- complete_flakes[!is.na(complete_flakes$Spit),]
# plot
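# position = "fill" stacks the bars and rescales each to sum to 1, so the
# bars show proportions rather than raw counts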
ggplot(complete_flakes, aes(x = as.factor(Spit), fill = Raw.Material)) +
geom_bar(position = "fill") +
theme_minimal() +
xlab("Spit") +
ylab("Proportion of spit") +
scale_fill_discrete(name="Raw material")
# looks like some interesting pattern with red vs black chert, let's check the
# timing of that
ggplot(complete_flakes, aes(x = cal_age_k_BP, fill = Raw.Material)) +
geom_bar(position = "fill") +
theme_minimal() +
xlab("x 1000 years cal BP") +
ylab("Proportion of Excavation Unit") +
scale_fill_discrete(name="Raw material")
# seems like from 4-6 ky BP there was a preference for red chert, and then
# after that black chert
## Amount of cortex per flake
# clean data
complete_flakes$cortex <- as.numeric(gsub("%", "", complete_flakes$X..Cortex..increments.of.10..))
# plot % cortex per spit
ggplot(complete_flakes, aes(x = as.factor(Spit), y = cortex)) +
geom_point(position = 'jitter') + # spread the overlapping points apart a little
theme_minimal() +
geom_smooth(aes(group = 1)) +
xlab("Spit") +
ylab("Percent of dorsal surface with cortex")
# plot with log scale
ggplot(complete_flakes, aes(x = as.factor(Spit), y = log(cortex))) +
geom_point(position = 'jitter') + # spread the overlapping points apart a little
theme_minimal() +
# geom_smooth(aes(group = 1)) +
xlab("Spit") +
ylab("Percent of dorsal surface with cortex")
# box and whisker plot
ggplot(complete_flakes, aes(x = as.factor(Spit), y = (cortex))) +
geom_boxplot() +
theme_minimal() +
# geom_smooth(aes(group = 1)) +
xlab("Spit") +
ylab("Percent of dorsal surface with cortex")
# doesn't look like any interesting variation here...
# plot with years on x-axis
ggplot(complete_flakes, aes(x = cal_age_k_BP, y = (cortex))) +
geom_point(position = 'jitter') + # spread the overlapping points apart a little
theme_minimal() +
# geom_smooth(aes(group = 1)) +
xlab("x 1000 years cal BP") +
ylab("Percent of dorsal surface with cortex")
# seems like during 1-2 k cal BP there might be a drop in cortex; most flakes
# have zero % here, though there are not many flakes in this period anyway.
## Now look at platform types
# clean and simplify data
complete_flakes$plattype <- complete_flakes$Platform.Type..plain..multiple..faceted..focalised..cortical..crushed.
# check how many types we have
unique(complete_flakes$plattype)
# we have several variant spellings and cases of plain, multiple, focalised and crushed...
# make all lower case and remove stray spaces
complete_flakes$plattype <- gsub("\\s", "", tolower(complete_flakes$plattype))
# fix typos
complete_flakes$plattype <- ifelse(complete_flakes$plattype == 'muliple',
'multiple',
ifelse(complete_flakes$plattype == 'focalised',
'focalized', complete_flakes$plattype))
# remove empty fields
complete_flakes <- complete_flakes[!(complete_flakes$plattype == ""),]
# now that we've got rid of the duplicates we can plot
ggplot(complete_flakes, aes(x = as.factor(Spit), fill = plattype)) +
geom_bar(position = "fill") +
theme_minimal() +
xlab("Spit") +
ylab("Proportion of spit") +
scale_fill_discrete(name="Platform type")
ggplot(complete_flakes, aes(x = cal_age_k_BP, fill = plattype)) +
geom_bar(position = "fill") +
theme_minimal() +
xlab("x 1000 years cal BP") +
ylab("Proportion of Excavation Unit") +
scale_fill_discrete(name="Platform type")
# not a lot going on
## overhang removal
# clean and simplify data
complete_flakes$ohr <- complete_flakes$Platform.Preparation..Overhang.
unique(complete_flakes$ohr)
complete_flakes$ohr <- tolower(complete_flakes$ohr)
# remove ambiguous data
complete_flakes <- complete_flakes[!complete_flakes$ohr =="oh"]
# plot
ggplot(complete_flakes, aes(x = as.factor(Spit), fill = ohr)) +
geom_bar(position = "fill") +
theme_minimal() +
xlab("Spit") +
ylab("Proportion of spit") +
scale_fill_discrete(name="Overhang removal")
ggplot(complete_flakes, aes(x = cal_age_k_BP, fill = ohr)) +
geom_bar(position = "fill") +
theme_minimal() +
xlab("x 1000 years cal BP") +
ylab("Proportion of Excavation Unit") +
scale_fill_discrete(name="Overhang removal")
# very consistently no overhang removal
# not a lot going on
###############################################################################
# Notes to self
# spit_depths from F:\My Documents\My UW\Research\1308 Sulawesi\spit sheets\Spit depths from spit sheets.xlsx
# interpolated_ages from F:\My Documents\My UW\Research\1308 Sulawesi\Dates\Talimbue_interpolated_calibrated_ages.csv