Skip to content

Instantly share code, notes, and snippets.

@Oreotrephes
Last active November 26, 2017 22:03
Show Gist options
  • Select an option

  • Save Oreotrephes/20532b0ad66aaffcb4aaac28f699f385 to your computer and use it in GitHub Desktop.

Select an option

Save Oreotrephes/20532b0ad66aaffcb4aaac28f699f385 to your computer and use it in GitHub Desktop.
#okay this seems to work fine:
#out of 816 rows to start with
#extracts corolla lengths for 611
#capsule lengths for 436
#both for 432
#I've checked the first 15 rows or so and spot-checked others, and this seems to be working.
#However, I don't know why the first function (capsule) seems to want greedy (i.e. "capsule.*")
#while the second function (corolla) seems to want non-greedy (i.e. "corolla.*?").
#Changing either breaks the code...
#Contents:
#20-47: function to pull out capsule lengths and units
#50-81: function to pull out corolla lengths and units
#85-98: code to test it against the "flora_of_china" sample csv
#100-125: code to apply each to the "flora4" full csv
# Extract the capsule (fruit) length range and its unit from a description.
#
# The text is lower-cased; then, for the text following the first "capsule",
# width measurements (" x 10 cm") and parenthesised outliers ("(-15)",
# "(9-)") are stripped, and finally the "low-high unit" measurement is read.
#
# Args:
#   string: a single description (anything gsub() coerces to one string).
#
# Returns:
#   Character vector of length 3: lower limit, upper limit, unit ("cm"/"mm").
#   All three are NA when nothing matches; the upper limit is "" when the
#   text gives a single number rather than a range.
text_process_fr <- function(string) {
  # Preprocessing. NOTE(review): the original gsub() pattern looked identical
  # to its replacement, presumably a non-breaking space lost in copying; it is
  # spelled as an escape here so the intent survives copy/paste -- confirm.
  string <- gsub("\u00a0", " ", string)
  string <- stringr::str_to_lower(string)

  # Each cleanup pattern captures (1) the text to keep and (2) the trailing
  # unit/measurement; on a match the string is rebuilt from those two groups,
  # which also discards everything after the matched measurement.
  # ".*" is deliberately greedy so the last measurement after "capsule" wins.
  cleanup_patterns <- c(
    # width measurement (" x 10 cm")
    "(capsule.*) x (?:ca\\. )*\\d*\\.?\\d*-?\\d*\\.?\\d* ([cm]m)",
    # high outlier ("(-15)")
    "(capsule.*)\\(-\\d*\\.?\\d*\\) ([cm]m)",
    # low outlier ("(9-)")
    "(capsule.*)\\(\\d*\\.?\\d*-\\)(\\d*\\.?\\d*-?\\d*\\.?\\d* [cm]m)"
  )
  for (pattern in cleanup_patterns) {
    strmatch <- stringr::str_match(string, stringr::regex(pattern))
    # str_match() returns a one-row matrix: [full match, group 1, group 2],
    # so linear indices 2 and 3 are the two capture groups.
    if (!is.na(strmatch[2])) {
      string <- paste(strmatch[2], strmatch[3])
    }
  }

  # Finally get low and high range limits and units of measurement.
  pattern <- "capsule.* (\\d*\\.?\\d*)-?(\\d*\\.?\\d*) ([cm]m)"
  strmatch <- stringr::str_match(
    string,
    stringr::regex(pattern, ignore_case = TRUE)
  )
  c(strmatch[2], strmatch[3], strmatch[4])
}
# Extract the corolla (flower) length range and its unit from a description.
#
# The text is lower-cased; then, for the text following the first "corolla",
# width measurements (" x 10 cm") and parenthesised outliers ("(-15)",
# "(9-)") are stripped, and finally the "low-high unit" measurement is read.
#
# Args:
#   string: a single description (anything gsub() coerces to one string).
#
# Returns:
#   Character vector of length 3: lower limit, upper limit, unit ("cm"/"mm").
#   All three are NA when nothing matches; the upper limit is "" when the
#   text gives a single number rather than a range.
text_process_fl <- function(string) {
  # Preprocessing. NOTE(review): the original gsub() pattern looked identical
  # to its replacement, presumably a non-breaking space lost in copying; it is
  # spelled as an escape here so the intent survives copy/paste -- confirm.
  string <- gsub("\u00a0", " ", string)
  string <- stringr::str_to_lower(string)

  # Each cleanup pattern captures (1) the text to keep and (2) the trailing
  # unit/measurement; on a match the string is rebuilt from those two groups,
  # which also discards everything after the matched measurement.
  # ".*?" is deliberately non-greedy so the first measurement after "corolla"
  # wins rather than one belonging to a later organ in the description.
  cleanup_patterns <- c(
    # width measurement (" x 10 cm")
    "(corolla.*?) x (?:ca\\. )*\\d*\\.?\\d*-?\\d*\\.?\\d* ([cm]m)",
    # high outlier ("(-15)")
    "(corolla.*?)\\(-\\d*\\.?\\d*\\) ([cm]m)",
    # low outlier ("(9-)")
    "(corolla.*?)\\(\\d*\\.?\\d*-\\)(\\d*\\.?\\d*-?\\d*\\.?\\d* [cm]m)"
  )
  for (pattern in cleanup_patterns) {
    strmatch <- stringr::str_match(string, stringr::regex(pattern))
    # str_match() returns a one-row matrix: [full match, group 1, group 2],
    # so linear indices 2 and 3 are the two capture groups.
    if (!is.na(strmatch[2])) {
      string <- paste(strmatch[2], strmatch[3])
    }
  }

  # Finally get low and high range limits and units of measurement.
  pattern <- "corolla.*? (\\d*\\.?\\d*)-?(\\d*\\.?\\d*) ([cm]m)"
  strmatch <- stringr::str_match(
    string,
    stringr::regex(pattern, ignore_case = TRUE)
  )
  c(strmatch[2], strmatch[3], strmatch[4])
}
###########
# Testing it: run the capsule extractor over the "flora of china" sample csv.
# Adds three character columns holding, for the first column of each row,
# the extracted low limit (result_1), high limit (result_2) and unit
# (result_3).
flora_of_china <- readr::read_csv("~/Downloads/flora of china.csv") #or, wherever it actually is

# Preallocate the result columns so the loop never assigns into a missing
# column, and so a zero-row file still ends up with the three columns.
n_rows <- nrow(flora_of_china)
flora_of_china$result_1 <- rep(NA_character_, n_rows)
flora_of_china$result_2 <- rep(NA_character_, n_rows)
flora_of_china$result_3 <- rep(NA_character_, n_rows)

# seq_len() yields an empty sequence for zero rows (1:0 would run twice).
for (i in seq_len(n_rows)) {
  # [[1]][i] extracts the cell as a plain value rather than a 1x1 tibble.
  grist <- text_process_fr(flora_of_china[[1]][i])
  flora_of_china$result_1[i] <- grist[1]
  flora_of_china$result_2[i] <- grist[2]
  flora_of_china$result_3[i] <- grist[3]
}
#############
#############
# Applying it: extract capsule and corolla lengths for every row of the
# full "flora4" csv, reading the description from its "input" column.
flora <- read.csv("~/Downloads/flora4.csv") #or wherever it actually is

# View() opens a data viewer; only do that in an interactive session.
if (interactive()) {
  View(flora)
}

n_rows <- nrow(flora)

# apply it: capsule
# Preallocate so the loop never assigns into a missing column.
flora$fr_size_low <- rep(NA_character_, n_rows)
flora$fr_size_high <- rep(NA_character_, n_rows)
flora$fr_size_units <- rep(NA_character_, n_rows)
# seq_len() yields an empty sequence for zero rows (1:0 would run twice).
for (i in seq_len(n_rows)) {
  grist <- text_process_fr(flora[i, "input"])
  flora$fr_size_low[i] <- grist[1]
  flora$fr_size_high[i] <- grist[2]
  flora$fr_size_units[i] <- grist[3]
}

# apply it: corolla
flora$fl_size_low <- rep(NA_character_, n_rows)
flora$fl_size_high <- rep(NA_character_, n_rows)
flora$fl_size_units <- rep(NA_character_, n_rows)
for (i in seq_len(n_rows)) {
  grist <- text_process_fl(flora[i, "input"])
  flora$fl_size_low[i] <- grist[1]
  flora$fl_size_high[i] <- grist[2]
  flora$fl_size_units[i] <- grist[3]
}

#to check
#write.table(flora,"flora_with_lengths.csv",sep=",")
#############
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment