Last active
November 26, 2017 22:03
-
-
Save Oreotrephes/20532b0ad66aaffcb4aaac28f699f385 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #okay this seems to work fine: | |
| #out of 816 rows to start with | |
| #extracts corolla lengths for 611 | |
| #capsule lengths for 436 | |
| #both for 432 | |
| #I've checked the first 15 rows or so and spot-checked others, and this seems to be working. | |
| #However, I don't know why the first function (capsule) seems to want greedy (i.e. "capsule.*") | |
| #while the second function (corolla) seems to want non-greedy (i.e. "corolla.*?") | |
| #changing either breaks the code... | |
| #Contents: | |
| #20-47: function to pull out capsule lengths and units | |
| #50-81: function to pull out corolla lengths and units | |
| #85-98: code to test it against the "flora_of_china" sample csv | |
| #100-125: code to apply each to the "flora4" full csv | |
# Shared worker behind text_process_fr() and text_process_fl().
#
# Lower-cases `string`, strips width and outlier annotations from the
# measurement that follows `organ`, and returns the low limit, high limit and
# unit of that measurement.
#
# Arguments:
#   string  a single description text (coerced to character by gsub()).
#   organ   lower-case keyword that anchors every pattern, e.g. "capsule".
#   lazy    FALSE -> the keyword is followed by greedy `.*`, so the LAST
#                    measurement after the first mention of `organ` wins;
#           TRUE  -> the keyword is followed by lazy `.*?`, so the FIRST
#                    measurement after the first mention of `organ` wins.
#
# Returns: character vector of length 3, c(low, high, units). All three are
# NA when no measurement is found; `high` is "" when the text gives a single
# value rather than a range.
.extract_length_range <- function(string, organ, lazy = FALSE) {
  # NOTE(review): the original line read gsub(" ", " ", string), a no-op as
  # displayed; the hosting page warns that the file holds hidden Unicode, so
  # the first argument is assumed to have been a no-break space (U+00A0).
  # TODO confirm against the original file.
  string <- gsub("\u00a0", " ", string, fixed = TRUE)
  string <- stringr::str_to_lower(string)

  prefix <- paste0(organ, if (lazy) ".*?" else ".*")
  num <- "\\d*\\.?\\d*"
  num_range <- paste0(num, "-?", num)

  # Each clean-up pattern captures (text up to the annotation, what to keep
  # after it); on a match the string is rebuilt from those two groups, which
  # also drops everything before `organ` and after the unit.
  cleanup_patterns <- c(
    # width, e.g. " x 10 cm" or " x ca. 4-6 mm"
    paste0("(", prefix, ") x (?:ca\\. )*", num_range, " ([cm]m)"),
    # high outlier, e.g. "(-15)"
    paste0("(", prefix, ")\\(-", num, "\\) ([cm]m)"),
    # low outlier, e.g. "(9-)"
    paste0("(", prefix, ")\\(", num, "-\\)(", num_range, " [cm]m)")
  )
  for (pattern in cleanup_patterns) {
    hit <- stringr::str_match(string, pattern)
    # Linear indexing (hit[2], not hit[1, 2]) yields NA instead of an error
    # when str_match() returns a zero-row matrix.
    if (!is.na(hit[2])) {
      string <- paste(hit[2], hit[3])
    }
  }

  # Finally pull out the low and high range limits and the unit.
  final_pattern <- paste0(prefix, " (", num, ")-?(", num, ") ([cm]m)")
  hit <- stringr::str_match(
    string,
    stringr::regex(final_pattern, ignore_case = TRUE)
  )
  as.vector(hit)[2:4]
}

#' Extract capsule (fruit) length from a description
#'
#' Uses greedy matching after the keyword "capsule": when "capsule" is
#' mentioned in passing earlier in the text, the measurement taken is the last
#' one in the text rather than the first one after that early mention.
#'
#' @param string A single description text.
#' @return Character vector `c(low, high, units)`; all `NA` if nothing matched.
#' @examples
#' text_process_fr("Capsule ovate or cylindric-ovate, 7-10 mm, densely woolly.")
#' # "7" "10" "mm"
text_process_fr <- function(string) {
  .extract_length_range(string, organ = "capsule", lazy = FALSE)
}

#' Extract corolla (flower) length from a description
#'
#' Uses lazy matching after the keyword "corolla", so the measurement taken is
#' the first one following the keyword and later measurements (tube, capsule,
#' ...) are ignored.
#'
#' @param string A single description text.
#' @return Character vector `c(low, high, units)`; all `NA` if nothing matched.
#' @examples
#' text_process_fl("corolla funnelform, white, 1.5-2.2 cm; tube 7-10 mm")
#' # "1.5" "2.2" "cm"
text_process_fl <- function(string) {
  .extract_length_range(string, organ = "corolla", lazy = TRUE)
}
| ########### | |
| #testing it | |
| #test it | |
# Try the capsule extractor on the sample csv; the description text is taken
# from the first column.
flora_of_china <- readr::read_csv("~/Downloads/flora of china.csv") #or, wherever it actually is

# One call per row. vapply() guarantees a 3 x nrow character matrix (rows:
# low, high, units) and, unlike 1:dim(x)[1], copes with a zero-row file.
sample_sizes <- vapply(
  as.character(flora_of_china[[1]]),
  text_process_fr,
  character(3),
  USE.NAMES = FALSE
)
flora_of_china$result_1 <- sample_sizes[1, ]
flora_of_china$result_2 <- sample_sizes[2, ]
flora_of_china$result_3 <- sample_sizes[3, ]
| ############# | |
| ############# | |
| #applying it | |
# Apply both extractors to the full csv; descriptions live in column "input".
flora <- read.csv("~/Downloads/flora4.csv") #or wherever it actually is
View(flora)

stopifnot("input" %in% names(flora))
# as.character() also covers the case where read.csv() produced a factor.
descriptions <- as.character(flora[["input"]])

# Each vapply() result is a 3 x nrow character matrix (rows: low, high,
# units), which also behaves correctly for a zero-row file, unlike
# 1:dim(flora)[1].

#apply it: capsule
capsule_sizes <- vapply(
  descriptions,
  text_process_fr,
  character(3),
  USE.NAMES = FALSE
)
flora$fr_size_low <- capsule_sizes[1, ]
flora$fr_size_high <- capsule_sizes[2, ]
flora$fr_size_units <- capsule_sizes[3, ]

#apply it: corolla
corolla_sizes <- vapply(
  descriptions,
  text_process_fl,
  character(3),
  USE.NAMES = FALSE
)
flora$fl_size_low <- corolla_sizes[1, ]
flora$fl_size_high <- corolla_sizes[2, ]
flora$fl_size_units <- corolla_sizes[3, ]
| #to check | |
| #write.table(flora,"flora_with_lengths.csv",sep=",") | |
| ############# |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment