Last active
June 2, 2020 23:18
-
-
Save ibartomeus/6a00a9345d295dd626b8dfeaa093475c to your computer and use it in GitHub Desktop.
Ideas on how to parse data stored as text with complex structure.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#question: Can we create a heuristic to parse this type of data:
#Input example: | |
Halictus crenicornis | |
GALICIA: 1♀, Monte do Gozo, Santiago de Compostela (La Coruña), 350 m, 5.VIII.2016, 29TNH404481. – 1♀, Río Castro, Cerdedo (Pontevedra), 20.VII.1996. – 1♀, Oca (Pontevedra), 20.VII.1996. | |
ASTURIAS: 1♂, 1♀, Raitán, Carreño (Asturias), 130 m, 30TTP72, 17.VIII.2005. – 1♀, Poreño, Villaviciosa (Asturias), 43,426443º, -5,445950º, 13.V.2015, sobre flor de Centaurea nigra, C. Guardado leg. – 1♀, Poreño, Villaviciosa (Asturias), 43,426443º, -5,445950º, 27.V.2014, M. Miñarro leg. – 1♀, Muñiz (Asturias), 14.VII.2016, sobre flor de Taraxacum, D. Luna leg. | |
#Desired output (csv): | |
Species, CCAA, female, male, locality, province, elevation, date, UTM, latitude, longitude, notes | |
Halictus crenicornis, Galicia, 1, 0, "Monte do Gozo, Santiago de Compostela", La Coruña, 350, 5.VIII.2016, 29TNH404481, NA, NA, NA | |
Halictus crenicornis, Galicia, 1, 0, "Río Castro, Cerdedo", Pontevedra, NA , 20.VII.1996, NA, NA, NA, NA | |
Halictus crenicornis, Galicia, 1, 0, "Oca", Pontevedra, NA, 20.VII.1996, NA, NA, NA, NA | |
Halictus crenicornis, Asturias, 1, 1, "Raitán, Carreño", Asturias, 130, 17.VIII.2005, 30TTP72, NA, NA, NA | |
Halictus crenicornis, Asturias, 1, 0, "Poreño, Villaviciosa", Asturias, NA, 13.V.2015, NA, 43.426443, -5.445950, "sobre flor de Centaurea nigra, C. Guardado leg." | |
Halictus crenicornis, Asturias, 1, 0, "Poreño, Villaviciosa", Asturias, NA, 27.V.2014, NA, 43.426443, -5.445950, "M. Miñarro leg." | |
Halictus crenicornis, Asturias, 1, 0, "Muñiz", Asturias, NA, 14.VII.2016, NA, NA, NA, "sobre flor de Taraxacum, D. Luna leg." | |
#There are more than 1000 entries, so doing it by hand may be painful. | |
#Ideas: Use regexp to...
Detect CCAA as "TEXT:"
Detect sex as "0♂, 0♀" (♂ = male, ♀ = female)
Detect province as "(text)"
Detect elevation as "0 m"
Detect date as "00.TEXT.0000"
Detect UTM as "00TEX00"
Detect latitude, longitude as "00,00000º, -0,00000º"
Locality, notes, and missing pieces of data would be harder. No idea how to do that.
Reordering multi-record lines into columns while preserving CCAA and species is also challenging. We can detect the split by "–".
I am sure errata and inconsistencies are likely to emerge across the 1000 entries, just by chance.
#final question: Is that doable programmatically, or is it better to do it by hand?
#Here is a dirty option, but it works:
# Example input: the species name on its own line, followed by one
# "REGION: records" line per region (CCAA). Records inside a line are
# separated by an en dash.
text <- c(
  "Halictus crenicornis",
  "GALICIA: 1♀, Monte do Gozo, Santiago de Compostela (La Coruña), 350 m, 5.VIII.2016, 29TNH404481. – 1♀, Río Castro, Cerdedo (Pontevedra), 20.VII.1996. – 1♀, Oca (Pontevedra), 20.VII.1996.",
  "ASTURIAS: 1♂, 1♀, Raitán, Carreño (Asturias), 130 m, 30TTP72, 17.VIII.2005. – 1♀, Poreño, Villaviciosa (Asturias), 43,426443º, -5,445950º, 13.V.2015, sobre flor de Centaurea nigra, C. Guardado leg. – 1♀, Poreño, Villaviciosa (Asturias), 43,426443º, -5,445950º, 27.V.2014, M. Miñarro leg. – 1♀, Muñiz (Asturias), 14.VII.2016, sobre flor de Taraxacum, D. Luna leg."
)
#a generic function: | |
#' Parse free-text specimen records for one species into a data frame
#'
#' The input holds one species line (a line without a colon) and any number
#' of region lines of the form "REGION: record – record – ...", where the
#' records are separated by an en dash. Each record becomes one row.
#'
#' @param text Character vector: the species name plus one element per
#'   region. Blank and `NA` elements are ignored.
#' @return A data frame with one row per record and the columns `raw`
#'   (trimmed record text), `CCAA` (text before the first colon), `females`
#'   and `males` (integer counts, 0 when the sex is not listed),
#'   `provinces`, `eleveations` (metres, digits only; the misspelled name is
#'   kept so existing callers keep working), `dates`, `UTMs`, `lats`,
#'   `longs` (decimal point instead of decimal comma), `localities` and
#'   `species`. All columns except the two counts are character; a field
#'   that is not found is `NA`.
#' @details Stops with an error unless exactly one species line is present.
parse_records <- function(text) {
  # First match of `pattern` in each element of `x`: capture group `group`,
  # or the whole match when `group = 0`. NA where nothing matches.
  first_match <- function(pattern, x, group = 1L) {
    hits <- regmatches(x, regexec(pattern, x))
    vapply(
      hits,
      function(hit) {
        if (length(hit) > group) hit[[group + 1L]] else NA_character_
      },
      character(1),
      USE.NAMES = FALSE
    )
  }

  # Trim whitespace and turn empty strings into NA.
  blank_to_na <- function(x) {
    x <- trimws(x)
    x[!is.na(x) & !nzchar(x)] <- NA_character_
    x
  }

  # Number written in front of a sex symbol; 0 when the symbol is absent.
  count_sex <- function(symbol, x) {
    n <- first_match(paste0("([0-9]+)[[:space:]]*", symbol), x)
    n[is.na(n)] <- "0"
    as.integer(n)
  }

  text <- trimws(as.character(text))
  text <- text[!is.na(text) & nzchar(text)]

  # Region lines are recognised by their "REGION:" prefix rather than by
  # their length, so long species names and short region lines both work.
  colon_at <- as.integer(regexpr(":", text, fixed = TRUE))
  is_record <- colon_at > 0L
  species <- text[!is_record]
  if (length(species) != 1L) {
    stop(
      "expected exactly one species line (a line without ':'), found ",
      length(species),
      call. = FALSE
    )
  }

  # Split at the FIRST colon only, so a colon inside a record is harmless.
  region_lines <- text[is_record]
  region_colon <- colon_at[is_record]
  region <- trimws(substr(region_lines, 1L, region_colon - 1L))
  obs <- substr(region_lines, region_colon + 1L, nchar(region_lines))

  pieces <- strsplit(obs, "–", fixed = TRUE)
  raw <- trimws(as.character(unlist(pieces)))
  ccaa <- rep(region, lengths(pieces))
  keep <- nzchar(raw) # drop empty pieces left by a leading/trailing dash
  raw <- raw[keep]
  ccaa <- ccaa[keep]

  if (length(raw) == 0L) {
    none <- character(0)
    return(data.frame(
      raw = none, CCAA = none,
      females = integer(0), males = integer(0),
      provinces = none, eleveations = none, dates = none, UTMs = none,
      lats = none, longs = none, localities = none, species = none,
      stringsAsFactors = FALSE
    ))
  }

  # Province: the first parenthetical that is NOT directly followed by
  # another parenthetical, so "Sierra (o Ponce) (Murcia)" yields "Murcia".
  province_pattern <-
    "[(]([^()]*)[)]([[:space:]]*[^([:space:]]|[[:space:]]*$)"
  provinces <- blank_to_na(first_match(province_pattern, raw))
  province_at <- as.integer(regexpr(province_pattern, raw))

  # Locality: everything before the province, minus the leading sex counts
  # (one or several, e.g. "1♂, 1♀, ").
  before_province <- substr(raw, 1L, province_at - 1L)
  before_province[province_at < 1L] <- NA_character_
  sex_prefix <- "^([0-9]+[[:space:]]*(♀|♂)[[:space:]]*,?[[:space:]]*)+"
  localities <- blank_to_na(sub(sex_prefix, "", before_province))

  # Elevation: digits followed by a stand-alone "m".
  eleveations <- first_match(
    "([0-9]+)[[:space:]]*m([^[:alnum:]]|$)", raw
  )

  dates <- first_match("[0-9]+[.][A-Z]+[.][0-9]+", raw, group = 0L)
  UTMs <- first_match("[0-9]+[A-Z]+[0-9]+", raw, group = 0L)

  # Coordinates are read as a "lat, long" pair so that either value may be
  # negative or positive. Groups: 1 = latitude, 3 = longitude.
  coord <- "-?[0-9]+[.,][0-9]+"
  degree <- "[[:space:]]*(º|°)"
  coord_pair <- paste0(
    "(", coord, ")", degree, "[[:space:]]*,[[:space:]]*",
    "(", coord, ")", degree
  )
  lats <- gsub(",", ".", first_match(coord_pair, raw, 1L), fixed = TRUE)
  longs <- gsub(",", ".", first_match(coord_pair, raw, 3L), fixed = TRUE)

  # notes: whatever is left in `raw`; not extracted.
  data.frame(
    raw = raw,
    CCAA = ccaa,
    females = count_sex("♀", raw),
    males = count_sex("♂", raw),
    provinces = provinces,
    eleveations = eleveations,
    dates = dates,
    UTMs = UTMs,
    lats = lats,
    longs = longs,
    localities = localities,
    species = rep(species, length(raw)),
    stringsAsFactors = FALSE
  )
}
# Run the parser on the short example and print the resulting data frame.
parse_records(text)
#try with a longer dataset: | |
# Longer example: the species line followed by one line per region (CCAA).
# The data are kept exactly as published, including their inconsistencies
# (doubled commas, missing dates, free-text dates such as "verano 2007"),
# because those are the cases the parser has to survive.
text2 <- c(
  "Halictus crenicornis",
  "GALICIA: 1♀, Monte do Gozo, Santiago de Compostela (La Coruña), 350 m, 5.VIII.2016, 29TNH404481. – 1♀, Río Castro, Cerdedo (Pontevedra), 20.VII.1996. – 1♀, Oca (Pontevedra), 20.VII.1996.",
  "ASTURIAS: 1♀, Raitán, Carreño (Asturias), 130 m, 30TTP72, 17.VIII.2005. – 1♀, Poreño, Villaviciosa (Asturias), 43,426443º, -5,445950º, 13.V.2015, sobre flor de Centaurea nigra, C. Guardado leg. – 1♀, Poreño, Villaviciosa (Asturias), 43,426443º, -5,445950º, 27.V.2014, M. Miñarro leg. – 1♀, Muñiz (Asturias), 14.VII.2016, sobre flor de Taraxacum, D. Luna leg.",
  "NAVARRA: 2♀, Javier (Navarra), 8.VII.2008, 450 m, 30TXN4618. – 1♀, Foz de Lumbier, río Irati (Navarra), 445 m, 30TXN3822, 8.VII.2008.",
  "ARAGÓN: 1♂, 1♀, Grañén, Los Monegros (Huesca), 30TYM1847, 19.VII.2007. – 1♂, Collado de Santa Bárbara, Torralba de los Sisones (Teruel), 1080 m, 30TXL32, 12.VII.2009, L. Castro leg.",
  "CASTILLA Y LEÓN: 1♂, 1♀, La Zarza, Sierra de Gredos (Ávila), 1200 m, 30TTK7666, 28.VII.2008. – 1♀, Río Aravalle, Sierra de Gredos (Umbrías, Ávila), 1040 m, 30TTK8168, 28.VII.2008. – 1♀, Ermita de Chilla, Sierra de Gredos, Candeleda (Ávila), 710 m, 30TUK0450, 13.VI.2009. – 1♀, Hoyo Casero, río Alberche, Sierra de Gredos (Ávila), 1215 m, 30TUK3271, 26.VII.2008. – 1♀, Junciana, Sierra de Gredos, El Barco (Ávila), 1010 m, 30TTK8376, 28.VII.2008. – 1♂, 1♀, Puerto Manzanal, Montes de León (León), 1200 m, 23.VII.1996. – 1♀, Hontanares de Eresma, Sierra de Guadarrama (Segovia), 920 m, 30TUL9838, 29.VII.2008. – 1♀, Madrona (Segovia), 960 m, 30TVL007286, 22.VIII.2010. – 1♂, Portillo (Valladolid), 6.VI.2015, sobre Rubus sp., L.O. Aguado leg.",
  "MADRID: 1♀, Corpa (Madrid), 810 m, 30TVK77, 15.VI.2008, L. Castro leg. – 1♀, Collado Mediano, Sierra de Guadarrama (Madrid), 1075 m, 30TUL1404, 27.VIII.2011.",
  "CASTILLA LA MANCHA: 1♀, Elche de la Sierra (Albacete), 700 m, 30SWH8758, 24.IV.2005. – 1♀, Balneario, Santa Cruz de Mudela (Ciudad Real), 730 m, 30SVH6179, 27.IV.2008. – 1♀, Casa Pastor, Valle de Alcudia, Sierra Madrona (Ciudad Real), 600 m, 30SUH87, 1.V.1994. – 1♀, Carboneras de Guadazaón (Cuenca), 1030 m, 30SXK0221, 16.VI.2009. – 1♀, Almodóvar del Pinar (Cuenca), 1050 m, 30SWK9104, 16.V.2009. – 1♀, Embalse de Alarcón, río Júcar, Alarcón (Cuenca), 30SWJ766800, 23.VI.2016. – 1♀, Embalse Bolarque, Sayatón (Guadalajara), 29.III.1996.",
  "COMUNIDAD VALENCIANA: 1♂, 3♀, Puente Alta, río Turia, Calles (Valencia), 350 m, 30SXJ712959, 18.VI.2011. – 1♀, Fuente Chirrichana, Cofrentes (Valencia), 630 m, 30SXJ661506, 17.VI.2011. – 2♀, La Cabezuela, Cortes de Pallas (Valencia), 700 m, 30SXJ688538, 17.VI.2011. – 3♀, Presa Contreras, río Cabriel, Villargordo (Valencia), 680 m, 30SXJ288777, 19.VI.2011. – 1♀, Ayora (Valencia), 680 m, 30SXJ6526, 17.V.2009. – 3♀, Aielo de Malferit (Valencia), 260 m, 30SYJ0905, 7.VI.2008. – 1♀, Sot de Chera (Valencia), 440 m, 30SXJ765868, 18.VI.2011. – 1♀, Fuen Vich, Requena (Valencia), 640 m, 30SXJ674578, 17.VI.2011.",
  "EXTREMADURA: 1♀, Fuente del Arco, Sierra Jayona (Badajoz), 745 m, 30STH4526, 16.V.2010. – 1♀, Puerto Peña, río Guadiana (Badajoz), 330 m, 30SUJ1134, 30.V.2009. – 2♀, Casas del Monte, Valle Ambroz (Cáceres), 625 m, 30TTK483547, 20.VI.2015. – 1♂, Nuñomoral (Cáceres), 16.VII.1972, J. Suárez leg., EEZA.",
  "MURCIA: 1♂, Las Alquerías, Sierra Espuña (Murcia), 800 m, 30SXG2587, 3.VIII.1991. – 1♀, Río Argos, Calasparra (Murcia), 502 m, 30SXH0926, 2.IV.2006. – 3♀, Peñarrubia, Sierra Cambrón (o Ponce) (Murcia), 1000 m, 30SXG1694, 8.V.1994. – 1♀, La Alberca (Murcia), 85 m, 27.II.2014, 30SXH640005, J.A. Sánchez et al. leg., IMIDA.",
  # The original literal had a stray line break inside this string (before
  # the "Peñones de San Francisco" record); it is joined here so that no
  # record carries an embedded newline.
  "ANDALUCÍA: 1♀, Benizalón, Sierra de Filabres (Almería), 780 m, 30SWG6617, 2.VI.1992. – 1♂, Macián, Vélez Blanco (Almería), 30SWG6893, 1200 m, 19.VII.2003. – 1♀, Cuevas de Almanzora (Almería), 30SXG02, 27.V-2.VI.1992, J.E. Belda leg. – 1♀, Ermita Virgen de la Cabeza, Sierra de María, María (Almería), 1375 m, 30SWG7271, 19.VII.2009. – 1♀, Berja, Sierra de Gádor, Berja (Almería), 500 m, 30SWF0679, 10.V.2006. – 1♀, El Ejido (Almería), 30SWF1870. – 1♀, Rambla Nogalte, Vélez Rubio (Almería), 900 m, 30SWG8662, 4.VIII.1991. – 1♂, Pozo de Don Juan, Vélez Blanco (Almería), 1180 m, 30SWG6987, 29.VII.2006. – 1♂, 1♀, Topares, Vélez Blanco (Almería), 1200 m, 30SWG6891, 30.VII.2006. – 1♀, Barranco de la Verruga, Sierra de Filabres, Gérgal (Almería), 1780 m, 30SWG3620, 20.IX.1991. – 1♀, Cerro Negro, Las Negras, Cabo de Gata, Nïjar (Almería), 50 m, 30SWF8982, 28.V.2005. – 1♀, Río Tavizna, Sierra de Grazalema, Benaocaz (Cádiz), , 300 m, 30STF792680, 8.VII.2011. – 1♀, Béznar (Granada), 30SVF5387, 24.V.1987. – 1♀, Válor, Sierra Nevada (Granada), 30SVF9494, 17.IV.1988. – 1♀, Trevélez, Sierra Nevada (Granada), 1550 m, 30SVF7797, 7.VIII.2004. – 1♀, Padul, Valle de Lecrín (Granada), 30SVF4496, 25.V.1985. J.A. Salas leg. – 1♀, Íllora (Granada), 3.V.1982, J.A. Cuadrado leg. – 1♀, Laroles, Sierra Nevada (Granada), 1200 m, 30SVF9897, 16.IV.1988. – 1♀, Huétor Santillán (Granada), 1300 m, 30SVG6028, 29.VI.1988. – 3♀, Laguna de Aguas Verdes, Sierra Nevada, Capileira (Granada), 3070 m, 30SVG6700, 20.VIII.2006,. – 1♀, Lavaderos de la Reina, Sierra Nevada, Güéjar-Sierra (Granada), 2500 m, 30SVG7509, 11.VIII.2006. – 1♀, Lagunas de Río Seco, Sierra Nevada, Capileira (Granada), 3045 m, 30SVG6900, 20.VIII.2006. – 2♀, Prado Negro, Sierra de Harana, Huétor Santillán (Granada), 1500 m, 30SVG5931, 13.V.2007. – 1♀, Cerro de las Pipas, Sierra Nevada, Monachil (Granada), 1400 m, 30SVG5304, 10.V.1994. – 1♀, Alto El Purche, Sierra Nevada, Monachil (Granada), 1460 m, 30SVG5709, 5.VII.1994. – 1♀, Peñones de San Francisco, Sierra Nevada (Granada), 2500 m, 30SVG6505, 1.VIII.2008. – 1♂, 1♀, Fuente la Ponderosa, Sierra de Cazorla, Pozo-Alcón (Jaén), 1365 m, 30SWG0179, 11.VII.2010. – 1♂, 3♀, Puerto de las Palomas, Sierra de Cazorla, La Iruela (Jaén), 1250 m, 30SWG0599, 9.VII.2010. – 1♂, Los Arenales, Sierra de Cazorla, Cazorla (Jaén), 1535 m, 30SWG1395, 10.VII.2010. – 1♀, Nava de San Pedro, Sierra de Cazorla, Cazorla (Jaén), 1300 m, 30SWG1093, 10.VII.2010. – 1♀, Cañada de las Fuentes, Sierra de Cazorla, Quesada (Jaén), 1455 m, 30SWG0288, 11.VII.2010. – 2♀, Riogazas, Sierra de Cazorla, Cazorla (Jaén), 1180 m, 30SWG0093, 10.VII.2010. – 1♀, Monte Cabañas, Sierra de Cazorla (Jaén), 1900 m, 30SWG0385, 27.VI.1994. – 1♀, Villargordo (Jaén), 365 m, 30SVH233024, 7.V.1993. J.M. Cañas leg. – 2♀, Barranco Canal Escalón, Sierra de Cazorla (Jaén), 1600 m, 30SWG0282, 27.VI.1994. – 1♂, Las Conejeras, Sierra de las Nieves, Parauta (Málaga), 1060 m, 30SUF1259, 7.VII.2011. – 1♂, Sierra Bermeja (Málaga), verano 2007, J. Quiñones leg. "
)
# Parse the longer dataset, keep the result, and print it for inspection.
H.crenicornis <- parse_records(text2)
H.crenicornis
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment