Last active
October 23, 2017 18:59
-
-
Save briatte/a665f6bb08439e9d0412 to your computer and use it in GitHub Desktop.
manipulate full names with stringr
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(stringr) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(rvest) | |
test = read_html("http://www.parlament.gv.at/WWER/NR") %>% | |
html_node(xpath = "//a[contains(text(), 'anzeigen')]") %>% | |
html_attr("href") %>% | |
paste0("http://www.parlament.gv.at", .) %>% | |
read_html %>% | |
html_nodes(".tabelle td a") %>% | |
html_text |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#' Clean spaces | |
str_space <- function(x) { | |
gsub("(\\\\r|\\\\n|\\n|\\\\t|\\s)+", " ", x) | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#' Adjust punctuation | |
#' @examples | |
#' str_punct("a, b, c") # not modified | |
#' str_punct("a,,b .c") # single comma, fixed spacing before period | |
#' str_punct("!a--bc;") # single dash, no punctation at start/end | |
str_punct <- function(x, preserve = c(",", "'", "-", "\\."), | |
space = c(",", ";", "\\.", "?", "!")) { | |
# punctation | |
p = c("!", "\\", "#", "$", "%", "&", "'", "\\(", "\\)", "\\*", "\\+", ",", | |
"-", "\\.", "/", ":", ";", "<", "=", ">", "?", "@", "\\[" , "\\\\", | |
"\\]", "\\^", "_", "`", "\\{", "\\|", "\\}", "~", "\\.") | |
# single punctation marks | |
for (i in p[ !p %in% preserve ]) { | |
x = gsub(paste0("(\\s*", i, "\\s*)+"), "", x) | |
} | |
# add missing spaces for some reserved marks | |
for (i in space[ space %in% preserve ]) { | |
x = gsub(paste0("(\\s*", i, "\\s*)+"), paste0(i, " "), x) | |
} | |
# no space before other preserved characters | |
for(i in preserve[ !preserve %in% space ]) { | |
x = gsub(paste0("\\s*", i, "+\\s*"), i, x) | |
} | |
# no start/end punctuation | |
x = gsub("^(\\s|[[:punct:]])+|(\\s|[[:punct:]])+$", "", x) | |
x | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#' Remove prefixes or suffixes | |
#' @param sep character separator, defaults to "," | |
#' @param side where to look, "left" (default) or "right" | |
#' @param greedy defaults to TRUE | |
#' @examples | |
#' ex = c("a", "a, b", "a, b, c", "a, b, c, d") | |
#' str_filter(ex, side = "left" , greedy = TRUE) | |
#' str_filter(ex, side = "right" , greedy = TRUE) | |
#' str_filter(ex, side = "left" , greedy = FALSE) | |
#' str_filter(ex, side = "right" , greedy = FALSE) | |
str_filter <- function(x, sep = ",", side = "left", greedy = TRUE) { | |
gsub(switch(side, | |
left = c(ifelse(greedy, "", "?"), ")", sep, "\\s*"), | |
right = c(ifelse(greedy, "?", ""), ")\\s*", sep)) %>% | |
c("(.*", ., "(.*)") %>% | |
paste0(collapse = ""), | |
switch(side, left = "\\2", right = "\\1"), x) | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
x = "A. B. Jon Example, C, D" | |
str_filter(x, sep = "\\.", side ="left", greedy = FALSE) | |
# [1] "B. Jon Example, C, D" | |
str_filter(x, sep = "\\.", side ="left", greedy = TRUE) | |
# [1] "Jon Example, C, D" | |
str_filter(x, sep = ",", side ="right", greedy = FALSE) | |
# [1] "A. B. Jon Example, C" | |
str_filter(x, sep = ",", side ="right", greedy = TRUE) | |
# [1] "A. B. Jon Example" |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#' Detach prefixes or suffixes | |
#' @param ... arguments to \code{str_filter} | |
#' @examples | |
#' ex = c("a", "a, b", "a, b, c", "a, b, c, d") | |
#' str_detach(ex, side = "left" , greedy = TRUE) | |
#' str_detach(ex, side = "right" , greedy = TRUE) | |
#' str_detach(ex, side = "left" , greedy = FALSE) | |
#' str_detach(ex, side = "right" , greedy = FALSE) | |
str_detach <- function(x, sep = ",", side = "left", greedy = TRUE) { | |
y = str_filter(x, sep, side, greedy) %>% sapply(nchar) | |
x[ y > 0 ] = lapply(x[ y > 0 ], | |
function(x, regex = str_filter(x, sep, side, greedy)) { | |
y = c( | |
switch(side, | |
left = str_replace(x, regex, "") %>% | |
str_replace(str_c(sep, "\\s*$"), ""), | |
right = NULL), | |
regex, | |
switch(side, | |
left = NULL, | |
right = str_replace(x, regex, "") %>% | |
str_replace(str_c("^", sep, "\\s*"), "")) | |
) | |
ifelse(!nchar(y), NA, y) | |
}) | |
sapply(x, function(x){ x[ switch(side, left = 1, right = 2 )]}) | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
x = "A. B. Jon Example, C, D" | |
str_detach(x, sep = "\\.", side ="left", greedy = FALSE) | |
# [1] "A" | |
str_detach(x, sep = "\\.", side ="left", greedy = TRUE) | |
# [1] "A. B" | |
str_detach(x, sep = ",", side ="right", greedy = FALSE) | |
# [1] "D" | |
str_detach(x, sep = ",", side ="right", greedy = TRUE) | |
# [1] "C, D" | |
str_detach("Jon Example", side = "left") | |
# [1] NA | |
str_detach("Jon Example", side = "right") | |
# [1] NA |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
str_clean <- function(x) { | |
x = str_space(x) %>% | |
str_punct | |
data.frame( | |
prefix = str_filter(x, side = "right", sep = ",") %>% | |
str_detach(side = "left", sep = "\\.") %>% | |
str_punct(preserve = c(",", "-")), | |
name = str_filter(x, side = "right", sep = ",") %>% | |
str_filter(side = "left", sep = "\\."), | |
suffix = str_detach(x, sep = ",", side = "right") %>% | |
str_replace("\\.\\s", ", ") %>% | |
str_punct(preserve = c(",", "-")), | |
stringsAsFactors = FALSE | |
) | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(dplyr) | |
test = test[ -1:-3] %>% | |
cbind(original = ., str_clean(.)) %>% | |
# move prefix to name (row #73) | |
mutate(name = ifelse(name == "", prefix, name), | |
prefix = ifelse(prefix == name, NA, prefix)) %>% | |
# drop empty prefix column | |
select(-prefix) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#' Put first word last | |
str_invert <- function(x, size = 1) { | |
sapply(x, function(x) { | |
x = str_split(x, "\\s") %>% unlist | |
str_c(str_c(x[ -size ], collapse = " "), x[ size ], sep = " ") | |
}) | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
subset(data.frame(from = test$name, | |
to = str_replace(test$name, "El\\s", "El_") %>% | |
str_invert %>% | |
str_replace("_", " ")), | |
str_count(to, "\\s") > 1) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
test = select(-original) %>% | |
# invert names, protecting a prefix | |
mutate(test, name = str_replace(name, "El\\s", "El_") %>% | |
str_invert %>% | |
str_replace("_", " ")) %>% | |
# find and number duplicate names | |
group_by(name) %>% | |
mutate(n = n(), o = 1:n()) %>% | |
group_by() %>% | |
mutate(name = ifelse(n > 1, str_c(name, "-", o), name)) %>% | |
select(-n, -o) | |
# sanity check: no duplicates | |
rownames(test) = test$name | |
# or, alternately | |
stopifnot(!duplicated(test$name)) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# test = c("Doris Bures", "Karlheinz Kopf", "Ing. Norbert Hofer", "\r\n\t\t\r\nAlm Nikolaus, Mag.\r\n", | |
# "\r\n\t\t\r\nAmon Werner, MBA\r\n", "\r\n\t\t\r\nAngerer Erwin\r\n", | |
# "\r\n\t\t\r\nAntoni Konrad\r\n", "\r\n\t\t\r\nAslan Aygül Berivan, Mag.\r\n", | |
# "\r\n\t\t\r\nAubauer Gertrude, Mag.\r\n", "\r\n\t\t\r\nAuer Jakob\r\n", | |
# "\r\n\t\t\r\nBacher Walter\r\n", "\r\n\t\t\r\nBayr Petra, MA\r\n", | |
# "\r\n\t\t\r\nBecher Ruth, Mag.\r\n", "\r\n\t\t\r\nBelakowitsch-Jenewein Dagmar, Dr.\r\n", | |
# "\r\n\t\t\r\nBerlakovich Nikolaus, Dipl.-Ing.\r\n", "\r\n\t\t\r\nBösch Reinhard Eugen, Dr.\r\n", | |
# "\r\n\t\t\r\nBrosz Dieter, MSc\r\n", "\r\n\t\t\r\nBrückl Hermann\r\n", | |
# "\r\n\t\t\r\nBrunner Christiane, Mag.\r\n", "\r\n\t\t\r\nBuchmayr Harry\r\n", | |
# "\r\n\t\t\r\nBures Doris\r\n", "\r\n\t\t\r\nCap Josef, Dr.\r\n", | |
# "\r\n\t\t\r\nDarmann Gernot, Mag.\r\n", "\r\n\t\t\r\nDeimek Gerhard, Dipl.-Ing.\r\n", | |
# "\r\n\t\t\r\nDiesner-Wais Martina\r\n", "\r\n\t\t\r\nDietrich Waltraud, Ing.\r\n", | |
# "\r\n\t\t\r\nDoppler Rupert\r\n", "\r\n\t\t\r\nDurchschlag Claudia\r\n", | |
# "\r\n\t\t\r\nEcker Cornelia\r\n", "\r\n\t\t\r\nEhmann Michael\r\n", | |
# "\r\n\t\t\r\nEl Habbassi Asdin, BA\r\n", "\r\n\t\t\r\nErtlschweiger Rouven, MSc\r\n", | |
# "\r\n\t\t\r\nEßl Franz Leonhard\r\n", "\r\n\t\t\r\nFazekas Hannes\r\n", | |
# "\r\n\t\t\r\nFeichtinger Klaus Uwe, Mag. Dr.\r\n", "\r\n\t\t\r\nFekter Maria Theresia, Mag. Dr.\r\n", | |
# "\r\n\t\t\r\nFichtinger Angela\r\n", "\r\n\t\t\r\nFranz Marcus, Dr.\r\n", | |
# "\r\n\t\t\r\nFuchs Hubert, MMag. DDr.\r\n", "\r\n\t\t\r\nGahr Hermann\r\n", | |
# "\r\n\t\t\r\nGamon Claudia Angela, MSc (WU)\r\n", "\r\n\t\t\r\nGerstl Wolfgang, Mag.\r\n", | |
# "\r\n\t\t\r\nGessl-Ranftl Andrea\r\n", "\r\n\t\t\r\nGlawischnig-Piesczek Eva, Dr.\r\n", | |
# "\r\n\t\t\r\nGreiner Karin, Mag.\r\n", "\r\n\t\t\r\nGrillitsch Fritz\r\n", | |
# "\r\n\t\t\r\nGroiß Werner, Ing. Mag.\r\n", "\r\n\t\t\r\nGrossmann Elisabeth, Mag.\r\n", | |
# "\r\n\t\t\r\nGusenbauer-Jäger Marianne\r\n", "\r\n\t\t\r\nHable Rainer, Dr.\r\n", | |
# "\r\n\t\t\r\nHackl Heinz-Peter, Ing.\r\n", "\r\n\t\t\r\nHafenecker Christian, MA\r\n", | |
# "\r\n\t\t\r\nHagen Christoph\r\n", "\r\n\t\t\r\nHaider Roman, Mag.\r\n", | |
# "\r\n\t\t\r\nHakel Elisabeth\r\n", "\r\n\t\t\r\nHammer Michael, Mag.\r\n", | |
# "\r\n\t\t\r\nHanger Andreas, Mag.\r\n", "\r\n\t\t\r\nHaubner Peter\r\n", | |
# "\r\n\t\t\r\nHauser Gerald, Mag.\r\n", "\r\n\t\t\r\nHechtl Johann\r\n", | |
# "\r\n\t\t\r\nHeinzl Anton\r\n", "\r\n\t\t\r\nHell Johann\r\n", | |
# "\r\n\t\t\r\nHimmelbauer Eva-Maria, BSc\r\n", "\r\n\t\t\r\nHöbart Christian, Ing.\r\n", | |
# "\r\n\t\t\r\nHofer Norbert, Ing.\r\n", "\r\n\t\t\r\nHofinger Manfred, Ing.\r\n", | |
# "\r\n\t\t\r\nHöfinger Johann\r\n", "\r\n\t\t\r\nHolzinger-Vogtenhuber Daniela, BA\r\n", | |
# "\r\n\t\t\r\nHuainigg Franz-Joseph, Dr.\r\n", "\r\n\t\t\r\nHübner Johannes, Dr.\r\n", | |
# "\r\n\t\t\r\nJank Brigitte\r\n", "\r\n\t\t\r\nJannach Harald\r\n", | |
# "\r\n\t\t\r\nJarmer Helene, Mag.\r\n", "\r\n\t\t\r\nJarolim Johannes, Dr.\r\n", | |
# "\r\n\t\t\r\nKarl Beatrix, Mag. Dr.\r\n", "\r\n\t\t\r\nKarlsböck Andreas F., Dr.\r\n", | |
# "\r\n\t\t\r\nKassegger Axel, MMMag. Dr.\r\n", "\r\n\t\t\r\nKatzian Wolfgang\r\n", | |
# "\r\n\t\t\r\nKeck Dietmar\r\n", "\r\n\t\t\r\nKickl Herbert\r\n", | |
# "\r\n\t\t\r\nKirchgatterer Franz\r\n", "\r\n\t\t\r\nKitzmüller Anneliese\r\n", | |
# "\r\n\t\t\r\nKnes Wolfgang\r\n", "\r\n\t\t\r\nKöchl Matthias\r\n", | |
# "\r\n\t\t\r\nKogler Werner, Mag.\r\n", "\r\n\t\t\r\nKönigsberger-Ludwig Ulrike\r\n", | |
# "\r\n\t\t\r\nKopf Karlheinz\r\n", "\r\n\t\t\r\nKorun Alev, Mag.\r\n", | |
# "\r\n\t\t\r\nKrainer Kai Jan\r\n", "\r\n\t\t\r\nKrist Hermann\r\n", | |
# "\r\n\t\t\r\nKucharowits Katharina\r\n", "\r\n\t\t\r\nKucher Philip\r\n", | |
# "\r\n\t\t\r\nKumpitsch Günther, Mag.\r\n", "\r\n\t\t\r\nKuntzl Andrea, Mag.\r\n", | |
# "\r\n\t\t\r\nLausch Christian\r\n", "\r\n\t\t\r\nLettenbichler Josef, Mag.\r\n", | |
# "\r\n\t\t\r\nLichtenecker Ruperta, Dr.\r\n", "\r\n\t\t\r\nLintl Jessi, Dr.\r\n", | |
# "\r\n\t\t\r\nLipitsch Hermann\r\n", "\r\n\t\t\r\nLoacker Gerald, Mag.\r\n", | |
# "\r\n\t\t\r\nLopatka Reinhold, Dr.\r\n", "\r\n\t\t\r\nLueger Angela\r\n", | |
# "\r\n\t\t\r\nLugar Robert, Ing.\r\n", "\r\n\t\t\r\nMatznetter Christoph, Dr.\r\n", | |
# "\r\n\t\t\r\nMaurer Sigrid\r\n", "\r\n\t\t\r\nMayer Elmar\r\n", | |
# "\r\n\t\t\r\nMölzer Wendelin\r\n", "\r\n\t\t\r\nMoser Gabriela, Dr.\r\n", | |
# "\r\n\t\t\r\nMuchitsch Josef\r\n", "\r\n\t\t\r\nMückstein Eva, Dr.\r\n", | |
# "\r\n\t\t\r\nMühlberghuber Edith\r\n", "\r\n\t\t\r\nMusiol Daniela, Mag.\r\n", | |
# "\r\n\t\t\r\nMuttonen Christine, Mag.\r\n", "\r\n\t\t\r\nNachbaur Kathrin, Dr.\r\n", | |
# "\r\n\t\t\r\nNeubauer Werner\r\n", "\r\n\t\t\r\nObernosterer Gabriel\r\n", | |
# "\r\n\t\t\r\nOfenauer Friedrich, Mag.\r\n", "\r\n\t\t\r\nOttenschläger Andreas\r\n", | |
# "\r\n\t\t\r\nPendl Otto\r\n", "\r\n\t\t\r\nPfurtscheller Elisabeth, Dipl.-Kffr. (FH)\r\n", | |
# "\r\n\t\t\r\nPilz Peter, Dr.\r\n", "\r\n\t\t\r\nPirklhuber Wolfgang, Dipl.-Ing. Dr.\r\n", | |
# "\r\n\t\t\r\nPlessl Rudolf\r\n", "\r\n\t\t\r\nPock Michael\r\n", | |
# "\r\n\t\t\r\nPreiner Erwin\r\n", "\r\n\t\t\r\nPrinz Nikolaus\r\n", | |
# "\r\n\t\t\r\nRädler Johann\r\n", "\r\n\t\t\r\nRasinger Erwin, Dr.\r\n", | |
# "\r\n\t\t\r\nRauch Johannes, Mag.\r\n", "\r\n\t\t\r\nRauch Walter\r\n", | |
# "\r\n\t\t\r\nRiemer Josef A.\r\n", "\r\n\t\t\r\nRosenkranz Walter, Dr.\r\n", | |
# "\r\n\t\t\r\nRosenkranz Barbara\r\n", "\r\n\t\t\r\nRossmann Bruno, Mag.\r\n", | |
# "\r\n\t\t\r\nSchabhüttl Jürgen\r\n", "\r\n\t\t\r\nSchatz Birgit, Mag.\r\n", | |
# "\r\n\t\t\r\nSchellenbacher Thomas, Ing.\r\n", "\r\n\t\t\r\nSchellhorn Josef\r\n", | |
# "\r\n\t\t\r\nSchenk Martina\r\n", "\r\n\t\t\r\nScherak Nikolaus, Dr.\r\n", | |
# "\r\n\t\t\r\nSchieder Andreas, Mag.\r\n", "\r\n\t\t\r\nSchimanek Carmen\r\n", | |
# "\r\n\t\t\r\nSchittenhelm Dorothea\r\n", "\r\n\t\t\r\nSchmid Julian, BA\r\n", | |
# "\r\n\t\t\r\nSchmid Gerhard\r\n", "\r\n\t\t\r\nSchmuckenschlager Johannes\r\n", | |
# "\r\n\t\t\r\nSchönegger Bernd, Mag.\r\n", "\r\n\t\t\r\nSchopf Walter\r\n", | |
# "\r\n\t\t\r\nSchrangl Philipp, Mag.\r\n", "\r\n\t\t\r\nSchultes Hermann, Ing.\r\n", | |
# "\r\n\t\t\r\nSchwentner Judith, Mag.\r\n", "\r\n\t\t\r\nSieber Norbert\r\n", | |
# "\r\n\t\t\r\nSinger Johann\r\n", "\r\n\t\t\r\nSpindelberger Erwin\r\n", | |
# "\r\n\t\t\r\nStefan Harald, Mag.\r\n", "\r\n\t\t\r\nSteger Petra\r\n", | |
# "\r\n\t\t\r\nSteinacker Michaela, Mag.\r\n", "\r\n\t\t\r\nSteinbichler Leopold\r\n", | |
# "\r\n\t\t\r\nSteinhauser Albert, Mag.\r\n", "\r\n\t\t\r\nStrache Heinz-Christian\r\n", | |
# "\r\n\t\t\r\nStrasser Georg, Dipl.-Ing.\r\n", "\r\n\t\t\r\nStrolz Matthias, Mag. Dr.\r\n", | |
# "\r\n\t\t\r\nTamandl Gabriele\r\n", "\r\n\t\t\r\nThemessl Bernhard\r\n", | |
# "\r\n\t\t\r\nTöchterle Karlheinz, Dr.\r\n", "\r\n\t\t\r\nTroch Harald, Dr.\r\n", | |
# "\r\n\t\t\r\nUnterrainer Maximilian, Mag.\r\n", "\r\n\t\t\r\nVavrik Christoph, Mag.\r\n", | |
# "\r\n\t\t\r\nVetter Georg, Dr.\r\n", "\r\n\t\t\r\nVogl Markus, Ing.\r\n", | |
# "\r\n\t\t\r\nWalser Harald, Dr.\r\n", "\r\n\t\t\r\nWeigerstorfer Ulrike\r\n", | |
# "\r\n\t\t\r\nWeninger Hannes\r\n", "\r\n\t\t\r\nWilli Georg\r\n", | |
# "\r\n\t\t\r\nWimmer Rainer\r\n", "\r\n\t\t\r\nWindbüchler-Souschill Tanja\r\n", | |
# "\r\n\t\t\r\nWinter Susanne, Dr.\r\n", "\r\n\t\t\r\nWinzig Angelika, Dr.\r\n", | |
# "\r\n\t\t\r\nWittmann Peter, Dr.\r\n", "\r\n\t\t\r\nWöginger August\r\n", | |
# "\r\n\t\t\r\nWurm Gisela, Mag.\r\n", "\r\n\t\t\r\nWurm Peter\r\n", | |
# "\r\n\t\t\r\nYilmaz Nurten\r\n", "\r\n\t\t\r\nZakostelsky Andreas, Mag.\r\n", | |
# "\r\n\t\t\r\nZanger Wolfgang\r\n", "\r\n\t\t\r\nZinggl Wolfgang, Mag. Dr.\r\n" | |
# ) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment