Created
May 29, 2018 01:41
-
-
Save tiernanmartin/54f6d839dd8de8f75fd966e8e5b65bad to your computer and use it in GitHub Desktop.
Targeting Unicode Characters with Regular Expressions
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# SETUP ----
# The tidyverse meta-package attaches readr (read_csv), dplyr, stringr and
# the %>% pipe, all of which this script relies on.
library(tidyverse)

# LOAD DATA ----
# Download the CSV straight from the King County open-data endpoint.
# Requires network access; read_csv() guesses the column types.
csv_url <- "https://data.kingcounty.gov/resource/es38-6nrz.csv"
agencies <- read_csv(csv_url)
# CLEAN DATA ----

# First approach (didn't work)
# Kept as a record of the failed attempt. stringr treats the pattern as a
# regular expression, so `U+` means "one or more U" rather than a literal
# "U+"; the pattern therefore cannot match the text "<U+200B>", let alone
# the zero-width space itself.
# NOTE(review): "<U+200B>" is presumably just how R prints the character on
# some platforms, not text that is actually stored in the data.
zwsp_pattern_first <- "<U+200B>"

# Only character columns are touched so other column types are preserved
# (the scoped mutate_all() variant coerced every column to character).
agencies %>%
  mutate(
    across(
      where(is.character),
      ~ str_replace_all(.x, zwsp_pattern_first, "")
    )
  ) %>%
  slice_head(n = 5) %>%
  as.data.frame()
# Second approach: convert all UTF-8 to ASCII (worked)
# iconv() with sub = "" drops every byte that has no ASCII equivalent, so
# this removes the zero-width space but also strips ANY other non-ASCII
# character (accents, typographic quotes, ...). Use only when that loss is
# acceptable.
# Only character columns are converted so other column types are preserved.
agencies %>%
  mutate(
    across(
      where(is.character),
      ~ iconv(.x, "utf-8", "ascii", sub = "")
    )
  ) %>%
  slice_head(n = 5) %>%
  as.data.frame()
# Third approach: remove only the zero-width space (U+200B)
# "\\u200b" reaches the regex engine as \u200b, the escape for the code
# point U+200B, so the actual character is matched and every other
# non-ASCII character is left intact.
zwsp_pattern_second <- "\\u200b"

# Only character columns are touched so other column types are preserved.
agencies %>%
  mutate(
    across(
      where(is.character),
      ~ str_replace_all(.x, zwsp_pattern_second, "")
    )
  ) %>%
  slice_head(n = 5) %>%
  as.data.frame()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment