Last active
August 16, 2019 08:05
-
-
Save artemklevtsov/299a452617185ad1325c6b98746f342c to your computer and use it in GitHub Desktop.
Rcpp CRLF string split
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// [[Rcpp::plugins(cpp17)]] | |
#include <Rcpp.h> | |
#include <regex> | |
// Split a NUL-terminated C string on CRLF using std::regex.
//
// @param s  NUL-terminated input text.
// @return   Character vector holding every token the regex token iterator
//           yields, in order. No token is filtered out here, so empty
//           tokens are appended as well.
// [[Rcpp::export(rng=false)]]
Rcpp::CharacterVector str_split1(const char* s) {
    const std::regex crlf("\r\n");
    const char* const last = s + std::strlen(s);
    Rcpp::CharacterVector out;
    // Sub-match index -1 selects the text *between* matches, i.e. the
    // pieces separated by the delimiter rather than the delimiter itself.
    const std::cregex_token_iterator done;
    for (std::cregex_token_iterator tok(s, last, crlf, -1); tok != done; ++tok) {
        out.push_back(*tok);
    }
    return out;
}
// Split a NUL-terminated C string into lines using std::getline.
//
// Lines are delimited by '\n'; a single trailing '\r' is stripped from each
// line, so both CRLF and bare LF terminate a line. Lines that are empty
// after stripping are dropped.
//
// @param s  NUL-terminated input text.
// @return   Character vector of the non-empty lines, in order.
// [[Rcpp::export(rng=false)]]
Rcpp::CharacterVector str_split2(const char* s) {
    std::istringstream ss(s);
    std::string line;
    Rcpp::CharacterVector out;
    while (std::getline(ss, line)) {
        // getline may hand back an empty line (e.g. "a\n\nb"); looking at
        // the last character of an empty string is undefined behaviour, so
        // test for emptiness first.
        if (!line.empty() && line.back() == '\r') {
            line.pop_back();
        }
        if (line.empty()) {
            continue;
        }
        out.push_back(line);
    }
    return out;
}
// Split a std::string on CRLF with std::string::find / substr.
//
// Empty pieces (consecutive, leading or trailing CRLF) are skipped.
//
// @param s  Input text.
// @return   Character vector of the non-empty pieces, in order.
// [[Rcpp::export(rng=false)]]
Rcpp::CharacterVector str_split3(const std::string& s) {
    static const std::string sep = "\r\n";
    const std::size_t total = s.size();
    Rcpp::CharacterVector out;
    std::string::size_type start = 0;
    for (;;) {
        const std::string::size_type hit = s.find(sep, start);
        if (hit == std::string::npos) {
            break;
        }
        const std::string piece = s.substr(start, hit - start);
        start = hit + sep.size();  // resume just past the separator
        if (!piece.empty()) {
            out.push_back(piece);
        }
    }
    // Whatever follows the last separator is the final piece.
    if (start < total) {
        out.push_back(s.substr(start, total - start));
    }
    return out;
}
// Split a NUL-terminated C string on CRLF using std::string_view, so that
// pieces are located without creating intermediate std::string objects in
// this function.
//
// Empty pieces (consecutive, leading or trailing CRLF) are skipped.
//
// @param s  NUL-terminated input text.
// @return   Character vector of the non-empty pieces, in order.
// [[Rcpp::export(rng=false)]]
Rcpp::CharacterVector str_split4(const char* s) {
    static constexpr std::string_view eol = "\r\n";
    // The view computes the length itself; no separate strlen pass needed.
    const std::string_view sv(s);
    Rcpp::CharacterVector out;
    std::string_view::size_type pos = 0;
    std::string_view::size_type next;
    while ((next = sv.find(eol, pos)) != std::string_view::npos) {
        const std::string_view line = sv.substr(pos, next - pos);
        pos = next + eol.size();  // resume just past the separator
        if (line.empty()) {
            continue;
        }
        out.push_back(line);
    }
    // After the loop `next` is npos, so it must not be used to size the
    // tail; take everything from `pos` to the end instead.
    if (pos < sv.size()) {
        out.push_back(sv.substr(pos));
    }
    return out;
}
// Split a std::string on CRLF into a std::vector<std::string>.
//
// Empty pieces (consecutive, leading or trailing CRLF) are skipped.
//
// @param s  Input text.
// @return   The non-empty pieces, in order.
// [[Rcpp::export(rng=false)]]
std::vector<std::string> str_split5(const std::string& s) {
    static const std::string eol = "\r\n";
    std::vector<std::string> out;
    std::string::size_type pos = 0;
    std::string::size_type next;
    while ((next = s.find(eol, pos)) != std::string::npos) {
        // Construct the piece directly inside the vector; this avoids the
        // temporary from substr() followed by a second copy in push_back().
        if (next > pos) {
            out.emplace_back(s, pos, next - pos);
        }
        pos = next + eol.size();  // resume just past the separator
    }
    // Whatever follows the last separator is the final piece.
    if (pos < s.size()) {
        out.emplace_back(s, pos);
    }
    return out;
}
// Split a NUL-terminated C string on CRLF into a std::vector<std::string>,
// locating pieces through a std::string_view and materialising each one
// exactly once.
//
// Empty pieces (consecutive, leading or trailing CRLF) are skipped.
//
// @param s  NUL-terminated input text.
// @return   The non-empty pieces, in order.
// [[Rcpp::export(rng=false)]]
std::vector<std::string> str_split6(const char* s) {
    static constexpr std::string_view eol = "\r\n";
    // The view computes the length itself; no separate strlen pass needed.
    const std::string_view sv(s);
    std::vector<std::string> out;
    std::string_view::size_type pos = 0;
    std::string_view::size_type next;
    while ((next = sv.find(eol, pos)) != std::string_view::npos) {
        if (next > pos) {
            out.emplace_back(sv.substr(pos, next - pos));
        }
        pos = next + eol.size();  // resume just past the separator
    }
    // After the loop `next` is npos, so it must not be used to size the
    // tail; take everything from `pos` to the end instead.
    if (pos < sv.size()) {
        out.emplace_back(sv.substr(pos));
    }
    return out;
}
// Split a NUL-terminated C string into lines using std::getline, returning
// a std::vector<std::string>.
//
// Lines are delimited by '\n'; a single trailing '\r' is stripped from each
// line, so both CRLF and bare LF terminate a line. Lines that are empty
// after stripping are dropped.
//
// @param s  NUL-terminated input text.
// @return   The non-empty lines, in order.
// [[Rcpp::export(rng=false)]]
std::vector<std::string> str_split7(const char* s) {
    std::istringstream ss(s);
    std::string line;
    std::vector<std::string> out;
    while (std::getline(ss, line)) {
        // getline may hand back an empty line (e.g. "a\n\nb"); looking at
        // the last character of an empty string is undefined behaviour, so
        // test for emptiness first.
        if (!line.empty() && line.back() == '\r') {
            line.pop_back();
        }
        if (line.empty()) {
            continue;
        }
        out.push_back(line);
    }
    return out;
}
/***R
# Benchmark input: `n` numbered records joined into one CRLF-separated string.
n <- 200
x <- paste(seq_len(n), rep("TEST TEST TEST", n), collapse = "\r\n")

# Compare base R strsplit() (third argument = fixed, fourth = perl)
# against the seven C++ splitters defined in this file.
bench::mark(
  strsplit(x, "\r\n", TRUE)[[1L]],
  strsplit(x, "\r\n", FALSE)[[1L]],
  strsplit(x, "\r\n", FALSE, TRUE)[[1L]],
  str_split1(x),
  str_split2(x),
  str_split3(x),
  str_split4(x),
  str_split5(x),
  str_split6(x),
  str_split7(x),
  iterations = 5000
)
*/
Author
artemklevtsov
commented
Aug 16, 2019
•
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment