Scripts
# --------------------------------------------------------------------------
akita <- jisx0402::municipality |>
  dplyr::filter(
    pref_code == "05",
    is.na(end_date),
    !stringr::str_ends(name, "郡")
  ) |>
  dplyr::mutate(
    muni_code = paste0(pref_code, city_code),
    muni_code = paste0(muni_code, jisx0402::check_digit(muni_code)),
    start_year = lubridate::year(start_date)
  ) |>
  dplyr::select(muni_code, name, start_year) |>
  dplyr::arrange(desc(start_year))

year <- max(akita$start_year, na.rm = TRUE)

dat <- arrow::read_parquet("~/Downloads/data/jpop.parquet") |>
  dplyr::filter(.data$year >= .env$year) |>
  dplyr::right_join(akita, by = c("code" = "muni_code"))

require(ggplot2)

dat |>
  dplyr::filter(name == "秋田市", age == "all") |>
  ggplot(aes(year, population)) +
  geom_line(aes(colour = sex)) +
  scale_y_log10()

ga <- jisx0402::jptopography("designated") |>
  dplyr::filter(stringr::str_starts(muni_code, "05")) |>
  rlang::as_function(
    ~ dplyr::mutate(.,
      muni_code = paste0(muni_code, jisx0402::check_digit(muni_code)),
      area = units::set_units(sf::st_area(.), km^2)
    )
  )() |>
  dplyr::right_join(
    dplyr::filter(dat, age == "all"),
    by = c("muni_code" = "code")
  ) |>
  dplyr::group_by(sex) |>
  dplyr::mutate(
    denst = round(population / area),
    rank = dplyr::percent_rank(denst) * 10
  ) |>
  dplyr::ungroup() |>
  ggplot() +
  geom_sf(aes(fill = rank), na.rm = TRUE, show.legend = FALSE) +
  facet_wrap(~ sex) +
  labs(
    title = "Rank of Population Density in Akita, throughout 2006-2023",
    subtitle = "Year: {frame_time}",
    caption = paste(
      "地図データ:「国土数値情報 行政区域データ」(国土交通省)",
      "https://nlftp.mlit.go.jp/ksj/gml/datalist/KsjTmplt-N03-v3_0.html を加工して作成",
      sep = "\n"
    )
  ) +
  theme_light() +
  gganimate::transition_time(year)

gganimate::animate(ga, renderer = gganimate::ffmpeg_renderer())
gganimate::anim_save("akita-denst1.mp4", path = "Videos")
# --------------------------------------------------------------------------
tokyo <- jisx0402::municipality |>
  dplyr::filter(
    pref_code == "13",
    is.na(end_date),
    !stringr::str_ends(name, "郡|(特別区)|(支庁)")
  ) |>
  dplyr::mutate(
    muni_code = paste0(pref_code, city_code),
    muni_code = paste0(muni_code, jisx0402::check_digit(muni_code)),
    start_year = lubridate::year(start_date)
  ) |>
  dplyr::select(muni_code, name, start_year) |>
  dplyr::arrange(desc(start_year))

year <- max(tokyo$start_year, na.rm = TRUE)

dat <- arrow::read_parquet("~/Downloads/data/jpop.parquet") |>
  dplyr::filter(.data$year >= .env$year) |>
  dplyr::right_join(tokyo, by = c("code" = "muni_code"))

require(ggplot2)

dat |>
  dplyr::filter(
    age == "all", stringr::str_detect(name, "区"),
    year > 2005L
  ) |>
  ggplot(aes(year, population)) +
  geom_line(aes(colour = sex)) +
  facet_wrap(~ name) +
  scale_y_log10()

ga <- jisx0402::jptopography("all", resolution = 1) |>
  dplyr::filter(stringr::str_starts(muni_code, "13")) |>
  rlang::as_function(
    ~ dplyr::mutate(.,
      muni_code = paste0(muni_code, jisx0402::check_digit(muni_code)),
      area = units::set_units(sf::st_area(.), km^2)
    )
  )() |>
  dplyr::right_join(
    dplyr::filter(dat, age == "all", stringr::str_detect(name, "区")),
    by = c("muni_code" = "code")
  ) |>
  dplyr::group_by(sex) |>
  dplyr::mutate(
    denst = round(population / area),
    rank = dplyr::percent_rank(denst) * 10
  ) |>
  dplyr::ungroup() |>
  ggplot() +
  geom_sf(aes(fill = rank), na.rm = TRUE, show.legend = FALSE) +
  facet_wrap(~ sex) +
  labs(
    title = "Rank of Population Density in Tokyo Special Wards, throughout 2006-2023",
    subtitle = "Year: {frame_time}",
    caption = paste(
      "地図データ:「国土数値情報 行政区域データ」(国土交通省)",
      "https://nlftp.mlit.go.jp/ksj/gml/datalist/KsjTmplt-N03-v3_0.html を加工して作成",
      sep = "\n"
    )
  ) +
  theme_light() +
  gganimate::transition_time(year, range = c(2006L, 2023L))

gganimate::animate(ga, renderer = gganimate::ffmpeg_renderer())
gganimate::anim_save("tokyo23-denst1.mp4", path = "Videos")
# --------------------------------------------------------------------------
#' @export
accelerate <- function(text = "精神を加速させろ") {
  r"(
∩_∩
/ \ /\
|(゚)=(゚)|
| ●_● |
/ ヽ
r⌒| 〃 ------ ヾ
/ i/ |_二__ノ
./ / / ) {text}
./ / / //
/ ./ / ̄
.ヽ、__./ / ⌒ヽ
r /
/ ノ
/ / /
./ // /
/. ./ ./ /
i / ./ /
i ./ .ノ.^/
i ./ |_/
i /
/ /
(_/
)" |> stringi::stri_replace_all_fixed(" ", " ") |> glue::glue()
}

#' @export
qb <- function(text = "わけがわからないよ") {
  r"(
|\ /|
|\\ //|
: ,> `´ ̄`´ < ′
. V V
. i< ● ● >i
八 、_,_, 八 {text}
. / 个 . _ _ . 个 ',
_/ il ,' '. li ',__
)" |> stringi::stri_replace_all_fixed(" ", " ") |> glue::glue()
}
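# Usage sketch (editor's addition): both helpers return a glue string, so
# calling them at the console prints the ASCII art with `text` interpolated:
# accelerate()
# qb()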
# --------------------------------------------------------------------------
# get_lyrics.R -----
#' Scrape lyrics from list
#'
#' @param df A tibble returned by \code{get_lyrics_list}.
#' @param file String; file name to append lyrics to.
#' @param links String; column name of lyrics links.
#' @returns `file` is returned invisibly.
#' @export
#' @examples
#' \dontrun{
#' csv_file <-
#'   aznyan::get_lyrics_list("23729") |>
#'   aznyan::get_lyrics("23729.csv")
#' tbl <-
#'   readr::read_csv(csv_file, col_names = FALSE, col_types = "cccc___cDn") |>
#'   dplyr::rename(
#'     title = X1,
#'     artist = X2,
#'     lyricist = X3,
#'     composer = X4,
#'     text = X8,
#'     released = X9,
#'     page_view = X10
#'   )
#' }
get_lyrics <- function(df, file, links = "link") {
  base_url <- "https://www.uta-net.com"
  links <- dplyr::pull(df, {{ links }})
  url <- paste(base_url, links, sep = "/")
  session <- polite::bow(base_url, force = FALSE)
  purrr::iwalk(url, function(q, itr) {
    html <- session %>%
      polite::nod(q) %>%
      polite::scrape()
    lyric_body <- html %>%
      rvest::html_element("#kashi_area") %>%
      rvest::html_text2()
    info <- html %>%
      rvest::html_element(".song-infoboard") %>%
      rvest::html_element(".detail") %>%
      rvest::html_text() %>%
      stringr::str_split("\\n") %>%
      unlist() %>%
      purrr::pluck(4) %>%
      stringr::str_extract_all(pattern = "[\\d,/]+") %>%
      unlist()
    data.frame(
      df[itr, ],
      lyric = lyric_body,
      released = info[1],
      page_viewed = info[2]
    ) %>%
      dplyr::mutate(
        released = lubridate::as_date(.data$released),
        page_viewed = stringr::str_remove_all(.data$page_viewed, ",") %>%
          unlist() %>%
          as.numeric()
      ) %>%
      readr::write_csv(file, append = TRUE, progress = FALSE)
  })
  invisible(file)
}

# lyrics_list.R -----
#' Scrape table of lyrics list
#'
#' @param id String; the xxx part of 'https://www.uta-net.com/:type:/xxx/'.
#' @param type String; one of "artist", "lyricist", or "composer".
#' @returns A tibble.
#' @export
get_lyrics_list <- function(id,
                            type = c("artist", "lyricist", "composer")) {
  base_url <- "https://www.uta-net.com"
  type <- rlang::arg_match(type, c("artist", "lyricist", "composer"))
  url <- paste(base_url, type, id, "", sep = "/")
  session <-
    polite::bow(base_url, force = FALSE) %>%
    polite::nod(url)
  html <- session %>%
    polite::scrape()
  page_list <- html %>%
    rvest::html_element(".songlist-table-block") %>%
    rvest::html_element("tfoot") %>%
    rvest::html_text() %>%
    stringr::str_extract("([:number:]+)")
  purrr::map_dfr(seq.int(as.integer(page_list)), function(i) {
    html <- session %>%
      polite::nod(path = paste(url, "0", as.character(i), "", sep = "/")) %>%
      polite::scrape()
    tables <- html %>%
      rvest::html_elements(".songlist-table-block") %>%
      rvest::html_elements("table")
    df <- tables %>%
      rvest::html_table() %>%
      purrr::map_dfr(~ stats::na.omit(.))
    titles <- html %>%
      rvest::html_elements(".songlist-table-block") %>%
      rvest::html_elements(".songlist-title") %>%
      rvest::html_text()
    links <- tables %>%
      rvest::html_elements(".sp-w-100") %>%
      rvest::html_elements("a") %>%
      rvest::html_attr("href") %>%
      purrr::discard(~ . %in% c("https://www.uta-net.com/ranking/total/"))
    df %>%
      dplyr::slice_head(n = nrow(df) - 1) %>%
      dplyr::rename(
        text_lab = "\u66f2\u540d",
        artist = "\u6b4c\u624b\u540d",
        lyricist = "\u4f5c\u8a5e\u8005\u540d",
        composer = "\u4f5c\u66f2\u8005\u540d",
        leading = "\u6b4c\u3044\u51fa\u3057"
      ) %>%
      dplyr::bind_cols(
        data.frame(title = titles, link = links, source_page = i)
      ) %>%
      dplyr::select(
        .data$title,
        .data$artist,
        .data$lyricist,
        .data$composer,
        .data$leading,
        .data$link,
        .data$source_page
      )
  })
}

#' Search lyrics list by keyword
#'
#' @param keyword String; search phrase.
#' @param sort String; one of "new", "popular", "title", or "artist".
#' @returns A tibble.
#' @export
search_lyrics_list <- function(keyword,
                               sort = c("new", "popular", "title", "artist")) {
  base_url <- "https://www.uta-net.com"
  sort <- rlang::arg_match(sort, c("new", "popular", "title", "artist"))
  sort <- dplyr::case_when(
    sort == "new" ~ 6,
    sort == "popular" ~ 4,
    sort == "title" ~ 1,
    sort == "artist" ~ 7,
    TRUE ~ 1
  )
  url <- paste(base_url, "search", "", sep = "/")
  session <-
    polite::bow(base_url, force = FALSE) %>%
    polite::nod(url)
  html <- session %>%
    polite::scrape(query = list(
      Keyword = stringr::str_trim(keyword),
      Aselect = "2",
      Bselect = "3",
      sort = sort
    ))
  page_list <- html %>%
    rvest::html_element("#songlist-sort-paging") %>%
    rvest::html_text() %>%
    stringr::str_extract("([:number:]+)")
  purrr::map_dfr(seq.int(as.integer(page_list)), function(i) {
    html <- session %>%
      polite::scrape(query = list(
        Keyword = enc2utf8(keyword),
        Aselect = "2",
        Bselect = "3",
        sort = sort,
        pnum = i
      ))
    tables <- html %>%
      rvest::html_elements(".songlist-table-block") %>%
      rvest::html_elements("table")
    df <- tables %>%
      rvest::html_table() %>%
      purrr::map_dfr(~ stats::na.omit(.))
    titles <- html %>%
      rvest::html_elements(".songlist-table-block") %>%
      rvest::html_elements(".songlist-title") %>%
      rvest::html_text()
    links <- tables %>%
      rvest::html_elements(".sp-w-100") %>%
      rvest::html_elements("a") %>%
      rvest::html_attr("href") %>%
      purrr::discard(~ . %in% c("https://www.uta-net.com/ranking/total/"))
    df %>%
      dplyr::slice_head(n = nrow(df) - 1) %>%
      dplyr::rename(
        text_lab = "\u66f2\u540d",
        artist = "\u6b4c\u624b\u540d",
        lyricist = "\u4f5c\u8a5e\u8005\u540d",
        composer = "\u4f5c\u66f2\u8005\u540d",
        leading = "\u6b4c\u3044\u51fa\u3057"
      ) %>%
      dplyr::bind_cols(
        data.frame(title = titles, link = links, source_page = i)
      ) %>%
      dplyr::select(
        .data$title,
        .data$artist,
        .data$lyricist,
        .data$composer,
        .data$leading,
        .data$link,
        .data$source_page
      )
  })
}
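# Usage sketch (editor's addition; the keyword is only an example):
# hits <- search_lyrics_list("猫", sort = "popular")
# dplyr::glimpse(hits)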
# --------------------------------------------------------------------------
```{r}
#| label: help-text
#| echo: false
#| output: asis
text <- withr::with_options(list(useFancyQuotes = FALSE),
  capture.output({
    tools::Rd2HTML(
      tools::Rd_db("base")[["use.Rd"]],
      outputEncoding = "UTF-8"
    )
  }) |>
    paste0(collapse = "\n") |>
    rvest::read_html() |>
    rvest::html_elements("main") |>
    rvest::html_text()
)
text |>
  stringr::str_split("\n") |>
  unlist(use.names = FALSE) |>
  rlang::as_function(~ paste("> ", .))() |>
  commonmark::markdown_commonmark() |>
  cat()
```
# --------------------------------------------------------------------------
dat <-
  readr::read_csv("data/lower_care.csv") |>
  dplyr::mutate(
    issue = factor(issue),
    speaker = factor(speaker),
    speakerGroup = factor(speakerGroup),
    speech = stringi::stri_replace_all_regex(speech, "\\s+", "") |>
      stringi::stri_trans_nfkc()
  )

# Nouns (名詞)
toks <- dat |>
  dplyr::select(speechID, speech, speaker, speakerGroup) |>
  gibasa::tokenize(
    speech,
    speechID,
    sys_dic = here::here("sudachidict/build")
  ) |>
  gibasa::prettify(
    into = gibasa::get_dict_features("sudachi"),
    col_select = c("POS1", "normalized_form", "reading_form")
  ) |>
  dplyr::mutate(
    reading_form = stringi::stri_replace_all_regex(
      reading_form,
      "[ァィゥェォャュョ]",
      ""
    ),
    mora = stringi::stri_length(reading_form),
    .by = doc_id
  )
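# How the mora count above works (editor's note): small kana are removed from
# the katakana reading before counting characters, so e.g. "キョウ" becomes
# "キウ" and stringi::stri_length() yields 2 morae.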
noun <- toks |>
  dplyr::reframe(
    tok = gibasa::ngram_tokenizer(3)(token, sep = "-"),
    pos = gibasa::ngram_tokenizer(3)(POS1, sep = "-"),
    .by = doc_id
  ) |>
  dplyr::filter(
    pos == "名詞-助詞-名詞",
    stringi::stri_detect_fixed(tok, "-の-")
  ) |>
  dplyr::distinct(tok) |>
  dplyr::pull(tok) |>
  stringi::stri_extract_first_regex("^([[:alpha:]]+)")

meishi <- toks |>
  dplyr::count(doc_id, token, mora) |>
  tidytext::bind_tf_idf(token, doc_id, n) |>
  dplyr::filter(token %in% noun, mora %in% c(2, 3, 4, 6)) |>
  dplyr::filter(dplyr::percent_rank(tf_idf) > .80) |>
  dplyr::distinct(token, mora)

readr::write_csv(
  meishi,
  "vocab_n.csv"
)

# Predicates (述部)
ch <-
  stringi::stri_split_boundaries(
    dat$speech,
    opts_brkiter = stringi::stri_opts_brkiter(
      locale = "ja@ld=auto;lw=phrase"
    )
  ) |>
  purrr::map(
    ~ stringi::stri_subset_regex(., "。")
  ) |>
  unlist() |>
  unique() |>
  stringi::stri_replace_all_fixed("。", "") |>
  purrr::keep(~ stringi::stri_length(.) > 2)

len <-
  gibasa::tokenize(ch, sys_dic = here::here("sudachidict/build")) |>
  gibasa::prettify(
    into = gibasa::get_dict_features("sudachi"),
    col_select = "reading_form"
  ) |>
  gibasa::as_tokens(reading_form, pos_field = NULL) |>
  purrr::map(
    ~ stringi::stri_replace_all_regex(., "[ァィゥェォャュョ]", "") |>
      stringi::stri_length() |>
      sum()
  ) |>
  unlist()

jutsubu <-
  tibble::tibble(
    word = ch,
    mora = len
  ) |>
  dplyr::filter(
    !gibasa::is_blank(word),
    mora > 4, mora < 8
  ) |>
  dplyr::arrange(mora)

readr::write_csv(
  jutsubu,
  "vocab_j.csv"
)

# Modifiers (修飾部)
shushoku <- toks |>
  dplyr::filter(
    POS1 %in% c("形容詞")
  ) |>
  dplyr::distinct(normalized_form, .keep_all = TRUE) |>
  dplyr::select(token, mora) |>
  dplyr::arrange(mora)

readr::write_csv(
  shushoku,
  "vocab_s.csv"
)
# --------------------------------------------------------------------------
dat <-
  readxl::read_xlsx(
    "data/39.xlsx",
    col_names = c(
      "title", "artist", "lyricist", "composer",
      "leading", "link", "source", "text",
      "release", "view"
    ),
    na = "NA"
  ) |>
  dplyr::reframe(
    text = stringi::stri_split_regex(text, "[\\s]{2,}") |>
      unlist() |>
      stringi::stri_replace_all_regex("[\\n]+", "。") |>
      stringi::stri_replace_all_regex("[「」()\\(\\)]", "") |>
      stringi::stri_trans_tolower(),
    .by = link
  )

# Nouns (名詞)
toks <- dat |>
  gibasa::tokenize(
    text,
    link,
    sys_dic = here::here("sudachidict/build")
  ) |>
  gibasa::prettify(
    into = gibasa::get_dict_features("sudachi"),
    col_select = c("POS1", "normalized_form", "reading_form")
  ) |>
  dplyr::mutate(
    reading_form = stringi::stri_replace_all_regex(
      reading_form,
      "[ァィゥェォャュョ]",
      ""
    ),
    mora = stringi::stri_length(reading_form),
    .by = doc_id
  )

noun <- toks |>
  dplyr::reframe(
    tok = gibasa::ngram_tokenizer(3)(token, sep = "-"),
    pos = gibasa::ngram_tokenizer(3)(POS1, sep = "-"),
    .by = doc_id
  ) |>
  dplyr::filter(
    pos == "名詞-助詞-名詞",
    stringi::stri_detect_fixed(tok, "-の-")
  ) |>
  dplyr::distinct(tok) |>
  dplyr::pull(tok) |>
  stringi::stri_extract_first_regex("^([[:alpha:]]+)")

meishi <- toks |>
  dplyr::filter(token %in% noun, mora %in% c(2, 3, 4, 6)) |>
  dplyr::distinct(token, mora)

readr::write_csv(
  meishi,
  "vocab_n.csv"
)

# Predicates (述部)
ch <-
  stringi::stri_split_boundaries(
    dat$text,
    opts_brkiter = stringi::stri_opts_brkiter(
      locale = "ja@ld=auto;lw=phrase"
    )
  ) |>
  purrr::map(
    ~ stringi::stri_subset_regex(., "。")
  ) |>
  unlist() |>
  unique() |>
  stringi::stri_replace_all_fixed("。", "") |>
  purrr::keep(~ stringi::stri_length(.) > 2)

len <-
  gibasa::tokenize(ch, sys_dic = here::here("sudachidict/build")) |>
  gibasa::prettify(
    into = gibasa::get_dict_features("sudachi"),
    col_select = "reading_form"
  ) |>
  gibasa::as_tokens(reading_form, pos_field = NULL) |>
  purrr::map(
    ~ stringi::stri_replace_all_regex(., "[ァィゥェォャュョ]", "") |>
      stringi::stri_length() |>
      sum()
  ) |>
  unlist()

jutsubu <-
  tibble::tibble(
    word = ch,
    mora = len
  ) |>
  dplyr::filter(
    !gibasa::is_blank(word),
    mora > 2, mora < 8
  ) |>
  dplyr::arrange(mora)

readr::write_csv(
  jutsubu,
  "vocab_j.csv"
)

# Modifiers (修飾部)
shushoku <- toks |>
  dplyr::filter(
    POS1 %in% c("形容詞", "副詞")
  ) |>
  dplyr::distinct(normalized_form, .keep_all = TRUE) |>
  dplyr::select(token, mora) |>
  dplyr::arrange(mora)

readr::write_csv(
  shushoku,
  "vocab_s.csv"
)
# --------------------------------------------------------------------------
#' Environment for internal use
#'
#' @noRd
#' @keywords internal
.env <- rlang::env(server = NULL)

#' @noRd
reset_encoding <- function(chr, encoding = "UTF-8") {
  Encoding(chr) <- encoding
  return(chr)
}

#' Call lattice command
#'
#' A debugging tool for the tokenization process; outputs a lattice in Graphviz dot format.
#'
#' @param sentences Character vector.
#' @param ... Other arguments are passed to \code{DiagrammeR::grViz}.
#'
#' @export
get_lattice <- function(sentences, ...) {
  sentences <- stringi::stri_omit_na(sentences)
  dot <- processx::run("kagome", c("lattice", stringi::stri_c(sentences, collapse = " ")))$stdout
  DiagrammeR::grViz(reset_encoding(dot), ...)
}

#' Launch Kagome server
#'
#' Starts or kills the Kagome server process.
#' @param dict Dictionary the Kagome server uses. Defaults to 'ipa' (IPA dictionary).
#' @return Logical value (whether the Kagome server is alive), returned invisibly.
#'
#' @export
lanch_server <- function(dict = c("ipa", "uni")) {
  p <- rlang::env_get(.env, "server")
  if (is.null(p)) {
    dict <- rlang::arg_match(dict)
    p <- processx::process$new("kagome", c("server", "-dict", dict))
    rlang::env_bind(.env, server = p)
  } else if (p$is_alive()) {
    kill <- yesno::yesno("Kagome server is already alive. Do you want to kill its process?")
    if (kill) {
      p$kill()
      rlang::env_bind(.env, server = NULL)
    }
  }
  return(invisible(p$is_alive()))
}

#' Send a HEAD request to the Kagome server
#'
#' @param url Character scalar; URL of the Kagome server.
#' @return An httr2 response, returned invisibly.
#'
#' @export
ping <- function(url = "http://localhost:6060") {
  resp <-
    httr2::request(url) |>
    httr2::req_method("HEAD") |>
    httr2::req_perform()
  return(invisible(resp))
}

#' Send a PUT request to the tokenize API
#'
#' @param sentences Character vector to be analyzed.
#' @param url URL of the Kagome server.
#' @param mode One of `normal`, `search`, or `extended`.
#' @return A tibble.
#'
#' @export
tokenize <- function(sentences,
                     url = "http://localhost:6060/tokenize",
                     mode = c("normal", "search", "extended")) {
  mode <- rlang::arg_match(mode)
  sentences <-
    stringi::stri_omit_na(sentences) |>
    stringi::stri_split_boundaries(type = "sentence") |>
    purrr::flatten_chr()
  resps <-
    furrr::future_imap_dfr(sentences, ~ tokenize_impl(.x, .y, url, mode)) |>
    dplyr::mutate(across(where(is.character), ~ dplyr::na_if(., "*"))) |>
    dplyr::mutate_at(c("doc_id", "class"), as.factor) |>
    tibble::as_tibble()
  return(resps)
}

#' @noRd
tokenize_impl <- function(msg, idx, url = "http://localhost:6060/tokenize", mode = "normal") {
  resp <-
    httr2::request(url) |>
    httr2::req_body_json(list(
      sentence = msg,
      mode = mode
    )) |>
    httr2::req_method("PUT") |>
    httr2::req_error(function(resp) httr2::resp_status(resp) > 400) |>
    httr2::req_perform()
  return(
    purrr::map_dfr(
      httr2::resp_body_json(resp)$tokens,
      ~ data.frame(
        doc_id = idx,
        id = .$id,
        start = .$start,
        end = .$end,
        class = .$class,
        token = .$surface,
        POS1 = .$pos[[1]],
        POS2 = .$pos[[2]],
        POS3 = .$pos[[3]],
        POS4 = .$pos[[4]],
        Original = .$base_form,
        Yomi1 = .$reading,
        Yomi2 = .$pronunciation
      )
    )
  )
}

.on_unload <- function(ns) {
  p <- rlang::env_get(.env, "server")
  if (!is.null(p) && p$is_alive()) {
    p$kill()
  }
}
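# Usage sketch (editor's addition; assumes the 'kagome' CLI is installed and
# on PATH):
# lanch_server("ipa")
# ping()
# tokenize("すもももももももものうち")
# get_lattice("ねこはかわいい")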
# --------------------------------------------------------------------------
texts <- ldccr::read_aozora(
  "https://www.aozora.gr.jp/cards/000035/files/1567_ruby_4948.zip",
  directory = tempdir()
)

sent <- readr::read_lines(texts) |>
  gibasa::tokenize(split = TRUE) |>
  gibasa::prettify() |>
  dplyr::filter(!POS1 %in% c("記号")) |>
  dplyr::group_by(doc_id, sentence_id) |>
  dplyr::reframe(
    text = paste0(token, collapse = " ")
  ) |>
  dplyr::mutate(
    sentence_id = paste(doc_id, sentence_id, sep = "_")
  )

### lexRankr ----
tok <- sent |>
  tidytext::unnest_tokens(
    token,
    text,
    token = \(x) strsplit(x, " ", fixed = TRUE)
  )

simil_df <- lexRankr::sentenceSimil(
  sentenceId = tok$sentence_id,
  token = tok$token,
  docId = tok$doc_id
)

top_n_sent <-
  lexRankr::lexRankFromSimil(
    simil_df$sent1,
    simil_df$sent2,
    simil = simil_df$similVal,
    n = 10,
    usePageRank = TRUE,
    continuous = TRUE,
    # threshold = 0.1,
    returnTies = FALSE
  )

top_n_sent |>
  dplyr::inner_join(sent, by = c("sentenceId" = "sentence_id")) |>
  dplyr::arrange(desc(value)) |>
  dplyr::as_tibble()

### replace sentenceSimil ----
mat <-
  sent |>
  tidytext::unnest_tokens(
    term,
    text,
    token = \(x) strsplit(x, " ", fixed = TRUE)
  ) |>
  dplyr::count(doc_id, term) |>
  tidytext::bind_tf_idf(term, doc_id, n) |>
  dplyr::mutate(tf_idf = n * (idf + 1)) |>
  dplyr::inner_join(
    sent |>
      tidytext::unnest_tokens(
        term,
        text,
        token = \(x) strsplit(x, " ", fixed = TRUE)
      ) |>
      dplyr::select(doc_id, sentence_id, term),
    by = c("doc_id" = "doc_id", "term" = "term")
  ) |>
  tidytext::cast_sparse(sentence_id, term, tf_idf)

dt <- proxyC::simil(
  mat,
  method = "cosine",
  # min_simil = 0.1,
  # rank = 50,
  use_nan = FALSE
)

dt |>
  tidytext:::tidy.dfm() |>
  dplyr::rename(
    s1 = document,
    s2 = term,
    weight = count
  ) |>
  (\(simil_df) {
    lexRankr::lexRankFromSimil(
      simil_df$s1,
      simil_df$s2,
      simil = simil_df$weight,
      n = nrow(simil_df),
      usePageRank = TRUE,
      continuous = TRUE,
      # threshold = 0.1,
      returnTies = TRUE
    )
  })() |>
  dplyr::inner_join(sent, by = c("sentenceId" = "sentence_id")) |>
  dplyr::arrange(desc(value)) |>
  dplyr::as_tibble()

### replace lexRankFromSimil ----
dt |>
  tidytext:::tidy.dfm() |>
  dplyr::rename(
    s1 = document,
    s2 = term,
    weight = count
  ) |>
  (\(simil_df) {
    pr <- simil_df |>
      igraph::graph_from_data_frame(
        directed = FALSE
      ) |>
      igraph::page_rank(
        directed = FALSE,
        damping = .85
      ) |>
      purrr::pluck("vector")
    tibble::tibble(
      sentenceId = names(pr),
      value = unname(pr)
    )
  })() |>
  dplyr::inner_join(sent, by = c("sentenceId" = "sentence_id")) |>
  dplyr::arrange(desc(value)) |>
  dplyr::as_tibble()
# --------------------------------------------------------------------------
texts <- ldccr::read_aozora(
  "https://www.aozora.gr.jp/cards/000035/files/1567_ruby_4948.zip",
  directory = tempdir()
)

sent <- readr::read_lines(texts) |>
  gibasa::tokenize() |>
  gibasa::prettify() |>
  dplyr::mutate(token = dplyr::if_else(is.na(Original), token, Original)) |>
  dplyr::reframe(
    token = gibasa::ngram_tokenizer(2)(token, sep = "-"),
    pos = gibasa::ngram_tokenizer(2)(POS1, sep = "-"),
    .by = doc_id
  ) |>
  dplyr::count(doc_id, token, pos) |>
  gibasa::bind_tf_idf2(
    token, doc_id, n,
    tf = "tf2", idf = "idf3", norm = TRUE
  ) |>
  dplyr::filter(!stringr::str_detect(pos, "(助詞)|(記号)")) |>
  dplyr::summarise(
    value = sum(tf_idf) |> round(digits = 2),
    .by = token
  ) |>
  tidyr::separate_wider_delim(
    token,
    delim = "-",
    names = c("source", "target")
  ) |>
  dplyr::slice_max(value, n = 40)

nodes <- readr::read_lines(texts) |>
  gibasa::tokenize() |>
  gibasa::prettify() |>
  dplyr::mutate(token = dplyr::if_else(is.na(Original), token, Original)) |>
  dplyr::filter(token %in% unique(c(sent$source, sent$target))) |>
  dplyr::count(token) |>
  dplyr::transmute(
    id = token,
    value = n,
    cluster = sent |>
      igraph::graph_from_data_frame() |>
      igraph::cluster_label_prop() |>
      igraph::membership() |>
      rlang::as_function(~ .[id])() |>
      unname() |>
      factor() |>
      forcats::fct_lump(n = 5, other_level = "0") |>
      (\(f) {
        forcats::fct_relabel(f,
          ~ scales::viridis_pal(alpha = .8, option = "H")(nlevels(f))
        )
      })()
  )

jsonlite::write_json(
  nodes,
  file.path(here::here(), "src/components/data/nodes.json")
)
jsonlite::write_json(
  sent,
  file.path(here::here(), "src/components/data/edges.json")
)
# --------------------------------------------------------------------------
require(sudachir)
require(apportita)

strj_split_boundaries <- \(x) {
  stringi::stri_split_boundaries(
    x,
    opts_brkiter = stringi::stri_opts_brkiter(
      locale = "ja@lw=phrase;ld=auto" # auto | loose | normal | strict | anywhere
    )
  )
}

split_kugire <- \(x) {
  strj_split_boundaries(x) |>
    purrr::map(\(elem) {
      len <- length(elem)
      if (len < 2) {
        return(NA_character_)
      } else {
        sapply(seq_len(len - 1), \(i) {
          s1 <- paste0(elem[1:i], collapse = "")
          s2 <- paste0(elem[(i + 1):len], collapse = "")
          paste(s1, s2, sep = "\t")
        })
      }
    }) |>
    unlist()
}
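# What split_kugire() returns (editor's sketch; the actual boundaries depend
# on ICU's Japanese phrase-break dictionary): every two-part split of the
# input at phrase boundaries, joined by a tab, roughly like
# split_kugire("猫はとてもかわいい")
# #> c("猫は\tとてもかわいい", "猫はとても\tかわいい")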
sudachi <- sudachir::rebuild_tokenizer(mode = "C")

form <- \(x) {
  unlist(sudachir::form(x, type = "normalized", pos = FALSE,
                        instance = sudachi))
}

wrd <- \(conn, s1, s2) {
  purrr::map2_dbl(s1, s2, \(el1, el2) {
    el1 <- form(el1)
    el2 <- form(el2)
    apportita::calc_wrd(conn, el1, el2)
  })
}

conn <- magnitude("models/magnitude/chive-1.2-mc90.magnitude")
dim(conn)

### tanka72 ----
dat <-
  readxl::read_xlsx("data/tanka/tanka72.xlsx") |>
  dplyr::transmute(
    id = factor(id),
    body = audubon::strj_normalize(body) |>
      stringr::str_remove_all("[^[:alnum:]]+"),
    author = factor(author)
  )

ret <- dat |>
  dplyr::reframe(
    id = id,
    phrase = split_kugire(body),
    author = author,
    .by = id
  ) |>
  tidyr::separate_wider_delim(
    phrase, delim = "\t", names = c("s1", "s2")
  )

ret <- ret |>
  dplyr::mutate(
    wrd = wrd(conn, s1, s2)
  )

arrow::write_parquet(ret, "tanka72-wrd.parquet")

### tweets ----
tweets <-
  readr::read_csv(
    "data/shinabitanori-230622.csv.gz",
    col_names = c("id", "time", "body"),
    col_types = "ccc"
  ) |>
  dplyr::filter(
    !stringr::str_detect(body, "@|(RT)"),
    stringr::str_length(body) > 14,
    stringr::str_length(body) < 30
  ) |>
  dplyr::slice_sample(n = 100) |>
  dplyr::transmute(
    id = factor(id),
    body = audubon::strj_normalize(body) |>
      ldccr::clean_url() |>
      stringr::str_remove_all("[^[:alnum:]]+")
  ) |>
  dplyr::filter(
    !stringi::stri_isempty(body)
  )

tweets <- tweets |>
  dplyr::reframe(
    id = id,
    phrase = split_kugire(body),
    .by = id
  ) |>
  tidyr::separate_wider_delim(
    phrase, delim = "\t", names = c("s1", "s2")
  )

tweets <- tweets |>
  dplyr::mutate(
    wrd = wrd(conn, s1, s2)
  )

arrow::write_parquet(tweets, "shinabitanori-wrd.parquet")
close(conn)
rm(conn)
# --------------------------------------------------------------------------
require(sudachir)
require(apportita)

strj_split_boundaries <- \(x) {
  stringi::stri_split_boundaries(
    x,
    opts_brkiter = stringi::stri_opts_brkiter(
      locale = "ja@lw=phrase;ld=auto" # auto | loose | normal | strict | anywhere
    )
  )
}

split_kugire <- \(x) {
  strj_split_boundaries(x) |>
    purrr::map(\(elem) {
      len <- length(elem)
      if (len < 2) {
        return(NA_character_)
      } else {
        sapply(seq_len(len - 1), \(i) {
          s1 <- paste0(elem[1:i], collapse = "")
          s2 <- paste0(elem[(i + 1):len], collapse = "")
          paste(s1, s2, sep = "\t")
        })
      }
    }) |>
    unlist()
}

sudachi <- sudachir::rebuild_tokenizer(mode = "C")

form <- \(x) {
  unlist(sudachir::form(x, type = "normalized", pos = FALSE,
                        instance = sudachi))
}

wrd <- \(conn, s1, s2) {
  purrr::map2_dbl(s1, s2, \(el1, el2) {
    el1 <- form(el1)
    el2 <- form(el2)
    tryCatch(
      apportita::calc_wrd(conn, el1, el2),
      error = \(e) {
        NA
      }
    )
  }, .progress = TRUE)
}

conn <- magnitude("models/magnitude/chive-1.2-mc90.magnitude")
dim(conn)

### tanka ----
dat <-
  readxl::read_excel(
    "data/tanka/1001-1500_poems.xlsx",
    col_names = c("id", "poems", "names", "loves", "likes", "keys1", "keys2")
  ) |>
  # dplyr::slice_sample(n = 20) |>
  dplyr::transmute(
    id = id,
    names = stringr::str_remove_all(names, "<U\\+[0-9A-F]+>") |>
      audubon::strj_normalize() |>
      stringr::str_remove_all("[^[:alnum:]]+"),
    body = stringr::str_remove_all(poems, "<U\\+[0-9A-F]+>") |>
      audubon::strj_normalize() |>
      stringr::str_remove_all("[^[:alnum:]]+")
  ) |>
  dplyr::filter(
    !stringi::stri_isempty(body)
  )

dat <- dat |>
  dplyr::reframe(
    id = id,
    phrase = split_kugire(body),
    .by = id
  ) |>
  tidyr::separate_wider_delim(
    phrase, delim = "\t", names = c("s1", "s2")
  )

arrow::write_parquet(dat, "tanka-kugire.parquet")
print("Phrase splitting done") # 区切り処理終わり

dat <- dat |>
  tidyr::drop_na() |>
  dplyr::mutate(
    wrd = wrd(conn, s1, s2)
  )

arrow::write_parquet(dat, "tanka-wrd.parquet")
close(conn)
rm(conn)
# --------------------------------------------------------------------------
clean_url <- \(text) {
  pat <- "(https?|ftp)://([[a-zA-Z0-9]-]+\\.)+[[a-zA-Z0-9]-]+(/[[a-zA-Z0-9]- ./?%&=~]*)?"
  stringr::str_replace_all(text, pattern = pat, replacement = "URL")
}

to_df <- \(jsonlist) {
  purrr::map(jsonlist, \(elem) {
    text <- elem$rendered_body |>
      rvest::read_html() |>
      rvest::html_elements("p") |>
      rvest::html_text() |>
      paste(collapse = "\n")
    tags <- elem$tags |>
      purrr::map_chr(~ .x$name) |>
      paste(collapse = ",")
    tibble::tibble(
      "doc_id" = elem$id,
      "created" = elem$created_at,
      "updated" = elem$updated_at,
      "author" = elem$user$permanent_id,
      "title" = audubon::strj_normalize(elem$title),
      "text" = audubon::strj_normalize(text),
      "tags" = tags,
      "comments" = elem$comments_count,
      "likes" = elem$likes_count,
      "reactions" = elem$reactions_count,
      "stocks" = elem$stocks_count
    )
  }) |>
    purrr::list_rbind() |>
    dplyr::filter(
      cld3::detect_language(text) == "ja"
    ) |>
    dplyr::mutate(
      created = lubridate::as_date(created),
      updated = lubridate::as_date(updated)
    )
}

PAGES <- ceiling(4642 / 100)

df <- seq_len(PAGES) |>
  purrr::map(\(i) {
    li <- qiitr::qiita_get_items(
      tag_id = "r",
      per_page = 100, # range: 1-100
      page_offset = (i - 1), # 1 + page_offset < 100
      page_limit = 1
    )
    if (i %% 16 == 0) {
      rlang::inform(sprintf("Sleeping 16 minutes; fetched %d pages so far.", i))
      Sys.sleep(60 * 16)
    }
    return(to_df(li))
  }, .progress = TRUE) |>
  purrr::list_rbind()

arrow::write_parquet(df, here::here("data/qiita.parquet"))
# --------------------------------------------------------------------------
tbl <- readxl::read_xlsx("Downloads/koe.xlsx") |>
  tibble::rowid_to_column(var = "doc_id") |>
  dplyr::mutate(across(where(is.character), ~ dplyr::na_if(., "NA"))) |>
  tidyr::drop_na()

tbl |>
  dplyr::select(Region, Sex, Age, Satis) |>
  DataExplorer::plot_bar()

tbl <- tbl |>
  dplyr::mutate(
    doc_id = factor(doc_id),
    Region = factor(Region),
    Sex = factor(Sex),
    Age = stringi::stri_trans_nfkc(Age),
    Age = dplyr::case_match(
      Age,
      c("10代", "20代") ~ "20",
      "30代" ~ "30",
      "40代" ~ "40",
      "50代" ~ "50",
      c("60代", "70代") ~ "60"
    ),
    Age = factor(Age),
    Satis = stringr::str_detect(Satis, "満足"),
    Opinion = audubon::strj_normalize(Opinion) |>
      stringr::str_replace_all("[[:number:]]+", "N")
  )

summary(tbl)

df <- tbl |>
  dplyr::select(!Region) |>
  gibasa::tokenize(Opinion) |>
  gibasa::prettify(col_select = c("POS1", "POS2", "Original")) |>
  dplyr::mutate(token = dplyr::if_else(is.na(Original), token, Original)) |>
  gibasa::mute_tokens(
    (!POS1 %in% c("名詞", "形容詞")) |
      (token %in% c(stopwords::stopwords("ja", "marimo"), "沖縄", "観光", "旅行"))
  ) |>
  gibasa::pack()

corp <- df |>
  dplyr::right_join(
    tbl |>
      dplyr::mutate(Attr = paste(Age, Sex, sep = "_"), .keep = "unused"),
    by = "doc_id"
  ) |>
  quanteda::corpus()

require(ca)

cam <- corp |>
  quanteda::tokens(what = "fastestword") |>
  quanteda::dfm() |>
  quanteda::dfm_group(groups = quanteda::docvars(corp, "Attr")) |>
  quanteda::dfm_trim(min_termfreq = 10, max_termfreq = 50) |>
  quanteda.textmodels::textmodel_ca()

plot(cam)

require(ggplot2)

dplyr::bind_rows(
  as.data.frame(cam$rowcoord),
  as.data.frame(cam$colcoord)
) |>
  tibble::rownames_to_column() |>
  dplyr::mutate(shape = stringr::str_detect(rowname, "_")) |>
  ggplot(aes(Dim1, Dim2, shape = shape, colour = shape)) +
  geom_vline(xintercept = 0, colour = "grey", linetype = "dashed") +
  geom_hline(yintercept = 0, colour = "grey", linetype = "dashed") +
  geom_point() +
  ggrepel::geom_text_repel(aes(label = rowname), max.overlaps = 20) +
  theme_light() +
  theme(legend.position = "none")

df <- tbl |>
  dplyr::select(!Region) |>
  gibasa::tokenize(Opinion) |>
  gibasa::prettify(col_select = c("POS1", "POS2", "Original")) |>
  dplyr::mutate(token = dplyr::if_else(is.na(Original), token, Original)) |>
  gibasa::mute_tokens(
    (!POS1 %in% c("名詞", "形容詞")) |
      (token %in% c(stopwords::stopwords("ja", "marimo"), "沖縄", "観光", "旅行"))
  ) |>
  dtplyr::lazy_dt() |>
  dplyr::group_by(doc_id) |>
  dplyr::mutate(term1 = token, term2 = dplyr::lead(term1)) |>
  dplyr::add_count(term1, term2, name = "cooc") |>
  dplyr::ungroup() |>
  dplyr::select(doc_id, token, term1, term2, cooc)

biterms <- df |>
  dplyr::select(doc_id, term1, term2, cooc) |>
  tidyr::drop_na() |>
  dplyr::as_tibble()

model <- df |>
  dplyr::select(doc_id, token) |>
  dplyr::as_tibble() |>
  BTM::BTM(k = 5, background = TRUE, biterms = biterms)

plot(model, top_n = 20)
# --------------------------------------------------------------------------
# R Benchmark 2.5 (06/2008) [Simon Urbanek]
# version 2.5: scaled to get roughly 1s per test, R 2.7.0 @ 2.6GHz Mac Pro
# R Benchmark 2.4 (06/2008) [Simon Urbanek]
# version 2.4 adapted to more recent Matrix package
# R Benchmark 2.3 (21 April 2004)
# Warning: changes are not carefully checked yet!
# version 2.3 adapted to R 1.9.0
# Many thanks to Douglas Bates ([email protected]) for improvements!
# version 2.2 adapted to R 1.8.0
# version 2.1 adapted to R 1.7.0
# version 2, scaled to get 1 +/- 0.1 sec with R 1.6.2
# using the standard ATLAS library (Rblas.dll)
# on a Pentium IV 1.6 Ghz with 1 Gb Ram on Win XP pro
# revised and optimized for R v. 1.5.x, 8 June 2002
# Requires additional libraries: Matrix, SuppDists
# Author : Philippe Grosjean
# eMail  : [email protected]
# Web    : http://www.sciviews.org
# License: GPL 2 or above at your convenience (see: http://www.gnu.org)
#
# Several tests are adapted from the Splus Benchmark Test V. 2
# by Stephan Steinhaus ([email protected])
# Reference for Escoufier's equivalent vectors (test III.5):
# Escoufier Y., 1970. Echantillonnage dans une population de variables
# aleatoires réelles. Publ. Inst. Statis. Univ. Paris 19 Fasc 4, 1-47.
#
# type source("c:/<dir>/R2.R") to start the test

runs <- 3 # Number of times the tests are executed
times <- rep(0, 15); dim(times) <- c(5, 3)
require(Matrix) # Optimized matrix operations
#require(SuppDists) # Optimized random number generators
#Runif <- rMWC1019 # The fast uniform number generator
Runif <- runif
# If you don't have SuppDists, you can use: Runif <- runif
#a <- rMWC1019(10, new.start=TRUE, seed=492166) # Init. the generator
#Rnorm <- rziggurat # The fast normal number generator
# If you don't have SuppDists, you can use: Rnorm <- rnorm
#b <- rziggurat(10, new.start=TRUE) # Init. the generator
Rnorm <- rnorm
#remove("a", "b")
options(object.size = 100000000)

cat("\n\n R Benchmark 2.5\n")
cat(" ===============\n")
cat(c("Number of times each test is run__________________________: ", runs))
cat("\n\n")

cat(" I. Matrix calculation\n")
cat(" ---------------------\n")
if (R.Version()$os == "Win32" || R.Version()$os == "mingw32") flush.console()

# (1)
cumulate <- 0; a <- 0; b <- 0
for (i in 1:runs) {
  invisible(gc())
  timing <- system.time({
    a <- matrix(Rnorm(2500*2500)/10, ncol=2500, nrow=2500);
    b <- t(a);
    dim(b) <- c(1250, 5000);
    a <- t(b)
  })[3]
  cumulate <- cumulate + timing
}
timing <- cumulate/runs
times[1, 1] <- timing
cat(c("Creation, transp., deformation of a 2500x2500 matrix (sec): ", timing, "\n"))
remove("a", "b")
if (R.Version()$os == "Win32" || R.Version()$os == "mingw32") flush.console()

# (2)
cumulate <- 0; b <- 0
for (i in 1:runs) {
  a <- abs(matrix(Rnorm(2500*2500)/2, ncol=2500, nrow=2500));
  invisible(gc())
  timing <- system.time({
    b <- a^1000
  })[3]
  cumulate <- cumulate + timing
}
timing <- cumulate/runs
times[2, 1] <- timing
cat(c("2500x2500 normal distributed random matrix ^1000____ (sec): ", timing, "\n"))
remove("a", "b")
if (R.Version()$os == "Win32" || R.Version()$os == "mingw32") flush.console()

# (3)
cumulate <- 0; b <- 0
for (i in 1:runs) {
  a <- Rnorm(7000000)
  invisible(gc())
  timing <- system.time({
    b <- sort(a, method="quick") # Sort is modified in v. 1.5.x
    # And there is now a quick method that better competes with other packages!!!
  })[3]
  cumulate <- cumulate + timing
}
timing <- cumulate/runs
times[3, 1] <- timing
cat(c("Sorting of 7,000,000 random values__________________ (sec): ", timing, "\n"))
remove("a", "b")
if (R.Version()$os == "Win32" || R.Version()$os == "mingw32") flush.console()

# (4)
cumulate <- 0; b <- 0
for (i in 1:runs) {
  a <- Rnorm(2800*2800); dim(a) <- c(2800, 2800)
  invisible(gc())
  timing <- system.time({
    b <- crossprod(a) # equivalent to: b <- t(a) %*% a
  })[3]
  cumulate <- cumulate + timing
}
timing <- cumulate/runs
times[4, 1] <- timing
cat(c("2800x2800 cross-product matrix (b = a' * a)_________ (sec): ", timing, "\n"))
remove("a", "b")
if (R.Version()$os == "Win32" || R.Version()$os == "mingw32") flush.console()

# (5)
cumulate <- 0; c <- 0; qra <- 0
for (i in 1:runs) {
  a <- new("dgeMatrix", x = Rnorm(2000*2000), Dim = as.integer(c(2000, 2000)))
  b <- as.double(1:2000)
  invisible(gc())
  timing <- system.time({
    c <- solve(crossprod(a), crossprod(a, b))
  })[3]
  cumulate <- cumulate + timing
  # This is the old method
  #a <- Rnorm(600*600); dim(a) <- c(600,600)
  #b <- 1:600
  #invisible(gc())
  #timing <- system.time({
  #  qra <- qr(a, tol = 1e-7);
  #  c <- qr.coef(qra, b)
  #  #Rem: a little faster than c <- lsfit(a, b, inter=F)$coefficients
  #})[3]
  #cumulate <- cumulate + timing
}
timing <- cumulate/runs
times[5, 1] <- timing
cat(c("Linear regr. over a 2000x2000 matrix (c = a \\ b')___ (sec): ", timing, "\n"))
remove("a", "b", "c", "qra")
if (R.Version()$os == "Win32" || R.Version()$os == "mingw32") flush.console()

times[ , 1] <- sort(times[ , 1])
cat(" --------------------------------------------\n")
cat(c(" Trimmed geom. mean (2 extremes eliminated): ", exp(mean(log(times[2:4, 1]))), "\n\n"))

cat(" II. Matrix functions\n")
cat(" --------------------\n")
if (R.Version()$os == "Win32" || R.Version()$os == "mingw32") flush.console()

# (1)
cumulate <- 0; b <- 0
for (i in 1:runs) {
  a <- Rnorm(2400000)
  invisible(gc())
  timing <- system.time({
    b <- fft(a)
  })[3]
  cumulate <- cumulate + timing
}
timing <- cumulate/runs
times[1, 2] <- timing
cat(c("FFT over 2,400,000 random values____________________ (sec): ", timing, "\n"))
remove("a", "b")
if (R.Version()$os == "Win32" || R.Version()$os == "mingw32") flush.console()

# (2)
cumulate <- 0; b <- 0
for (i in 1:runs) {
  a <- array(Rnorm(600*600), dim = c(600, 600))
  # Only needed if using eigen.Matrix(): Matrix.class(a)
  invisible(gc())
  timing <- system.time({
    b <- eigen(a, symmetric=FALSE, only.values=TRUE)$Value
    # Rem: on my machine, it is faster than:
    # b <- La.eigen(a, symmetric=F, only.values=T, method="dsyevr")$Value
    # b <- La.eigen(a, symmetric=F, only.values=T, method="dsyev")$Value
    # b <- eigen.Matrix(a, vectors = F)$Value
  })[3]
  cumulate <- cumulate + timing
}
timing <- cumulate/runs
times[2, 2] <- timing
cat(c("Eigenvalues of a 600x600 random matrix______________ (sec): ", timing, "\n"))
remove("a", "b")
if (R.Version()$os == "Win32" || R.Version()$os == "mingw32") flush.console()

# (3)
cumulate <- 0; b <- 0
for (i in 1:runs) {
  a <- Rnorm(2500*2500); dim(a) <- c(2500, 2500)
  #Matrix.class(a)
  invisible(gc())
  timing <- system.time({
    #b <- determinant(a, logarithm=F)
    # Rem: the following is slower on my computer!
    # b <- det.default(a)
    b <- det(a)
  })[3]
  cumulate <- cumulate + timing
}
timing <- cumulate/runs
times[3, 2] <- timing
cat(c("Determinant of a 2500x2500 random matrix____________ (sec): ", timing, "\n"))
remove("a", "b")
if (R.Version()$os == "Win32" || R.Version()$os == "mingw32") flush.console()

# (4)
cumulate <- 0; b <- 0
for (i in 1:runs) {
  a <- crossprod(new("dgeMatrix", x = Rnorm(3000*3000),
                     Dim = as.integer(c(3000, 3000))))
  invisible(gc())
  #a <- Rnorm(900*900); dim(a) <- c(900, 900)
  #a <- crossprod(a, a)
  timing <- system.time({
    b <- chol(a)
  })[3]
  cumulate <- cumulate + timing
}
timing <- cumulate/runs
times[4, 2] <- timing
cat(c("Cholesky decomposition of a 3000x3000 matrix________ (sec): ", timing, "\n"))
remove("a", "b")
if (R.Version()$os == "Win32" || R.Version()$os == "mingw32") flush.console()

# (5)
cumulate <- 0; b <- 0
for (i in 1:runs) {
  a <- new("dgeMatrix", x = Rnorm(1600*1600), Dim = as.integer(c(1600, 1600)))
  invisible(gc())
  #a <- Rnorm(400*400); dim(a) <- c(400, 400)
  timing <- system.time({
    # b <- qr.solve(a)
    # Rem: a little faster than
    b <- solve(a)
  })[3]
  cumulate <- cumulate + timing
}
timing <- cumulate/runs
times[5, 2] <- timing
cat(c("Inverse of a 1600x1600 random matrix________________ (sec): ", timing, "\n"))
remove("a", "b")
if (R.Version()$os == "Win32" || R.Version()$os == "mingw32") flush.console()

times[ , 2] <- sort(times[ , 2])
cat(" --------------------------------------------\n")
cat(c(" Trimmed geom. mean (2 extremes eliminated): ", exp(mean(log(times[2:4, 2]))), "\n\n"))

cat(" III. Programmation\n")
cat(" ------------------\n")
if (R.Version()$os == "Win32" || R.Version()$os == "mingw32") flush.console()

# (1)
cumulate <- 0; a <- 0; b <- 0; phi <- 1.6180339887498949
for (i in 1:runs) {
  a <- floor(Runif(3500000)*1000)
  invisible(gc())
  timing <- system.time({
    b <- (phi^a - (-phi)^(-a))/sqrt(5)
  })[3]
  cumulate <- cumulate + timing
}
timing <- cumulate/runs
times[1, 3] <- timing
cat(c("3,500,000 Fibonacci numbers calculation (vector calc)(sec): ", timing, "\n"))
remove("a", "b", "phi")
if (R.Version()$os == "Win32" || R.Version()$os == "mingw32") flush.console()

# (2)
cumulate <- 0; a <- 3000; b <- 0
for (i in 1:runs) {
  invisible(gc())
  timing <- system.time({
    b <- rep(1:a, a); dim(b) <- c(a, a);
    b <- 1 / (t(b) + 0:(a-1))
    # Rem: this is twice as fast as the following code proposed by R programmers
    # a <- 1:a; b <- 1 / outer(a - 1, a, "+")
  })[3]
  cumulate <- cumulate + timing
}
timing <- cumulate/runs
times[2, 3] <- timing
cat(c("Creation of a 3000x3000 Hilbert matrix (matrix calc) (sec): ", timing, "\n"))
remove("a", "b")
if (R.Version()$os == "Win32" || R.Version()$os == "mingw32") flush.console()

# (3)
cumulate <- 0; c <- 0
gcd2 <- function(x, y) {if (sum(y > 1.0E-4) == 0) x else {y[y == 0] <- x[y == 0]; Recall(y, x %% y)}}
for (i in 1:runs) {
  a <- ceiling(Runif(400000)*1000)
  b <- ceiling(Runif(400000)*1000)
  invisible(gc())
  timing <- system.time({
    c <- gcd2(a, b) # gcd2 is a recursive function
  })[3]
  cumulate <- cumulate + timing
}
timing <- cumulate/runs
times[3, 3] <- timing
cat(c("Greatest common divisors of 400,000 pairs (recursion) (sec): ", timing, "\n"))
remove("a", "b", "c", "gcd2")
if (R.Version()$os == "Win32" || R.Version()$os == "mingw32") flush.console()

# (4)
cumulate <- 0; b <- 0
for (i in 1:runs) {
  b <- rep(0, 500*500); dim(b) <- c(500, 500)
  invisible(gc())
  timing <- system.time({
    # Rem: there are faster ways to do this
    # but here we want to time loops (500x500 'for' loops)!
    for (j in 1:500) {
      for (k in 1:500) {
        b[k, j] <- abs(j - k) + 1
      }
    }
  })[3]
  cumulate <- cumulate + timing
}
timing <- cumulate/runs
times[4, 3] <- timing
cat(c("Creation of a 500x500 Toeplitz matrix (loops)_______ (sec): ", timing, "\n"))
remove("b", "j", "k")
if (R.Version()$os == "Win32" || R.Version()$os == "mingw32") flush.console()

# (5)
cumulate <- 0; p <- 0; vt <- 0; vr <- 0; vrt <- 0; rvt <- 0; RV <- 0; j <- 0; k <- 0;
x2 <- 0; R <- 0; Rxx <- 0; Ryy <- 0; Rxy <- 0; Ryx <- 0; Rvmax <- 0
# Calculate the trace of a matrix (sum of its diagonal elements)
Trace <- function(y) {sum(c(y)[1 + 0:(min(dim(y)) - 1) * (dim(y)[1] + 1)], na.rm=FALSE)}
for (i in 1:runs) {
  x <- abs(Rnorm(45*45)); dim(x) <- c(45, 45)
  invisible(gc())
  timing <- system.time({
    # Calculation of Escoufier's equivalent vectors
    p <- ncol(x)
    vt <- 1:p # Variables to test
    vr <- NULL # Result: ordered variables
    RV <- 1:p # Result: correlations
    vrt <- NULL
    for (j in 1:p) { # loop on the variable number
      Rvmax <- 0
      for (k in 1:(p-j+1)) { # loop on the variables
        x2 <- cbind(x, x[, vr], x[, vt[k]])
        R <- cor(x2) # Correlations table
        Ryy <- R[1:p, 1:p]
        Rxx <- R[(p+1):(p+j), (p+1):(p+j)]
        Rxy <- R[(p+1):(p+j), 1:p]
        Ryx <- t(Rxy)
        rvt <- Trace(Ryx %*% Rxy) / sqrt(Trace(Ryy %*% Ryy) * Trace(Rxx %*% Rxx)) # RV calculation
        if (rvt > Rvmax) {
          Rvmax <- rvt # test of RV
          vrt <- vt[k] # temporarily held variable
        }
      }
      vr[j] <- vrt # Result: variable
      RV[j] <- Rvmax # Result: correlation
      vt <- vt[vt != vr[j]] # reidentify variables to test
    }
  })[3]
  cumulate <- cumulate + timing
}
timing <- cumulate/runs
times[5, 3] <- timing
cat(c("Escoufier's method on a 45x45 matrix (mixed)________ (sec): ", timing, "\n"))
remove("x", "p", "vt", "vr", "vrt", "rvt", "RV", "j", "k")
remove("x2", "R", "Rxx", "Ryy", "Rxy", "Ryx", "Rvmax", "Trace")
if (R.Version()$os == "Win32" || R.Version()$os == "mingw32") flush.console()

times[ , 3] <- sort(times[ , 3])
cat(" --------------------------------------------\n")
cat(c(" Trimmed geom. mean (2 extremes eliminated): ", exp(mean(log(times[2:4, 3]))), "\n\n\n"))

cat(c("Total time for all 15 tests_________________________ (sec): ", sum(times), "\n"))
cat(c("Overall mean (sum of I, II and III trimmed means/3)_ (sec): ", exp(mean(log(times[2:4, ]))), "\n"))
remove("cumulate", "timing", "times", "runs", "i")
cat(" --- End of test ---\n\n")
# --------------------------------------------------------------------------
suppressPackageStartupMessages({
  require(tidymodels)
})

ames_split <- initial_split(modeldata::ames, strata = Sale_Price)
ames_train <- training(ames_split)
ames_test <- testing(ames_split)

ames_rec <- recipe(Sale_Price ~ ., data = ames_train) |>
  step_mutate(Sale_Price = log1p(Sale_Price)) |>
  step_nzv(all_predictors()) |>
  step_select(
    all_outcomes(),
    all_numeric_predictors(),
    !starts_with("Year"), !ends_with("tude")
  ) |>
  step_normalize(all_numeric_predictors())

# injection
# https://parsnip.tidymodels.org/reference/model_spec.html
num_trees <- 1200L
ames_spec <- rand_forest(mtry = tune(), trees = !!num_trees) |>
  set_mode("regression") |>
  set_engine("ranger") |>
  set_args(num.threads = !!parallelly::availableCores(omit = 1), seed = 1234)

str(ames_spec)

base_wf <-
  workflow() |>
  add_recipe(ames_rec) |>
  add_model(
    rand_forest(trees = !!num_trees) |>
      set_engine("ranger") |>
      set_mode("regression") |>
      set_args(num.threads = !!parallelly::availableCores(omit = 1)),
    formula = expm1(Sale_Price) ~ .
  ) |>
  fit(ames_train)

augment(
  extract_fit_parsnip(base_wf),
  new_data = prep(ames_rec) |> bake(new_data = ames_test)
) |>
  dplyr::mutate(Sale_Price = expm1(Sale_Price)) |> # Sale_Price was log1p-transformed, so invert it
  metrics(truth = Sale_Price, estimate = .pred)

ames_grid <- workflow() |>
  add_recipe(ames_rec) |>
  add_model(ames_spec, formula = Sale_Price ~ .) |>
  tune_grid(
    resamples = vfold_cv(ames_train, v = 3),
    grid = grid_latin_hypercube(
      extract_parameter_set_dials(ames_spec) |>
        finalize(prep(ames_rec) |> bake(new_data = NULL)),
      size = 10
    ),
    control = control_grid(verbose = FALSE),
    metrics = metric_set(rmse)
  )

select_best(ames_grid)
show_best(ames_grid)

best_wf <-
  workflow() |>
  add_recipe(ames_rec) |>
  add_model(ames_spec, formula = expm1(Sale_Price) ~ .) |>
  finalize_workflow(select_best(ames_grid))

best_fit <- last_fit(best_wf, ames_split)
collect_predictions(best_fit)

collect_predictions(best_fit) |>
  dplyr::mutate(Sale_Price = expm1(Sale_Price)) |> # Sale_Price was log1p-transformed, so invert it
  metrics(truth = Sale_Price, estimate = .pred)

# model formula
# https://github.com/tidymodels/parsnip/blob/8f13c1c41ce603261f25af64694c253f618fa999/R/model_formula.R
linear_reg(penalty = 1.0) |>
  set_engine("glmnet") |>
  set_mode("regression")

### baritsu
suppressPackageStartupMessages({
  require(tidymodels)
  require(baritsu)
})

#### classification
data("penguins", package = "modeldata")
data_split <- initial_split(penguins, strata = species, prop = .7)
penguins_train <- training(data_split)

rec <-
  recipe(
    species ~ .,
    data = penguins_train
  ) |>
  step_impute_mode(all_nominal()) |>
  step_impute_median(all_numeric_predictors()) |>
  step_zv(all_numeric_predictors()) |>
  step_scale(all_numeric_predictors()) |>
  step_dummy(all_nominal_predictors())

spec <- svm_linear(
  margin = tune()
) |>
  set_engine("baritsu", penalty = tune()) |>
  set_mode("classification")

wf <- workflow() |>
  add_recipe(rec) |>
  add_model(spec)

wf_fit <- wf |>
  tune_grid(
    resamples = vfold_cv(penguins_train, v = 5, strata = species),
    grid = grid_max_entropy(
      svm_margin(),
      penalty(range = c(-1, 0)),
      size = 5
    ),
    metrics = metric_set(f_meas),
    control = control_grid(verbose = FALSE)
  )

show_best(wf_fit)

best_wf_fit <- wf |>
  finalize_workflow(select_best(wf_fit)) |>
  last_fit(data_split, metrics = metric_set(f_meas))

collect_metrics(best_wf_fit)

#### regression
pkgload::load_all(export_all = FALSE)

ames <- modeldata::ames |>
  dplyr::slice_sample(n = 100) |>
  dplyr::select(Sale_Price, Lot_Area, Total_Bsmt_SF, First_Flr_SF)
data_split <- rsample::initial_split(ames)
ames_train <- rsample::training(data_split)
ames_test <- rsample::testing(data_split)

rec <-
  recipes::recipe(
    Sale_Price ~ .,
    data = ames_train
  ) |>
  recipes::step_scale(recipes::all_numeric_predictors())

spec <- lars(
  penalty_L1 = tune::tune(),
  penalty_L2 = tune::tune()
) |>
  parsnip::set_engine("baritsu") |>
  parsnip::set_mode("regression")

wf <-
  workflows::workflow() |>
  workflows::add_recipe(rec) |>
  workflows::add_model(spec)

wf_fit <- wf |>
  tune::tune_grid(
    resamples = rsample::vfold_cv(ames_train, v = 5),
    grid = dials::grid_max_entropy(
      dials::penalty_L1(range = c(-10, 0)),
      dials::penalty_L2(range = c(-10, 0)),
      size = 10
    ),
    metrics = yardstick::metric_set(yardstick::rmse),
    control = tune::control_grid(verbose = TRUE)
  )

penguins <- modeldata::penguins
data_split <- rsample::initial_split(penguins, strata = "species")
penguins_train <- rsample::training(data_split)
penguins_test <- rsample::testing(data_split)

rec <-
  recipes::recipe(
    species ~ .,
    data = penguins_train
  ) |>
  # recipes::step_select(recipes::all_outcomes(), recipes::all_numeric_predictors()) |>
  recipes::step_impute_mode(recipes::all_nominal()) |>
  recipes::step_impute_median(recipes::all_numeric_predictors()) |>
  recipes::step_scale(recipes::all_numeric_predictors())

my_dat <- recipes::prep(rec) |> recipes::bake(new_data = NULL)
my_dat2 <- recipes::prep(rec) |> recipes::bake(new_data = penguins_test)

wf_fit <- wf |>
  tune::tune_grid(
    resamples = rsample::vfold_cv(penguins_train, v = 3),
    grid = dials::grid_max_entropy(
      dials::penalty(),
      size = 5
    ),
    metrics = yardstick::metric_set(yardstick::f_meas),
    control = tune::control_grid(verbose = TRUE)
  )
# --------------------------------------------------------------------------
require(audio.whisper)

model <- whisper("models/ggml-base.bin")
trans <- predict(model, "output.wav", language = "ja")
arrow::write_parquet(trans$tokens, "output.wav.parquet")
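# Editor's note (an assumption, not in the original): the ggml model file must
# already exist locally; audio.whisper can download one, e.g.:
# audio.whisper::whisper_download_model("base")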