Scripts
# --------------------------------------------------------------------------
akita <- jisx0402::municipality |>
  dplyr::filter(
    pref_code == "05",
    is.na(end_date),
    !stringr::str_ends(name, "郡")
  ) |>
  dplyr::mutate(
    muni_code = paste0(pref_code, city_code),
    muni_code = paste0(muni_code, jisx0402::check_digit(muni_code)),
    start_year = lubridate::year(start_date)
  ) |>
  dplyr::select(muni_code, name, start_year) |>
  dplyr::arrange(desc(start_year))

year <- max(akita$start_year, na.rm = TRUE)

dat <- arrow::read_parquet("~/Downloads/data/jpop.parquet") |>
  dplyr::filter(.data$year >= .env$year) |>
  dplyr::right_join(akita, by = c("code" = "muni_code"))

require(ggplot2)

dat |>
  dplyr::filter(name == "秋田市", age == "all") |>
  ggplot(aes(year, population)) +
  geom_line(aes(colour = sex)) +
  scale_y_log10()

ga <- jisx0402::jptopography("designated") |>
  dplyr::filter(stringr::str_starts(muni_code, "05")) |>
  rlang::as_function(
    ~ dplyr::mutate(.,
      muni_code = paste0(muni_code, jisx0402::check_digit(muni_code)),
      area = units::set_units(sf::st_area(.), km^2)
    )
  )() |>
  dplyr::right_join(
    dplyr::filter(dat, age == "all"),
    by = c("muni_code" = "code")
  ) |>
  dplyr::group_by(sex) |>
  dplyr::mutate(
    denst = round(population / area),
    rank = dplyr::percent_rank(denst) * 10
  ) |>
  dplyr::ungroup() |>
  ggplot() +
  geom_sf(aes(fill = rank), na.rm = TRUE, show.legend = FALSE) +
  facet_wrap(~ sex) +
  labs(
    title = "Rank of Population Density in Akita, throughout 2006-2023",
    subtitle = "Year: {frame_time}",
    caption = paste(
      "地図データ:「国土数値情報 行政区域データ」(国土交通省)",
      "https://nlftp.mlit.go.jp/ksj/gml/datalist/KsjTmplt-N03-v3_0.html を加工して作成",
      sep = "\n"
    )
  ) +
  theme_light() +
  gganimate::transition_time(year)

gganimate::animate(ga, renderer = gganimate::ffmpeg_renderer())
gganimate::anim_save("akita-denst1.mp4", path = "Videos")
# --------------------------------------------------------------------------
tokyo <- jisx0402::municipality |>
  dplyr::filter(
    pref_code == "13",
    is.na(end_date),
    !stringr::str_ends(name, "郡|(特別区)|(支庁)")
  ) |>
  dplyr::mutate(
    muni_code = paste0(pref_code, city_code),
    muni_code = paste0(muni_code, jisx0402::check_digit(muni_code)),
    start_year = lubridate::year(start_date)
  ) |>
  dplyr::select(muni_code, name, start_year) |>
  dplyr::arrange(desc(start_year))

year <- max(tokyo$start_year, na.rm = TRUE)

dat <- arrow::read_parquet("~/Downloads/data/jpop.parquet") |>
  dplyr::filter(.data$year >= .env$year) |>
  dplyr::right_join(tokyo, by = c("code" = "muni_code"))

require(ggplot2)

dat |>
  dplyr::filter(
    age == "all", stringr::str_detect(name, "区"),
    year > 2005L
  ) |>
  ggplot(aes(year, population)) +
  geom_line(aes(colour = sex)) +
  facet_wrap(~ name) +
  scale_y_log10()

ga <- jisx0402::jptopography("all", resolution = 1) |>
  dplyr::filter(stringr::str_starts(muni_code, "13")) |>
  rlang::as_function(
    ~ dplyr::mutate(.,
      muni_code = paste0(muni_code, jisx0402::check_digit(muni_code)),
      area = units::set_units(sf::st_area(.), km^2)
    )
  )() |>
  dplyr::right_join(
    dplyr::filter(dat, age == "all", stringr::str_detect(name, "区")),
    by = c("muni_code" = "code")
  ) |>
  dplyr::group_by(sex) |>
  dplyr::mutate(
    denst = round(population / area),
    rank = dplyr::percent_rank(denst) * 10
  ) |>
  dplyr::ungroup() |>
  ggplot() +
  geom_sf(aes(fill = rank), na.rm = TRUE, show.legend = FALSE) +
  facet_wrap(~ sex) +
  labs(
    title = "Rank of Population Density in Tokyo Special Wards, throughout 2006-2023",
    subtitle = "Year: {frame_time}",
    caption = paste(
      "地図データ:「国土数値情報 行政区域データ」(国土交通省)",
      "https://nlftp.mlit.go.jp/ksj/gml/datalist/KsjTmplt-N03-v3_0.html を加工して作成",
      sep = "\n"
    )
  ) +
  theme_light() +
  gganimate::transition_time(year, range = c(2006L, 2023L))

gganimate::animate(ga, renderer = gganimate::ffmpeg_renderer())
gganimate::anim_save("tokyo23-denst1.mp4", path = "Videos")
# --------------------------------------------------------------------------
#' @export
accelerate <- function(text = "精神を加速させろ") {
  r"(
∩_∩
/ \ /\
|(゚)=(゚)|
| ●_● |
/ ヽ
r⌒| 〃 ------ ヾ
/ i/ |_二__ノ
./ / / ) {text}
./ / / //
/ ./ / ̄
.ヽ、__./ / ⌒ヽ
r /
/ ノ
/ / /
./ // /
/. ./ ./ /
i / ./ /
i ./ .ノ.^/
i ./ |_/
i /
/ /
(_/
)" |> stringi::stri_replace_all_fixed(" ", " ") |> glue::glue()
}

#' @export
qb <- function(text = "わけがわからないよ") {
  r"(
|\ /|
|\\ //|
: ,> `´ ̄`´ < ′
. V V
. i< ● ● >i
八 、_,_, 八 {text}
. / 个 . _ _ . 个 ',
_/ il ,' '. li ',__
)" |> stringi::stri_replace_all_fixed(" ", " ") |> glue::glue()
}
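# Usage sketch (editor's addition): both helpers return a glue string, so
# calling them at the console prints the ASCII art with `text` interpolated:
# accelerate()
# qb()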
# --------------------------------------------------------------------------
# get_lyrics.R -----
#' Scrape lyrics from list
#'
#' @param df A tibble returned by \code{get_lyrics_list}.
#' @param file String; file name to append lyrics to.
#' @param links String; column name of lyrics links.
#' @returns `file` is returned invisibly.
#' @export
#' @examples
#' \dontrun{
#' csv_file <-
#'   aznyan::get_lyrics_list("23729") |>
#'   aznyan::get_lyrics("23729.csv")
#' tbl <-
#'   readr::read_csv(csv_file, col_names = FALSE, col_types = "cccc___cDn") |>
#'   dplyr::rename(
#'     title = X1,
#'     artist = X2,
#'     lyricist = X3,
#'     composer = X4,
#'     text = X8,
#'     released = X9,
#'     page_view = X10
#'   )
#' }
get_lyrics <- function(df, file, links = "link") {
  base_url <- "https://www.uta-net.com"
  links <- dplyr::pull(df, {{ links }})
  url <- paste(base_url, links, sep = "/")
  session <- polite::bow(base_url, force = FALSE)
  purrr::iwalk(url, function(q, itr) {
    html <- session %>%
      polite::nod(q) %>%
      polite::scrape()
    lyric_body <- html %>%
      rvest::html_element("#kashi_area") %>%
      rvest::html_text2()
    info <- html %>%
      rvest::html_element(".song-infoboard") %>%
      rvest::html_element(".detail") %>%
      rvest::html_text() %>%
      stringr::str_split("\\n") %>%
      unlist() %>%
      purrr::pluck(4) %>%
      stringr::str_extract_all(pattern = "[\\d,/]+") %>%
      unlist()
    data.frame(
      df[itr, ],
      lyric = lyric_body,
      released = info[1],
      page_viewed = info[2]
    ) %>%
      dplyr::mutate(
        released = lubridate::as_date(.data$released),
        page_viewed = stringr::str_remove_all(.data$page_viewed, ",") %>%
          unlist() %>%
          as.numeric()
      ) %>%
      readr::write_csv(file, append = TRUE, progress = FALSE)
  })
  invisible(file)
}

# lyrics_list.R -----
#' Scrape table of lyrics list
#'
#' @param id String; the xxx part of 'https://www.uta-net.com/:type:/xxx/'.
#' @param type String; one of "artist", "lyricist", or "composer".
#' @returns A tibble.
#' @export
get_lyrics_list <- function(id,
                            type = c("artist", "lyricist", "composer")) {
  base_url <- "https://www.uta-net.com"
  type <- rlang::arg_match(type, c("artist", "lyricist", "composer"))
  url <- paste(base_url, type, id, "", sep = "/")
  session <-
    polite::bow(base_url, force = FALSE) %>%
    polite::nod(url)
  html <- session %>%
    polite::scrape()
  page_list <- html %>%
    rvest::html_element(".songlist-table-block") %>%
    rvest::html_element("tfoot") %>%
    rvest::html_text() %>%
    stringr::str_extract("([:number:]+)")
  purrr::map_dfr(seq.int(as.integer(page_list)), function(i) {
    html <- session %>%
      polite::nod(path = paste(url, "0", as.character(i), "", sep = "/")) %>%
      polite::scrape()
    tables <- html %>%
      rvest::html_elements(".songlist-table-block") %>%
      rvest::html_elements("table")
    df <- tables %>%
      rvest::html_table() %>%
      purrr::map_dfr(~ stats::na.omit(.))
    titles <- html %>%
      rvest::html_elements(".songlist-table-block") %>%
      rvest::html_elements(".songlist-title") %>%
      rvest::html_text()
    links <- tables %>%
      rvest::html_elements(".sp-w-100") %>%
      rvest::html_elements("a") %>%
      rvest::html_attr("href") %>%
      purrr::discard(~ . %in% c("https://www.uta-net.com/ranking/total/"))
    df %>%
      dplyr::slice_head(n = nrow(df) - 1) %>%
      dplyr::rename(
        text_lab = "\u66f2\u540d",
        artist = "\u6b4c\u624b\u540d",
        lyricist = "\u4f5c\u8a5e\u8005\u540d",
        composer = "\u4f5c\u66f2\u8005\u540d",
        leading = "\u6b4c\u3044\u51fa\u3057"
      ) %>%
      dplyr::bind_cols(
        data.frame(title = titles, link = links, source_page = i)
      ) %>%
      dplyr::select(
        .data$title,
        .data$artist,
        .data$lyricist,
        .data$composer,
        .data$leading,
        .data$link,
        .data$source_page
      )
  })
}

#' Search lyrics list by keyword
#'
#' @param keyword String; search phrase.
#' @param sort String; one of "new", "popular", "title", or "artist".
#' @returns A tibble.
#' @export
search_lyrics_list <- function(keyword,
                               sort = c("new", "popular", "title", "artist")) {
  base_url <- "https://www.uta-net.com"
  sort <- rlang::arg_match(sort, c("new", "popular", "title", "artist"))
  sort <- dplyr::case_when(
    sort == "new" ~ 6,
    sort == "popular" ~ 4,
    sort == "title" ~ 1,
    sort == "artist" ~ 7,
    TRUE ~ 1
  )
  url <- paste(base_url, "search", "", sep = "/")
  session <-
    polite::bow(base_url, force = FALSE) %>%
    polite::nod(url)
  html <- session %>%
    polite::scrape(query = list(
      Keyword = stringr::str_trim(keyword),
      Aselect = "2",
      Bselect = "3",
      sort = sort
    ))
  page_list <- html %>%
    rvest::html_element("#songlist-sort-paging") %>%
    rvest::html_text() %>%
    stringr::str_extract("([:number:]+)")
  purrr::map_dfr(seq.int(as.integer(page_list)), function(i) {
    html <- session %>%
      polite::scrape(query = list(
        Keyword = enc2utf8(keyword),
        Aselect = "2",
        Bselect = "3",
        sort = sort,
        pnum = i
      ))
    tables <- html %>%
      rvest::html_elements(".songlist-table-block") %>%
      rvest::html_elements("table")
    df <- tables %>%
      rvest::html_table() %>%
      purrr::map_dfr(~ stats::na.omit(.))
    titles <- html %>%
      rvest::html_elements(".songlist-table-block") %>%
      rvest::html_elements(".songlist-title") %>%
      rvest::html_text()
    links <- tables %>%
      rvest::html_elements(".sp-w-100") %>%
      rvest::html_elements("a") %>%
      rvest::html_attr("href") %>%
      purrr::discard(~ . %in% c("https://www.uta-net.com/ranking/total/"))
    df %>%
      dplyr::slice_head(n = nrow(df) - 1) %>%
      dplyr::rename(
        text_lab = "\u66f2\u540d",
        artist = "\u6b4c\u624b\u540d",
        lyricist = "\u4f5c\u8a5e\u8005\u540d",
        composer = "\u4f5c\u66f2\u8005\u540d",
        leading = "\u6b4c\u3044\u51fa\u3057"
      ) %>%
      dplyr::bind_cols(
        data.frame(title = titles, link = links, source_page = i)
      ) %>%
      dplyr::select(
        .data$title,
        .data$artist,
        .data$lyricist,
        .data$composer,
        .data$leading,
        .data$link,
        .data$source_page
      )
  })
}
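# Usage sketch (editor's addition; the keyword is only an example):
# hits <- search_lyrics_list("猫", sort = "popular")
# dplyr::glimpse(hits)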
# --------------------------------------------------------------------------
```{r}
#| label: help-text
#| echo: false
#| output: asis
text <- withr::with_options(list(useFancyQuotes = FALSE),
  capture.output({
    tools::Rd2HTML(
      tools::Rd_db("base")[["use.Rd"]],
      outputEncoding = "UTF-8"
    )
  }) |>
    paste0(collapse = "\n") |>
    rvest::read_html() |>
    rvest::html_elements("main") |>
    rvest::html_text()
)
text |>
  stringr::str_split("\n") |>
  unlist(use.names = FALSE) |>
  rlang::as_function(~ paste("> ", .))() |>
  commonmark::markdown_commonmark() |>
  cat()
```
# --------------------------------------------------------------------------
dat <-
  readr::read_csv("data/lower_care.csv") |>
  dplyr::mutate(
    issue = factor(issue),
    speaker = factor(speaker),
    speakerGroup = factor(speakerGroup),
    speech = stringi::stri_replace_all_regex(speech, "\\s+", "") |>
      stringi::stri_trans_nfkc()
  )

# Nouns (名詞)
toks <- dat |>
  dplyr::select(speechID, speech, speaker, speakerGroup) |>
  gibasa::tokenize(
    speech,
    speechID,
    sys_dic = here::here("sudachidict/build")
  ) |>
  gibasa::prettify(
    into = gibasa::get_dict_features("sudachi"),
    col_select = c("POS1", "normalized_form", "reading_form")
  ) |>
  dplyr::mutate(
    reading_form = stringi::stri_replace_all_regex(
      reading_form,
      "[ァィゥェォャュョ]",
      ""
    ),
    mora = stringi::stri_length(reading_form),
    .by = doc_id
  )
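# How the mora count above works (editor's note): small kana are removed from
# the katakana reading before counting characters, so e.g. "キョウ" becomes
# "キウ" and stringi::stri_length() yields 2 morae.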
noun <- toks |>
  dplyr::reframe(
    tok = gibasa::ngram_tokenizer(3)(token, sep = "-"),
    pos = gibasa::ngram_tokenizer(3)(POS1, sep = "-"),
    .by = doc_id
  ) |>
  dplyr::filter(
    pos == "名詞-助詞-名詞",
    stringi::stri_detect_fixed(tok, "-の-")
  ) |>
  dplyr::distinct(tok) |>
  dplyr::pull(tok) |>
  stringi::stri_extract_first_regex("^([[:alpha:]]+)")

meishi <- toks |>
  dplyr::count(doc_id, token, mora) |>
  tidytext::bind_tf_idf(token, doc_id, n) |>
  dplyr::filter(token %in% noun, mora %in% c(2, 3, 4, 6)) |>
  dplyr::filter(dplyr::percent_rank(tf_idf) > .80) |>
  dplyr::distinct(token, mora)

readr::write_csv(
  meishi,
  "vocab_n.csv"
)

# Predicates (述部)
ch <-
  stringi::stri_split_boundaries(
    dat$speech,
    opts_brkiter = stringi::stri_opts_brkiter(
      locale = "ja@ld=auto;lw=phrase"
    )
  ) |>
  purrr::map(
    ~ stringi::stri_subset_regex(., "。")
  ) |>
  unlist() |>
  unique() |>
  stringi::stri_replace_all_fixed("。", "") |>
  purrr::keep(~ stringi::stri_length(.) > 2)

len <-
  gibasa::tokenize(ch, sys_dic = here::here("sudachidict/build")) |>
  gibasa::prettify(
    into = gibasa::get_dict_features("sudachi"),
    col_select = "reading_form"
  ) |>
  gibasa::as_tokens(reading_form, pos_field = NULL) |>
  purrr::map(
    ~ stringi::stri_replace_all_regex(., "[ァィゥェォャュョ]", "") |>
      stringi::stri_length() |>
      sum()
  ) |>
  unlist()

jutsubu <-
  tibble::tibble(
    word = ch,
    mora = len
  ) |>
  dplyr::filter(
    !gibasa::is_blank(word),
    mora > 4, mora < 8
  ) |>
  dplyr::arrange(mora)

readr::write_csv(
  jutsubu,
  "vocab_j.csv"
)

# Modifiers (修飾部)
shushoku <- toks |>
  dplyr::filter(
    POS1 %in% c("形容詞")
  ) |>
  dplyr::distinct(normalized_form, .keep_all = TRUE) |>
  dplyr::select(token, mora) |>
  dplyr::arrange(mora)

readr::write_csv(
  shushoku,
  "vocab_s.csv"
)
# --------------------------------------------------------------------------
dat <-
  readxl::read_xlsx(
    "data/39.xlsx",
    col_names = c(
      "title", "artist", "lyricist", "composer",
      "leading", "link", "source", "text",
      "release", "view"
    ),
    na = "NA"
  ) |>
  dplyr::reframe(
    text = stringi::stri_split_regex(text, "[\\s]{2,}") |>
      unlist() |>
      stringi::stri_replace_all_regex("[\\n]+", "。") |>
      stringi::stri_replace_all_regex("[「」()\\(\\)]", "") |>
      stringi::stri_trans_tolower(),
    .by = link
  )

# Nouns (名詞)
toks <- dat |>
  gibasa::tokenize(
    text,
    link,
    sys_dic = here::here("sudachidict/build")
  ) |>
  gibasa::prettify(
    into = gibasa::get_dict_features("sudachi"),
    col_select = c("POS1", "normalized_form", "reading_form")
  ) |>
  dplyr::mutate(
    reading_form = stringi::stri_replace_all_regex(
      reading_form,
      "[ァィゥェォャュョ]",
      ""
    ),
    mora = stringi::stri_length(reading_form),
    .by = doc_id
  )

noun <- toks |>
  dplyr::reframe(
    tok = gibasa::ngram_tokenizer(3)(token, sep = "-"),
    pos = gibasa::ngram_tokenizer(3)(POS1, sep = "-"),
    .by = doc_id
  ) |>
  dplyr::filter(
    pos == "名詞-助詞-名詞",
    stringi::stri_detect_fixed(tok, "-の-")
  ) |>
  dplyr::distinct(tok) |>
  dplyr::pull(tok) |>
  stringi::stri_extract_first_regex("^([[:alpha:]]+)")

meishi <- toks |>
  dplyr::filter(token %in% noun, mora %in% c(2, 3, 4, 6)) |>
  dplyr::distinct(token, mora)

readr::write_csv(
  meishi,
  "vocab_n.csv"
)

# Predicates (述部)
ch <-
  stringi::stri_split_boundaries(
    dat$text,
    opts_brkiter = stringi::stri_opts_brkiter(
      locale = "ja@ld=auto;lw=phrase"
    )
  ) |>
  purrr::map(
    ~ stringi::stri_subset_regex(., "。")
  ) |>
  unlist() |>
  unique() |>
  stringi::stri_replace_all_fixed("。", "") |>
  purrr::keep(~ stringi::stri_length(.) > 2)

len <-
  gibasa::tokenize(ch, sys_dic = here::here("sudachidict/build")) |>
  gibasa::prettify(
    into = gibasa::get_dict_features("sudachi"),
    col_select = "reading_form"
  ) |>
  gibasa::as_tokens(reading_form, pos_field = NULL) |>
  purrr::map(
    ~ stringi::stri_replace_all_regex(., "[ァィゥェォャュョ]", "") |>
      stringi::stri_length() |>
      sum()
  ) |>
  unlist()

jutsubu <-
  tibble::tibble(
    word = ch,
    mora = len
  ) |>
  dplyr::filter(
    !gibasa::is_blank(word),
    mora > 2, mora < 8
  ) |>
  dplyr::arrange(mora)

readr::write_csv(
  jutsubu,
  "vocab_j.csv"
)

# Modifiers (修飾部)
shushoku <- toks |>
  dplyr::filter(
    POS1 %in% c("形容詞", "副詞")
  ) |>
  dplyr::distinct(normalized_form, .keep_all = TRUE) |>
  dplyr::select(token, mora) |>
  dplyr::arrange(mora)

readr::write_csv(
  shushoku,
  "vocab_s.csv"
)
# --------------------------------------------------------------------------
#' Environment for internal use
#'
#' @noRd
#' @keywords internal
.env <- rlang::env(server = NULL)

#' @noRd
reset_encoding <- function(chr, encoding = "UTF-8") {
  Encoding(chr) <- encoding
  return(chr)
}

#' Call lattice command
#'
#' A debugging tool for the tokenization process; outputs a lattice in Graphviz dot format.
#'
#' @param sentences Character vector.
#' @param ... Other arguments are passed to \code{DiagrammeR::grViz}.
#'
#' @export
get_lattice <- function(sentences, ...) {
  sentences <- stringi::stri_omit_na(sentences)
  dot <- processx::run("kagome", c("lattice", stringi::stri_c(sentences, collapse = " ")))$stdout
  DiagrammeR::grViz(reset_encoding(dot), ...)
}

#' Launch Kagome server
#'
#' Starts or kills the Kagome server process.
#' @param dict Dictionary the Kagome server uses. Defaults to 'ipa' (IPA dictionary).
#' @return Logical value (whether the Kagome server is alive), returned invisibly.
#'
#' @export
lanch_server <- function(dict = c("ipa", "uni")) {
  p <- rlang::env_get(.env, "server")
  if (is.null(p)) {
    dict <- rlang::arg_match(dict)
    p <- processx::process$new("kagome", c("server", "-dict", dict))
    rlang::env_bind(.env, server = p)
  } else if (p$is_alive()) {
    kill <- yesno::yesno("Kagome server is already alive. Do you want to kill its process?")
    if (kill) {
      p$kill()
      rlang::env_bind(.env, server = NULL)
    }
  }
  return(invisible(p$is_alive()))
}

#' Send a HEAD request to the Kagome server
#'
#' @param url Character scalar; URL of the Kagome server.
#' @return An httr2 response, returned invisibly.
#'
#' @export
ping <- function(url = "http://localhost:6060") {
  resp <-
    httr2::request(url) |>
    httr2::req_method("HEAD") |>
    httr2::req_perform()
  return(invisible(resp))
}

#' Send a PUT request to the tokenize API
#'
#' @param sentences Character vector to be analyzed.
#' @param url URL of the Kagome server.
#' @param mode One of `normal`, `search`, or `extended`.
#' @return A tibble.
#'
#' @export
tokenize <- function(sentences,
                     url = "http://localhost:6060/tokenize",
                     mode = c("normal", "search", "extended")) {
  mode <- rlang::arg_match(mode)
  sentences <-
    stringi::stri_omit_na(sentences) |>
    stringi::stri_split_boundaries(type = "sentence") |>
    purrr::flatten_chr()
  resps <-
    furrr::future_imap_dfr(sentences, ~ tokenize_impl(.x, .y, url, mode)) |>
    dplyr::mutate(across(where(is.character), ~ dplyr::na_if(., "*"))) |>
    dplyr::mutate_at(c("doc_id", "class"), as.factor) |>
    tibble::as_tibble()
  return(resps)
}

#' @noRd
tokenize_impl <- function(msg, idx, url = "http://localhost:6060/tokenize", mode = "normal") {
  resp <-
    httr2::request(url) |>
    httr2::req_body_json(list(
      sentence = msg,
      mode = mode
    )) |>
    httr2::req_method("PUT") |>
    httr2::req_error(function(resp) httr2::resp_status(resp) > 400) |>
    httr2::req_perform()
  return(
    purrr::map_dfr(
      httr2::resp_body_json(resp)$tokens,
      ~ data.frame(
        doc_id = idx,
        id = .$id,
        start = .$start,
        end = .$end,
        class = .$class,
        token = .$surface,
        POS1 = .$pos[[1]],
        POS2 = .$pos[[2]],
        POS3 = .$pos[[3]],
        POS4 = .$pos[[4]],
        Original = .$base_form,
        Yomi1 = .$reading,
        Yomi2 = .$pronunciation
      )
    )
  )
}

.on_unload <- function(ns) {
  p <- rlang::env_get(.env, "server")
  if (!is.null(p) && p$is_alive()) {
    p$kill()
  }
}
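# Usage sketch (editor's addition; assumes the 'kagome' CLI is installed and
# on PATH):
# lanch_server("ipa")
# ping()
# tokenize("すもももももももものうち")
# get_lattice("ねこはかわいい")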
# --------------------------------------------------------------------------
texts <- ldccr::read_aozora(
  "https://www.aozora.gr.jp/cards/000035/files/1567_ruby_4948.zip",
  directory = tempdir()
)

sent <- readr::read_lines(texts) |>
  gibasa::tokenize(split = TRUE) |>
  gibasa::prettify() |>
  dplyr::filter(!POS1 %in% c("記号")) |>
  dplyr::group_by(doc_id, sentence_id) |>
  dplyr::reframe(
    text = paste0(token, collapse = " ")
  ) |>
  dplyr::mutate(
    sentence_id = paste(doc_id, sentence_id, sep = "_")
  )

### lexRankr ----
tok <- sent |>
  tidytext::unnest_tokens(
    token,
    text,
    token = \(x) strsplit(x, " ", fixed = TRUE)
  )

simil_df <- lexRankr::sentenceSimil(
  sentenceId = tok$sentence_id,
  token = tok$token,
  docId = tok$doc_id
)

top_n_sent <-
  lexRankr::lexRankFromSimil(
    simil_df$sent1,
    simil_df$sent2,
    simil = simil_df$similVal,
    n = 10,
    usePageRank = TRUE,
    continuous = TRUE,
    # threshold = 0.1,
    returnTies = FALSE
  )

top_n_sent |>
  dplyr::inner_join(sent, by = c("sentenceId" = "sentence_id")) |>
  dplyr::arrange(desc(value)) |>
  dplyr::as_tibble()

### replace sentenceSimil ----
mat <-
  sent |>
  tidytext::unnest_tokens(
    term,
    text,
    token = \(x) strsplit(x, " ", fixed = TRUE)
  ) |>
  dplyr::count(doc_id, term) |>
  tidytext::bind_tf_idf(term, doc_id, n) |>
  dplyr::mutate(tf_idf = n * (idf + 1)) |>
  dplyr::inner_join(
    sent |>
      tidytext::unnest_tokens(
        term,
        text,
        token = \(x) strsplit(x, " ", fixed = TRUE)
      ) |>
      dplyr::select(doc_id, sentence_id, term),
    by = c("doc_id" = "doc_id", "term" = "term")
  ) |>
  tidytext::cast_sparse(sentence_id, term, tf_idf)

dt <- proxyC::simil(
  mat,
  method = "cosine",
  # min_simil = 0.1,
  # rank = 50,
  use_nan = FALSE
)

dt |>
  tidytext:::tidy.dfm() |>
  dplyr::rename(
    s1 = document,
    s2 = term,
    weight = count
  ) |>
  (\(simil_df) {
    lexRankr::lexRankFromSimil(
      simil_df$s1,
      simil_df$s2,
      simil = simil_df$weight,
      n = nrow(simil_df),
      usePageRank = TRUE,
      continuous = TRUE,
      # threshold = 0.1,
      returnTies = TRUE
    )
  })() |>
  dplyr::inner_join(sent, by = c("sentenceId" = "sentence_id")) |>
  dplyr::arrange(desc(value)) |>
  dplyr::as_tibble()

### replace lexRankFromSimil ----
dt |>
  tidytext:::tidy.dfm() |>
  dplyr::rename(
    s1 = document,
    s2 = term,
    weight = count
  ) |>
  (\(simil_df) {
    pr <- simil_df |>
      igraph::graph_from_data_frame(
        directed = FALSE
      ) |>
      igraph::page_rank(
        directed = FALSE,
        damping = .85
      ) |>
      purrr::pluck("vector")
    tibble::tibble(
      sentenceId = names(pr),
      value = unname(pr)
    )
  })() |>
  dplyr::inner_join(sent, by = c("sentenceId" = "sentence_id")) |>
  dplyr::arrange(desc(value)) |>
  dplyr::as_tibble()
# --------------------------------------------------------------------------
texts <- ldccr::read_aozora(
  "https://www.aozora.gr.jp/cards/000035/files/1567_ruby_4948.zip",
  directory = tempdir()
)

sent <- readr::read_lines(texts) |>
  gibasa::tokenize() |>
  gibasa::prettify() |>
  dplyr::mutate(token = dplyr::if_else(is.na(Original), token, Original)) |>
  dplyr::reframe(
    token = gibasa::ngram_tokenizer(2)(token, sep = "-"),
    pos = gibasa::ngram_tokenizer(2)(POS1, sep = "-"),
    .by = doc_id
  ) |>
  dplyr::count(doc_id, token, pos) |>
  gibasa::bind_tf_idf2(
    token, doc_id, n,
    tf = "tf2", idf = "idf3", norm = TRUE
  ) |>
  dplyr::filter(!stringr::str_detect(pos, "(助詞)|(記号)")) |>
  dplyr::summarise(
    value = sum(tf_idf) |> round(digits = 2),
    .by = token
  ) |>
  tidyr::separate_wider_delim(
    token,
    delim = "-",
    names = c("source", "target")
  ) |>
  dplyr::slice_max(value, n = 40)

nodes <- readr::read_lines(texts) |>
  gibasa::tokenize() |>
  gibasa::prettify() |>
  dplyr::mutate(token = dplyr::if_else(is.na(Original), token, Original)) |>
  dplyr::filter(token %in% unique(c(sent$source, sent$target))) |>
  dplyr::count(token) |>
  dplyr::transmute(
    id = token,
    value = n,
    cluster = sent |>
      igraph::graph_from_data_frame() |>
      igraph::cluster_label_prop() |>
      igraph::membership() |>
      rlang::as_function(~ .[id])() |>
      unname() |>
      factor() |>
      forcats::fct_lump(n = 5, other_level = "0") |>
      (\(f) {
        forcats::fct_relabel(f,
          ~ scales::viridis_pal(alpha = .8, option = "H")(nlevels(f))
        )
      })()
  )

jsonlite::write_json(
  nodes,
  file.path(here::here(), "src/components/data/nodes.json")
)
jsonlite::write_json(
  sent,
  file.path(here::here(), "src/components/data/edges.json")
)
# --------------------------------------------------------------------------
require(sudachir)
require(apportita)

strj_split_boundaries <- \(x) {
  stringi::stri_split_boundaries(
    x,
    opts_brkiter = stringi::stri_opts_brkiter(
      locale = "ja@lw=phrase;ld=auto" # auto | loose | normal | strict | anywhere
    )
  )
}

split_kugire <- \(x) {
  strj_split_boundaries(x) |>
    purrr::map(\(elem) {
      len <- length(elem)
      if (len < 2) {
        return(NA_character_)
      } else {
        sapply(seq_len(len - 1), \(i) {
          s1 <- paste0(elem[1:i], collapse = "")
          s2 <- paste0(elem[(i + 1):len], collapse = "")
          paste(s1, s2, sep = "\t")
        })
      }
    }) |>
    unlist()
}
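# What split_kugire() returns (editor's sketch; the actual boundaries depend
# on ICU's Japanese phrase-break dictionary): every two-part split of the
# input at phrase boundaries, joined by a tab, roughly like
# split_kugire("猫はとてもかわいい")
# #> c("猫は\tとてもかわいい", "猫はとても\tかわいい")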
sudachi <- sudachir::rebuild_tokenizer(mode = "C")

form <- \(x) {
  unlist(sudachir::form(x, type = "normalized", pos = FALSE,
                        instance = sudachi))
}

wrd <- \(conn, s1, s2) {
  purrr::map2_dbl(s1, s2, \(el1, el2) {
    el1 <- form(el1)
    el2 <- form(el2)
    apportita::calc_wrd(conn, el1, el2)
  })
}

conn <- magnitude("models/magnitude/chive-1.2-mc90.magnitude")
dim(conn)

### tanka72 ----
dat <-
  readxl::read_xlsx("data/tanka/tanka72.xlsx") |>
  dplyr::transmute(
    id = factor(id),
    body = audubon::strj_normalize(body) |>
      stringr::str_remove_all("[^[:alnum:]]+"),
    author = factor(author)
  )

ret <- dat |>
  dplyr::reframe(
    id = id,
    phrase = split_kugire(body),
    author = author,
    .by = id
  ) |>
  tidyr::separate_wider_delim(
    phrase, delim = "\t", names = c("s1", "s2")
  )

ret <- ret |>
  dplyr::mutate(
    wrd = wrd(conn, s1, s2)
  )

arrow::write_parquet(ret, "tanka72-wrd.parquet")

### tweets ----
tweets <-
  readr::read_csv(
    "data/shinabitanori-230622.csv.gz",
    col_names = c("id", "time", "body"),
    col_types = "ccc"
  ) |>
  dplyr::filter(
    !stringr::str_detect(body, "@|(RT)"),
    stringr::str_length(body) > 14,
    stringr::str_length(body) < 30
  ) |>
  dplyr::slice_sample(n = 100) |>
  dplyr::transmute(
    id = factor(id),
    body = audubon::strj_normalize(body) |>
      ldccr::clean_url() |>
      stringr::str_remove_all("[^[:alnum:]]+")
  ) |>
  dplyr::filter(
    !stringi::stri_isempty(body)
  )

tweets <- tweets |>
  dplyr::reframe(
    id = id,
    phrase = split_kugire(body),
    .by = id
  ) |>
  tidyr::separate_wider_delim(
    phrase, delim = "\t", names = c("s1", "s2")
  )

tweets <- tweets |>
  dplyr::mutate(
    wrd = wrd(conn, s1, s2)
  )

arrow::write_parquet(tweets, "shinabitanori-wrd.parquet")
close(conn)
rm(conn)
# --------------------------------------------------------------------------
require(sudachir)
require(apportita)

strj_split_boundaries <- \(x) {
  stringi::stri_split_boundaries(
    x,
    opts_brkiter = stringi::stri_opts_brkiter(
      locale = "ja@lw=phrase;ld=auto" # auto | loose | normal | strict | anywhere
    )
  )
}

split_kugire <- \(x) {
  strj_split_boundaries(x) |>
    purrr::map(\(elem) {
      len <- length(elem)
      if (len < 2) {
        return(NA_character_)
      } else {
        sapply(seq_len(len - 1), \(i) {
          s1 <- paste0(elem[1:i], collapse = "")
          s2 <- paste0(elem[(i + 1):len], collapse = "")
          paste(s1, s2, sep = "\t")
        })
      }
    }) |>
    unlist()
}

sudachi <- sudachir::rebuild_tokenizer(mode = "C")

form <- \(x) {
  unlist(sudachir::form(x, type = "normalized", pos = FALSE,
                        instance = sudachi))
}

wrd <- \(conn, s1, s2) {
  purrr::map2_dbl(s1, s2, \(el1, el2) {
    el1 <- form(el1)
    el2 <- form(el2)
    tryCatch(
      apportita::calc_wrd(conn, el1, el2),
      error = \(e) {
        NA
      }
    )
  }, .progress = TRUE)
}

conn <- magnitude("models/magnitude/chive-1.2-mc90.magnitude")
dim(conn)

### tanka ----
dat <-
  readxl::read_excel(
    "data/tanka/1001-1500_poems.xlsx",
    col_names = c("id", "poems", "names", "loves", "likes", "keys1", "keys2")
  ) |>
  # dplyr::slice_sample(n = 20) |>
  dplyr::transmute(
    id = id,
    names = stringr::str_remove_all(names, "<U\\+[0-9A-F]+>") |>
      audubon::strj_normalize() |>
      stringr::str_remove_all("[^[:alnum:]]+"),
    body = stringr::str_remove_all(poems, "<U\\+[0-9A-F]+>") |>
      audubon::strj_normalize() |>
      stringr::str_remove_all("[^[:alnum:]]+")
  ) |>
  dplyr::filter(
    !stringi::stri_isempty(body)
  )

dat <- dat |>
  dplyr::reframe(
    id = id,
    phrase = split_kugire(body),
    .by = id
  ) |>
  tidyr::separate_wider_delim(
    phrase, delim = "\t", names = c("s1", "s2")
  )

arrow::write_parquet(dat, "tanka-kugire.parquet")
print("Phrase splitting done") # 区切り処理終わり

dat <- dat |>
  tidyr::drop_na() |>
  dplyr::mutate(
    wrd = wrd(conn, s1, s2)
  )

arrow::write_parquet(dat, "tanka-wrd.parquet")
close(conn)
rm(conn)
# --------------------------------------------------------------------------
clean_url <- \(text) {
  pat <- "(https?|ftp)://([[a-zA-Z0-9]-]+\\.)+[[a-zA-Z0-9]-]+(/[[a-zA-Z0-9]- ./?%&=~]*)?"
  stringr::str_replace_all(text, pattern = pat, replacement = "URL")
}

to_df <- \(jsonlist) {
  purrr::map(jsonlist, \(elem) {
    text <- elem$rendered_body |>
      rvest::read_html() |>
      rvest::html_elements("p") |>
      rvest::html_text() |>
      paste(collapse = "\n")
    tags <- elem$tags |>
      purrr::map_chr(~ .x$name) |>
      paste(collapse = ",")
    tibble::tibble(
      "doc_id" = elem$id,
      "created" = elem$created_at,
      "updated" = elem$updated_at,
      "author" = elem$user$permanent_id,
      "title" = audubon::strj_normalize(elem$title),
      "text" = audubon::strj_normalize(text),
      "tags" = tags,
      "comments" = elem$comments_count,
      "likes" = elem$likes_count,
      "reactions" = elem$reactions_count,
      "stocks" = elem$stocks_count
    )
  }) |>
    purrr::list_rbind() |>
    dplyr::filter(
      cld3::detect_language(text) == "ja"
    ) |>
    dplyr::mutate(
      created = lubridate::as_date(created),
      updated = lubridate::as_date(updated)
    )
}

PAGES <- ceiling(4642 / 100)

df <- seq_len(PAGES) |>
  purrr::map(\(i) {
    li <- qiitr::qiita_get_items(
      tag_id = "r",
      per_page = 100, # range: 1-100
      page_offset = (i - 1), # 1 + page_offset < 100
      page_limit = 1
    )
    if (i %% 16 == 0) {
      rlang::inform(sprintf("Sleeping 16 minutes; fetched %d pages so far.", i))
      Sys.sleep(60 * 16)
    }
    return(to_df(li))
  }, .progress = TRUE) |>
  purrr::list_rbind()

arrow::write_parquet(df, here::here("data/qiita.parquet"))
# --------------------------------------------------------------------------
tbl <- readxl::read_xlsx("Downloads/koe.xlsx") |>
  tibble::rowid_to_column(var = "doc_id") |>
  dplyr::mutate(across(where(is.character), ~ dplyr::na_if(., "NA"))) |>
  tidyr::drop_na()

tbl |>
  dplyr::select(Region, Sex, Age, Satis) |>
  DataExplorer::plot_bar()

tbl <- tbl |>
  dplyr::mutate(
    doc_id = factor(doc_id),
    Region = factor(Region),
    Sex = factor(Sex),
    Age = stringi::stri_trans_nfkc(Age),
    Age = dplyr::case_match(
      Age,
      c("10代", "20代") ~ "20",
      "30代" ~ "30",
      "40代" ~ "40",
      "50代" ~ "50",
      c("60代", "70代") ~ "60"
    ),
    Age = factor(Age),
    Satis = stringr::str_detect(Satis, "満足"),
    Opinion = audubon::strj_normalize(Opinion) |>
      stringr::str_replace_all("[[:number:]]+", "N")
  )

summary(tbl)

df <- tbl |>
  dplyr::select(!Region) |>
  gibasa::tokenize(Opinion) |>
  gibasa::prettify(col_select = c("POS1", "POS2", "Original")) |>
  dplyr::mutate(token = dplyr::if_else(is.na(Original), token, Original)) |>
  gibasa::mute_tokens(
    (!POS1 %in% c("名詞", "形容詞")) |
      (token %in% c(stopwords::stopwords("ja", "marimo"), "沖縄", "観光", "旅行"))
  ) |>
  gibasa::pack()

corp <- df |>
  dplyr::right_join(
    tbl |>
      dplyr::mutate(Attr = paste(Age, Sex, sep = "_"), .keep = "unused"),
    by = "doc_id"
  ) |>
  quanteda::corpus()

require(ca)

cam <- corp |>
  quanteda::tokens(what = "fastestword") |>
  quanteda::dfm() |>
  quanteda::dfm_group(groups = quanteda::docvars(corp, "Attr")) |>
  quanteda::dfm_trim(min_termfreq = 10, max_termfreq = 50) |>
  quanteda.textmodels::textmodel_ca()

plot(cam)

require(ggplot2)

dplyr::bind_rows(
  as.data.frame(cam$rowcoord),
  as.data.frame(cam$colcoord)
) |>
  tibble::rownames_to_column() |>
  dplyr::mutate(shape = stringr::str_detect(rowname, "_")) |>
  ggplot(aes(Dim1, Dim2, shape = shape, colour = shape)) +
  geom_vline(xintercept = 0, colour = "grey", linetype = "dashed") +
  geom_hline(yintercept = 0, colour = "grey", linetype = "dashed") +
  geom_point() +
  ggrepel::geom_text_repel(aes(label = rowname), max.overlaps = 20) +
  theme_light() +
  theme(legend.position = "none")

df <- tbl |>
  dplyr::select(!Region) |>
  gibasa::tokenize(Opinion) |>
  gibasa::prettify(col_select = c("POS1", "POS2", "Original")) |>
  dplyr::mutate(token = dplyr::if_else(is.na(Original), token, Original)) |>
  gibasa::mute_tokens(
    (!POS1 %in% c("名詞", "形容詞")) |
      (token %in% c(stopwords::stopwords("ja", "marimo"), "沖縄", "観光", "旅行"))
  ) |>
  dtplyr::lazy_dt() |>
  dplyr::group_by(doc_id) |>
  dplyr::mutate(term1 = token, term2 = dplyr::lead(term1)) |>
  dplyr::add_count(term1, term2, name = "cooc") |>
  dplyr::ungroup() |>
  dplyr::select(doc_id, token, term1, term2, cooc)

biterms <- df |>
  dplyr::select(doc_id, term1, term2, cooc) |>
  tidyr::drop_na() |>
  dplyr::as_tibble()

model <- df |>
  dplyr::select(doc_id, token) |>
  dplyr::as_tibble() |>
  BTM::BTM(k = 5, background = TRUE, biterms = biterms)

plot(model, top_n = 20)
# --------------------------------------------------------------------------
# R Benchmark 2.5 (06/2008) [Simon Urbanek]
# version 2.5: scaled to get roughly 1s per test, R 2.7.0 @ 2.6GHz Mac Pro
# R Benchmark 2.4 (06/2008) [Simon Urbanek]
# version 2.4 adapted to more recent Matrix package
# R Benchmark 2.3 (21 April 2004)
# Warning: changes are not carefully checked yet!
# version 2.3 adapted to R 1.9.0
# Many thanks to Douglas Bates ([email protected]) for improvements!
# version 2.2 adapted to R 1.8.0
# version 2.1 adapted to R 1.7.0
# version 2, scaled to get 1 +/- 0.1 sec with R 1.6.2
# using the standard ATLAS library (Rblas.dll)
# on a Pentium IV 1.6 Ghz with 1 Gb Ram on Win XP pro
# revised and optimized for R v. 1.5.x, 8 June 2002
# Requires additional libraries: Matrix, SuppDists
# Author : Philippe Grosjean
# eMail  : [email protected]
# Web    : http://www.sciviews.org
# License: GPL 2 or above at your convenience (see: http://www.gnu.org)
#
# Several tests are adapted from the Splus Benchmark Test V. 2
# by Stephan Steinhaus ([email protected])
# Reference for Escoufier's equivalent vectors (test III.5):
# Escoufier Y., 1970. Echantillonnage dans une population de variables
# aleatoires réelles. Publ. Inst. Statis. Univ. Paris 19 Fasc 4, 1-47.
#
# type source("c:/<dir>/R2.R") to start the test

runs <- 3 # Number of times the tests are executed
times <- rep(0, 15); dim(times) <- c(5, 3)
require(Matrix) # Optimized matrix operations
#require(SuppDists) # Optimized random number generators
#Runif <- rMWC1019 # The fast uniform number generator
Runif <- runif
# If you don't have SuppDists, you can use: Runif <- runif
#a <- rMWC1019(10, new.start=TRUE, seed=492166) # Init. the generator
#Rnorm <- rziggurat # The fast normal number generator
# If you don't have SuppDists, you can use: Rnorm <- rnorm
#b <- rziggurat(10, new.start=TRUE) # Init. the generator
Rnorm <- rnorm
#remove("a", "b")
options(object.size = 100000000)

cat("\n\n R Benchmark 2.5\n")
cat(" ===============\n")
cat(c("Number of times each test is run__________________________: ", runs))
cat("\n\n")

cat(" I. Matrix calculation\n")
cat(" ---------------------\n")
if (R.Version()$os == "Win32" || R.Version()$os == "mingw32") flush.console()

# (1)
cumulate <- 0; a <- 0; b <- 0
for (i in 1:runs) {
  invisible(gc())
  timing <- system.time({
    a <- matrix(Rnorm(2500*2500)/10, ncol=2500, nrow=2500);
    b <- t(a);
    dim(b) <- c(1250, 5000);
    a <- t(b)
  })[3]
  cumulate <- cumulate + timing
}
timing <- cumulate/runs
times[1, 1] <- timing
cat(c("Creation, transp., deformation of a 2500x2500 matrix (sec): ", timing, "\n"))
remove("a", "b")
if (R.Version()$os == "Win32" || R.Version()$os == "mingw32") flush.console()

# (2)
cumulate <- 0; b <- 0
for (i in 1:runs) {
  a <- abs(matrix(Rnorm(2500*2500)/2, ncol=2500, nrow=2500));
  invisible(gc())
  timing <- system.time({
    b <- a^1000
  })[3]
  cumulate <- cumulate + timing
}
timing <- cumulate/runs
times[2, 1] <- timing
cat(c("2500x2500 normal distributed random matrix ^1000____ (sec): ", timing, "\n"))
remove("a", "b")
if (R.Version()$os == "Win32" || R.Version()$os == "mingw32") flush.console()

# (3)
cumulate <- 0; b <- 0
for (i in 1:runs) {
  a <- Rnorm(7000000)
  invisible(gc())
  timing <- system.time({
    b <- sort(a, method="quick") # Sort is modified in v. 1.5.x
    # And there is now a quick method that better competes with other packages!!!
  })[3]
  cumulate <- cumulate + timing
}
timing <- cumulate/runs
times[3, 1] <- timing
cat(c("Sorting of 7,000,000 random values__________________ (sec): ", timing, "\n"))
remove("a", "b")
if (R.Version()$os == "Win32" || R.Version()$os == "mingw32") flush.console()

# (4)
cumulate <- 0; b <- 0
for (i in 1:runs) {
  a <- Rnorm(2800*2800); dim(a) <- c(2800, 2800)
  invisible(gc())
  timing <- system.time({
    b <- crossprod(a) # equivalent to: b <- t(a) %*% a
  })[3]
  cumulate <- cumulate + timing
}
timing <- cumulate/runs
times[4, 1] <- timing
cat(c("2800x2800 cross-product matrix (b = a' * a)_________ (sec): ", timing, "\n"))
remove("a", "b")
if (R.Version()$os == "Win32" || R.Version()$os == "mingw32") flush.console()

# (5)
cumulate <- 0; c <- 0; qra <- 0
for (i in 1:runs) {
  a <- new("dgeMatrix", x = Rnorm(2000*2000), Dim = as.integer(c(2000, 2000)))
  b <- as.double(1:2000)
  invisible(gc())
  timing <- system.time({
    c <- solve(crossprod(a), crossprod(a, b))
  })[3]
  cumulate <- cumulate + timing
  # This is the old method
  #a <- Rnorm(600*600); dim(a) <- c(600,600)
  #b <- 1:600
  #invisible(gc())
  #timing <- system.time({
  #  qra <- qr(a, tol = 1e-7);
  #  c <- qr.coef(qra, b)
  #  #Rem: a little faster than c <- lsfit(a, b, inter=F)$coefficients
  #})[3]
  #cumulate <- cumulate + timing
}
timing <- cumulate/runs
times[5, 1] <- timing
cat(c("Linear regr. over a 2000x2000 matrix (c = a \\ b')___ (sec): ", timing, "\n"))
remove("a", "b", "c", "qra")
if (R.Version()$os == "Win32" || R.Version()$os == "mingw32") flush.console()

times[ , 1] <- sort(times[ , 1])
cat(" --------------------------------------------\n")
cat(c(" Trimmed geom. mean (2 extremes eliminated): ", exp(mean(log(times[2:4, 1]))), "\n\n"))

cat(" II. Matrix functions\n")
cat(" --------------------\n")
if (R.Version()$os == "Win32" || R.Version()$os == "mingw32") flush.console()

# (1)
cumulate <- 0; b <- 0
for (i in 1:runs) {
  a <- Rnorm(2400000)
  invisible(gc())
  timing <- system.time({
    b <- fft(a)
  })[3]
  cumulate <- cumulate + timing
}
timing <- cumulate/runs
times[1, 2] <- timing
cat(c("FFT over 2,400,000 random values____________________ (sec): ", timing, "\n"))
remove("a", "b")
if (R.Version()$os == "Win32" || R.Version()$os == "mingw32") flush.console()

# (2)
cumulate <- 0; b <- 0
for (i in 1:runs) {
  a <- array(Rnorm(600*600), dim = c(600, 600))
  # Only needed if using eigen.Matrix(): Matrix.class(a)
  invisible(gc())
  timing <- system.time({
    b <- eigen(a, symmetric=FALSE, only.values=TRUE)$Value
    # Rem: on my machine, it is faster than:
    # b <- La.eigen(a, symmetric=F, only.values=T, method="dsyevr")$Value
    # b <- La.eigen(a, symmetric=F, only.values=T, method="dsyev")$Value
    # b <- eigen.Matrix(a, vectors = F)$Value
  })[3]
  cumulate <- cumulate + timing
}
timing <- cumulate/runs
times[2, 2] <- timing
cat(c("Eigenvalues of a 600x600 random matrix______________ (sec): ", timing, "\n"))
remove("a", "b")
if (R.Version()$os == "Win32" || R.Version()$os == "mingw32") flush.console()

# (3)
cumulate <- 0; b <- 0
for (i in 1:runs) {
  a <- Rnorm(2500*2500); dim(a) <- c(2500, 2500)
  #Matrix.class(a)
  invisible(gc())
  timing <- system.time({
    #b <- determinant(a, logarithm=F)
    # Rem: the following is slower on my computer!
    # b <- det.default(a)
    b <- det(a)
  })[3]
  cumulate <- cumulate + timing
}
timing <- cumulate/runs
times[3, 2] <- timing
cat(c("Determinant of a 2500x2500 random matrix____________ (sec): ", timing, "\n"))
remove("a", "b")
if (R.Version()$os == "Win32" || R.Version()$os == "mingw32") flush.console()

# (4)
cumulate <- 0; b <- 0
for (i in 1:runs) {
  a <- crossprod(new("dgeMatrix", x = Rnorm(3000*3000),
                     Dim = as.integer(c(3000, 3000))))
  invisible(gc())
  #a <- Rnorm(900*900); dim(a) <- c(900, 900)
  #a <- crossprod(a, a)
  timing <- system.time({
    b <- chol(a)
  })[3]
  cumulate <- cumulate + timing
}
timing <- cumulate/runs
times[4, 2] <- timing
cat(c("Cholesky decomposition of a 3000x3000 matrix________ (sec): ", timing, "\n"))
remove("a", "b")
if (R.Version()$os == "Win32" || R.Version()$os == "mingw32") flush.console()

# (5)
cumulate <- 0; b <- 0
for (i in 1:runs) {
  a <- new("dgeMatrix", x = Rnorm(1600*1600), Dim = as.integer(c(1600, 1600)))
  invisible(gc())
  #a <- Rnorm(400*400); dim(a) <- c(400, 400)
  timing <- system.time({
    # b <- qr.solve(a)
    # Rem: a little faster than
    b <- solve(a)
  })[3]
  cumulate <- cumulate + timing
}
timing <- cumulate/runs
times[5, 2] <- timing
cat(c("Inverse of a 1600x1600 random matrix________________ (sec): ", timing, "\n"))
remove("a", "b")
if (R.Version()$os == "Win32" || R.Version()$os == "mingw32") flush.console()

times[ , 2] <- sort(times[ , 2])
cat(" --------------------------------------------\n")
cat(c(" Trimmed geom. mean (2 extremes eliminated): ", exp(mean(log(times[2:4, 2]))), "\n\n"))

cat(" III. Programmation\n")
cat(" ------------------\n")
if (R.Version()$os == "Win32" || R.Version()$os == "mingw32") flush.console()

# (1)
cumulate <- 0; a <- 0; b <- 0; phi <- 1.6180339887498949
for (i in 1:runs) {
  a <- floor(Runif(3500000)*1000)
  invisible(gc())
  timing <- system.time({
    b <- (phi^a - (-phi)^(-a))/sqrt(5)
  })[3]
  cumulate <- cumulate + timing
}
timing <- cumulate/runs
times[1, 3] <- timing
cat(c("3,500,000 Fibonacci numbers calculation (vector calc)(sec): ", timing, "\n"))
remove("a", "b", "phi")
if (R.Version()$os == "Win32" || R.Version()$os == "mingw32") flush.console()

# (2)
cumulate <- 0; a <- 3000; b <- 0
for (i in 1:runs) {
  invisible(gc())
  timing <- system.time({
    b <- rep(1:a, a); dim(b) <- c(a, a);
    b <- 1 / (t(b) + 0:(a-1))
    # Rem: this is twice as fast as the following code proposed by R programmers
    # a <- 1:a; b <- 1 / outer(a - 1, a, "+")
  })[3]
  cumulate <- cumulate + timing
}
timing <- cumulate/runs
times[2, 3] <- timing
cat(c("Creation of a 3000x3000 Hilbert matrix (matrix calc) (sec): ", timing, "\n"))
remove("a", "b")
if (R.Version()$os == "Win32" || R.Version()$os == "mingw32") flush.console()

# (3)
cumulate <- 0; c <- 0
gcd2 <- function(x, y) {if (sum(y > 1.0E-4) == 0) x else {y[y == 0] <- x[y == 0]; Recall(y, x %% y)}}
for (i in 1:runs) {
  a <- ceiling(Runif(400000)*1000)
  b <- ceiling(Runif(400000)*1000)
  invisible(gc())
  timing <- system.time({
    c <- gcd2(a, b) # gcd2 is a recursive function
  })[3]
  cumulate <- cumulate + timing
}
timing <- cumulate/runs
times[3, 3] <- timing
cat(c("Greatest common divisors of 400,000 pairs (recursion) (sec): ", timing, "\n"))
remove("a", "b", "c", "gcd2")
if (R.Version()$os == "Win32" || R.Version()$os == "mingw32") flush.console()

# (4)
cumulate <- 0; b <- 0
for (i in 1:runs) {
  b <- rep(0, 500*500); dim(b) <- c(500, 500)
  invisible(gc())
  timing <- system.time({
    # Rem: there are faster ways to do this
    # but here we want to time loops (500x500 'for' loops)!
    for (j in 1:500) {
      for (k in 1:500) {
        b[k, j] <- abs(j - k) + 1
      }
    }
  })[3]
  cumulate <- cumulate + timing
}
timing <- cumulate/runs
times[4, 3] <- timing
cat(c("Creation of a 500x500 Toeplitz matrix (loops)_______ (sec): ", timing, "\n"))
remove("b", "j", "k")
if (R.Version()$os == "Win32" || R.Version()$os == "mingw32") flush.console()

# (5)
cumulate <- 0; p <- 0; vt <- 0; vr <- 0; vrt <- 0; rvt <- 0; RV <- 0; j <- 0; k <- 0;
x2 <- 0; R <- 0; Rxx <- 0; Ryy <- 0; Rxy <- 0; Ryx <- 0; Rvmax <- 0
# Calculate the trace of a matrix (sum of its diagonal elements)
Trace <- function(y) {sum(c(y)[1 + 0:(min(dim(y)) - 1) * (dim(y)[1] + 1)], na.rm=FALSE)}
for (i in 1:runs) {
  x <- abs(Rnorm(45*45)); dim(x) <- c(45, 45)
  invisible(gc())
  timing <- system.time({
    # Calculation of Escoufier's equivalent vectors
    p <- ncol(x)
    vt <- 1:p # Variables to test
    vr <- NULL # Result: ordered variables
    RV <- 1:p # Result: correlations
    vrt <- NULL
    for (j in 1:p) { # loop on the variable number
      Rvmax <- 0
      for (k in 1:(p-j+1)) { # loop on the variables
        x2 <- cbind(x, x[, vr], x[, vt[k]])
        R <- cor(x2) # Correlations table
        Ryy <- R[1:p, 1:p]
        Rxx <- R[(p+1):(p+j), (p+1):(p+j)]
        Rxy <- R[(p+1):(p+j), 1:p]
        Ryx <- t(Rxy)
        rvt <- Trace(Ryx %*% Rxy) / sqrt(Trace(Ryy %*% Ryy) * Trace(Rxx %*% Rxx)) # RV calculation
        if (rvt > Rvmax) {
          Rvmax <- rvt # test of RV
          vrt <- vt[k] # temporarily held variable
        }
      }
      vr[j] <- vrt # Result: variable
      RV[j] <- Rvmax # Result: correlation
      vt <- vt[vt != vr[j]] # reidentify variables to test
    }
  })[3]
  cumulate <- cumulate + timing
}
timing <- cumulate/runs
times[5, 3] <- timing
cat(c("Escoufier's method on a 45x45 matrix (mixed)________ (sec): ", timing, "\n"))
remove("x", "p", "vt", "vr", "vrt", "rvt", "RV", "j", "k")
remove("x2", "R", "Rxx", "Ryy", "Rxy", "Ryx", "Rvmax", "Trace")
if (R.Version()$os == "Win32" || R.Version()$os == "mingw32") flush.console()

times[ , 3] <- sort(times[ , 3])
cat(" --------------------------------------------\n")
cat(c(" Trimmed geom. mean (2 extremes eliminated): ", exp(mean(log(times[2:4, 3]))), "\n\n\n"))

cat(c("Total time for all 15 tests_________________________ (sec): ", sum(times), "\n"))
cat(c("Overall mean (sum of I, II and III trimmed means/3)_ (sec): ", exp(mean(log(times[2:4, ]))), "\n"))
remove("cumulate", "timing", "times", "runs", "i")
cat(" --- End of test ---\n\n")
# --------------------------------------------------------------------------
suppressPackageStartupMessages({
  require(tidymodels)
})

ames_split <- initial_split(modeldata::ames, strata = Sale_Price)
ames_train <- training(ames_split)
ames_test <- testing(ames_split)

ames_rec <- recipe(Sale_Price ~ ., data = ames_train) |>
  step_mutate(Sale_Price = log1p(Sale_Price)) |>
  step_nzv(all_predictors()) |>
  step_select(
    all_outcomes(),
    all_numeric_predictors(),
    !starts_with("Year"), !ends_with("tude")
  ) |>
  step_normalize(all_numeric_predictors())

# injection
# https://parsnip.tidymodels.org/reference/model_spec.html
num_trees <- 1200L
ames_spec <- rand_forest(mtry = tune(), trees = !!num_trees) |>
  set_mode("regression") |>
  set_engine("ranger") |>
  set_args(num.threads = !!parallelly::availableCores(omit = 1), seed = 1234)

str(ames_spec)

base_wf <-
  workflow() |>
  add_recipe(ames_rec) |>
  add_model(
    rand_forest(trees = !!num_trees) |>
      set_engine("ranger") |>
      set_mode("regression") |>
      set_args(num.threads = !!parallelly::availableCores(omit = 1)),
    formula = expm1(Sale_Price) ~ .
  ) |>
  fit(ames_train)

augment(
  extract_fit_parsnip(base_wf),
  new_data = prep(ames_rec) |> bake(new_data = ames_test)
) |>
  dplyr::mutate(Sale_Price = expm1(Sale_Price)) |> # Sale_Price was log1p-transformed, so invert it
  metrics(truth = Sale_Price, estimate = .pred)

ames_grid <- workflow() |>
  add_recipe(ames_rec) |>
  add_model(ames_spec, formula = Sale_Price ~ .) |>
  tune_grid(
    resamples = vfold_cv(ames_train, v = 3),
    grid = grid_latin_hypercube(
      extract_parameter_set_dials(ames_spec) |>
        finalize(prep(ames_rec) |> bake(new_data = NULL)),
      size = 10
    ),
    control = control_grid(verbose = FALSE),
    metrics = metric_set(rmse)
  )

select_best(ames_grid)
show_best(ames_grid)

best_wf <-
  workflow() |>
  add_recipe(ames_rec) |>
  add_model(ames_spec, formula = expm1(Sale_Price) ~ .) |>
  finalize_workflow(select_best(ames_grid))

best_fit <- last_fit(best_wf, ames_split)
collect_predictions(best_fit)

collect_predictions(best_fit) |>
  dplyr::mutate(Sale_Price = expm1(Sale_Price)) |> # Sale_Price was log1p-transformed, so invert it
  metrics(truth = Sale_Price, estimate = .pred)

# model formula
# https://github.com/tidymodels/parsnip/blob/8f13c1c41ce603261f25af64694c253f618fa999/R/model_formula.R
linear_reg(penalty = 1.0) |>
  set_engine("glmnet") |>
  set_mode("regression")

### baritsu
suppressPackageStartupMessages({
  require(tidymodels)
  require(baritsu)
})

#### classification
data("penguins", package = "modeldata")
data_split <- initial_split(penguins, strata = species, prop = .7)
penguins_train <- training(data_split)

rec <-
  recipe(
    species ~ .,
    data = penguins_train
  ) |>
  step_impute_mode(all_nominal()) |>
  step_impute_median(all_numeric_predictors()) |>
  step_zv(all_numeric_predictors()) |>
  step_scale(all_numeric_predictors()) |>
  step_dummy(all_nominal_predictors())

spec <- svm_linear(
  margin = tune()
) |>
  set_engine("baritsu", penalty = tune()) |>
  set_mode("classification")

wf <- workflow() |>
  add_recipe(rec) |>
  add_model(spec)

wf_fit <- wf |>
  tune_grid(
    resamples = vfold_cv(penguins_train, v = 5, strata = species),
    grid = grid_max_entropy(
      svm_margin(),
      penalty(range = c(-1, 0)),
      size = 5
    ),
    metrics = metric_set(f_meas),
    control = control_grid(verbose = FALSE)
  )

show_best(wf_fit)

best_wf_fit <- wf |>
  finalize_workflow(select_best(wf_fit)) |>
  last_fit(data_split, metrics = metric_set(f_meas))

collect_metrics(best_wf_fit)

#### regression
pkgload::load_all(export_all = FALSE)

ames <- modeldata::ames |>
  dplyr::slice_sample(n = 100) |>
  dplyr::select(Sale_Price, Lot_Area, Total_Bsmt_SF, First_Flr_SF)
data_split <- rsample::initial_split(ames)
ames_train <- rsample::training(data_split)
ames_test <- rsample::testing(data_split)

rec <-
  recipes::recipe(
    Sale_Price ~ .,
    data = ames_train
  ) |>
  recipes::step_scale(recipes::all_numeric_predictors())

spec <- lars(
  penalty_L1 = tune::tune(),
  penalty_L2 = tune::tune()
) |>
  parsnip::set_engine("baritsu") |>
  parsnip::set_mode("regression")

wf <-
  workflows::workflow() |>
  workflows::add_recipe(rec) |>
  workflows::add_model(spec)

wf_fit <- wf |>
  tune::tune_grid(
    resamples = rsample::vfold_cv(ames_train, v = 5),
    grid = dials::grid_max_entropy(
      dials::penalty_L1(range = c(-10, 0)),
      dials::penalty_L2(range = c(-10, 0)),
      size = 10
    ),
    metrics = yardstick::metric_set(yardstick::rmse),
    control = tune::control_grid(verbose = TRUE)
  )

penguins <- modeldata::penguins
data_split <- rsample::initial_split(penguins, strata = "species")
penguins_train <- rsample::training(data_split)
penguins_test <- rsample::testing(data_split)

rec <-
  recipes::recipe(
    species ~ .,
    data = penguins_train
  ) |>
  # recipes::step_select(recipes::all_outcomes(), recipes::all_numeric_predictors()) |>
  recipes::step_impute_mode(recipes::all_nominal()) |>
  recipes::step_impute_median(recipes::all_numeric_predictors()) |>
  recipes::step_scale(recipes::all_numeric_predictors())

my_dat <- recipes::prep(rec) |> recipes::bake(new_data = NULL)
my_dat2 <- recipes::prep(rec) |> recipes::bake(new_data = penguins_test)

wf_fit <- wf |>
  tune::tune_grid(
    resamples = rsample::vfold_cv(penguins_train, v = 3),
    grid = dials::grid_max_entropy(
      dials::penalty(),
      size = 5
    ),
    metrics = yardstick::metric_set(yardstick::f_meas),
    control = tune::control_grid(verbose = TRUE)
  )
# --------------------------------------------------------------------------
require(audio.whisper)

model <- whisper("models/ggml-base.bin")
trans <- predict(model, "output.wav", language = "ja")
arrow::write_parquet(trans$tokens, "output.wav.parquet")
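# Editor's note (an assumption, not in the original): the ggml model file must
# already exist locally; audio.whisper can download one, e.g.:
# audio.whisper::whisper_download_model("base")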