Last active
December 29, 2022 08:27
-
-
Save tetlabo/a5b2ded77b41cbce7ba51f349865c3c3 to your computer and use it in GitHub Desktop.
歌ネットから歌詞をスクレイピングする
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
## 参考: 【GoogleColaboratory】歌ネット(Uta-Net)から歌詞をスクレイピングする https://zenn.dev/robes/articles/00e86185677fb5 | |
library(tidyverse) | |
library(httr) | |
library(rvest) | |
# Script configuration -------------------------------------------------------

# Site root; the scraping loop prepends it to the href of every song link.
base_url <- "https://www.uta-net.com"

# Single artist page, kept from an earlier version of the script:
# urls <- "https://www.uta-net.com/artist/6636/"

# For now, AKB48, Nogizaka46 and Sakurazaka46 are selected (the original
# author jokes that he cannot tell them apart).
# NOTE(review): artist 6636 is listed twice with the suffixes /0/1/ and /0/2/,
# presumably two pages of the same song list; confirm against the site.
urls <- c(
  "https://www.uta-net.com/artist/6636/0/1/",
  "https://www.uta-net.com/artist/6636/0/2/",
  "https://www.uta-net.com/artist/12550/",
  "https://www.uta-net.com/artist/29512/"
)

# Spoofed browser User-Agent that is sent with every request.
pseudo_user_agent <- paste0(
  "Mozilla/5.0 (Windows NT 10.0; Win64; x64) ",
  "AppleWebKit/537.36 (KHTML, like Gecko) ",
  "Chrome/108.0.0.0 Safari/537.36"
)

# Empty result table with one row per scraped song once the loop has run.
# stringsAsFactors is spelled out so the text columns stay character vectors
# on R < 4.0 as well, where data.frame() would otherwise create factors.
df <- data.frame(
  id = numeric(),
  artist = character(),
  title = character(),
  lyric = character(),
  url = character(),
  stringsAsFactors = FALSE
)
# Scrape all songs and save them ----------------------------------------------
#
# For every artist page in `urls`:
#   1. download the page and collect the href of the first <a> inside each
#      "td.sp-w-100" cell (these are treated as links to song pages);
#   2. download each song page and read the title (first <h2>), the artist
#      name (first <h3>) and the lyric text (div#kashi_area);
#   3. store one row per song, with `id` set to the index of the artist page
#      in `urls`.
# Finally all rows are appended to `df` and written to "pops_lyrics.csv".
#
# Pages that answer with an HTTP error status are skipped with a warning
# instead of being parsed as if they were real content.

# Rows are collected in a list and bound once after the loop; calling
# bind_rows() on every iteration would re-copy all earlier rows each time.
scraped_rows <- list()

# seq_along() is safe for an empty `urls`; 1:length(urls) would yield c(1, 0).
for (i in seq_along(urls)) {
  res <- GET(urls[i], user_agent(pseudo_user_agent))
  if (http_error(res)) {
    warning(
      "Skipping artist page ", urls[i], " (HTTP ", status_code(res), ")",
      call. = FALSE
    )
    next
  }

  html <- content(res)
  hrefs <- html %>%
    html_elements("td.sp-w-100") %>%
    html_element("a") %>%
    html_attr("href")
  # A cell without an <a> (or an <a> without href) gives NA; dropping it
  # avoids requesting the bogus URL "<base_url>NA".
  hrefs <- hrefs[!is.na(hrefs)]

  for (href in hrefs) {
    song_url <- paste0(base_url, href)
    song_res <- GET(song_url, user_agent(pseudo_user_agent))
    # Pause after every song request, successful or not, to go easy on the
    # server.
    Sys.sleep(2)

    if (http_error(song_res)) {
      warning(
        "Skipping song page ", song_url, " (HTTP ", status_code(song_res), ")",
        call. = FALSE
      )
      next
    }

    song_html <- content(song_res)
    song_title <- song_html %>%
      html_element("h2") %>%
      html_text2()
    artist_name <- song_html %>%
      html_element("h3") %>%
      html_text2()
    # Runs of line breaks inside the lyric are collapsed to a single space so
    # that each lyric is stored as one line.
    lyric <- song_html %>%
      html_element("div#kashi_area") %>%
      html_text2() %>%
      str_replace_all("\\n+", " ")

    scraped_rows[[length(scraped_rows) + 1L]] <- data.frame(
      id = i,
      artist = artist_name,
      title = song_title,
      lyric = lyric,
      url = song_url,
      stringsAsFactors = FALSE
    )

    message("now processing '", artist_name, "', '", song_title, "'")
  }
}

# bind_rows() accepts a list of data frames; an empty list leaves `df` as is.
df <- bind_rows(df, scraped_rows)

write_csv(df, "pops_lyrics.csv")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment