Created December 27, 2019 18:13
Naver Hanja Dictionary classical-idiom (고사성어) crawler
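The gist is a small Cargo binary. The manifest below pulls in reqwest for HTTP, unhtml/unhtml_derive for CSS-selector-based HTML extraction, rayon for parallel crawling, and serde/csv for writing the output. Run it with `cargo run --release` (assuming the source file is placed at src/main.rs); it walks the idiom category on hanja.dict.naver.com by onset, first syllable, and page, and writes every reading/meaning pair to idiom.csv.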
[package]
name = "hanja-idiom"
version = "0.1.0"
authors = ["kiwiyou <[email protected]>"]
edition = "2018"

[dependencies]
reqwest = "0.9"
unhtml = "0.7"
unhtml_derive = "0.7"
serde = "1.0"
serde_derive = "1.0"
csv = "1.1"
rayon = "1.3"
use rayon::prelude::*;
use reqwest::get;
use serde_derive::*;
use unhtml::{self, FromHtml};
use unhtml_derive::*;

/// Onset (initial consonant) category tab on the idiom index page;
/// each `li` is one onset category.
#[derive(FromHtml)]
#[html(selector = ".tab_submenu")]
struct OnsetTab {
    #[html(selector = "li")]
    onsets: Vec<Onset>,
}

/// Placeholder for a single onset entry; only the number of entries is used,
/// since the tabs are requested by index.
#[derive(FromHtml)]
struct Onset;

/// List of first syllables (initials) available under one onset category.
#[derive(FromHtml)]
#[html(selector = ".sub_word")]
struct InitialTab {
    #[html(selector = "span", attr = "inner")]
    initials: Vec<String>,
}

/// One result page of idioms: readings, meanings, and pagination info.
#[derive(Debug, FromHtml)]
#[html(selector = "#content")]
struct WordList {
    #[html(selector = "dd:not([class])", attr = "inner")]
    readings: Vec<String>,
    #[html(selector = "dd.meaning", attr = "inner")]
    meanings: Vec<String>,
    #[html(selector = ".paginate > *:not([class])", attr = "inner")]
    pages: Vec<usize>,
    #[html(selector = ".paginate > .next", attr = "inner")]
    next: Option<String>,
}
/// Fetches the idiom index page and parses the onset category tab.
fn parse_onset_tab() -> OnsetTab {
    let html = get("https://hanja.dict.naver.com/category/idiom")
        .unwrap()
        .text()
        .unwrap();
    OnsetTab::from_html(&html).unwrap()
}

/// Fetches the page for the `index`-th onset and parses its list of initials.
fn parse_initial_tab(index: usize) -> InitialTab {
    let query = format!("https://hanja.dict.naver.com/category/idiom?idx={}", index);
    let html = get(&query).unwrap().text().unwrap();
    InitialTab::from_html(&html).unwrap()
}

/// Fetches one result page of idioms starting with `initial` and parses the word list.
fn parse_word_list<S: AsRef<str>>(initial: S, page: usize) -> WordList {
    let query = format!(
        "https://hanja.dict.naver.com/category/idiom?q={}&pageNo={}",
        initial.as_ref(),
        page
    );
    let html = get(&query).unwrap().text().unwrap();
    WordList::from_html(&html).unwrap()
}
/// A single idiom entry: the idiom's reading and its meaning.
#[derive(Debug, Clone, Serialize)]
#[serde(rename_all = "PascalCase")]
struct Idiom {
    reading: String,
    meaning: String,
}

/// Crawls every idiom whose reading starts with `initial`, across all result pages.
fn crawl_for_initial<S: AsRef<str>>(initial: S) -> Vec<Idiom> {
    let initial_ref = initial.as_ref();
    let sample_list = sample_for_initial(initial_ref);
    sample_list
        .par_iter()
        .flat_map(|sample| {
            // Idioms found on the sampled page itself.
            let first = sample
                .readings
                .par_iter()
                .zip(sample.meanings.par_iter())
                .map(|(reading, meaning)| Idiom {
                    reading: reading.clone(),
                    meaning: meaning.clone(),
                })
                .collect::<Vec<_>>();
            // Idioms on the remaining pages listed in the sample's paginator;
            // the first listed page is the sampled page, so skip it.
            let from_second = sample
                .pages
                .par_iter()
                .skip(1)
                .map(|page| parse_word_list(initial_ref, *page))
                .flat_map(|list| {
                    list.readings
                        .par_iter()
                        .zip(list.meanings.par_iter())
                        .map(|(reading, meaning)| Idiom {
                            reading: reading.clone(),
                            meaning: meaning.clone(),
                        })
                        .collect::<Vec<_>>()
                });
            first
                .into_par_iter()
                .chain(from_second)
                .collect::<Vec<_>>()
        })
        .collect()
}
/// Fetches pages 1, 11, 21, ... for `initial` until a page's paginator has no
/// "next" link; each sampled page carries the page numbers of its own block.
fn sample_for_initial<S: AsRef<str>>(initial: S) -> Vec<WordList> {
    let initial_ref = initial.as_ref();
    let mut sample_page = 1;
    let mut sample_list = Vec::new();
    loop {
        let sample = parse_word_list(initial_ref, sample_page);
        let is_last = sample.next.is_none();
        sample_list.push(sample);
        if is_last {
            break;
        }
        // The paginator shows 10 pages per block, so jump to the next block.
        sample_page += 10;
    }
    sample_list
}
fn main() {
    println!("[+] crawling onset list...");
    let onset_tab = parse_onset_tab();
    // Fetch the initial-syllable tab for every onset index in parallel.
    let initial_tabs = onset_tab
        .onsets
        .par_iter()
        .enumerate()
        .map(|(i, _)| parse_initial_tab(i));
    println!("[+] crawling idiom list...");
    let mut idioms = initial_tabs
        .flat_map(|tab| {
            tab.initials
                .par_iter()
                .flat_map(|initial| crawl_for_initial(initial))
                .collect::<Vec<_>>()
        })
        .collect::<Vec<_>>();
    // String ordering is total, so compare directly instead of unwrapping partial_cmp.
    idioms.sort_unstable_by(|a, b| a.reading.cmp(&b.reading));
    println!("[+] saving idioms...");
    let mut writer = csv::WriterBuilder::new()
        .delimiter(b',')
        .quote_style(csv::QuoteStyle::NonNumeric)
        .from_path("idiom.csv")
        .unwrap();
    for idiom in idioms.iter() {
        writer.serialize(idiom).unwrap();
    }
    writer.flush().unwrap();
    println!("[+] list saved into idiom.csv!");
}
#[cfg(test)]
mod test {
    use super::*;

    #[test]
    fn onset_tab_should_parse_correct_onset_list() {
        let tab = parse_onset_tab();
        assert_eq!(14, tab.onsets.len());
    }

    #[test]
    fn initial_tab_should_parse_correct_initial_list() {
        let initial_tab = parse_initial_tab(0);
        assert_eq!(55, initial_tab.initials.len());
    }

    #[test]
    fn word_list_should_have_equal_lengths_for_readings_and_meanings() {
        let word_list = parse_word_list("가", 0);
        assert_eq!(word_list.readings.len(), word_list.meanings.len());
    }
}