Created
August 3, 2020 12:28
-
-
Save tamamu/c24e614f04374caf0b84d92b3bf5769a to your computer and use it in GitHub Desktop.
HTMLパーサーを書く
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/**
 * Writing an HTML parser.
 * Declaration: <!DECLARE>
 * Start tag:   <TAGNAME ATTRNAME=ATTRVAR>
 * Text:        TEXT
 * End tag:     </TAGNAME>
 */
use std::fs::File;
use std::io::Read;
use std::io::Write;
use std::path::Path;
/// A node of the parsed document tree.
///
/// Nothing in the visible code constructs this type yet; it is the `Ok`
/// payload of `HTMLParseResult` and the payload of `HTMLParser::preserved`.
#[derive(Debug, Clone, PartialEq, Eq)]
enum DOM {
    /// An element: its tag name, its attributes, and its child nodes.
    Node {
        /// Name of the tag.
        tagname: String,
        /// Attributes as `(key, value)` pairs, kept in a `Vec` so order is preserved.
        props: Vec<(String, String)>,
        /// Nested nodes, in order.
        children: Vec<DOM>,
    },
    /// A run of character data.
    Text(String),
}
/// Lexical tokens produced by `HTMLParser::tokenize`.
#[derive(Debug, Clone, PartialEq, Eq)]
enum HTMLToken {
    /// The `<` character.
    TagOpen,
    /// The `>` character.
    TagClose,
    /// The `!` character.
    Excl,
    /// The `=` character.
    Equal,
    /// The `-` character.
    Minus,
    /// A character reference introduced by `&`; the payload is the captured
    /// reference text.
    Amp(String),
    /// The first word following a `<`.
    TagName(String),
    /// Character data outside of a tag. The tokenizer also emits a single
    /// `Text(" ")` for every run of whitespace it skips.
    Text(String),
    /// A word inside a tag that follows the tag name (an attribute name).
    Key(String),
    /// The contents of a double-quoted string, without the quotes.
    Value(String),
}
#[derive(Debug)] | |
struct HTMLParser { | |
src: Vec<char>, // UTF-8文字の配列 | |
row: usize, | |
col: usize, | |
index: usize, | |
preserved: Option<Box<DOM>>, | |
is_in_tag: bool, | |
has_tagname: bool, | |
} | |
type HTMLParseResult = Result<DOM, String>; | |
impl HTMLParser { | |
fn new(src: String) -> Self { | |
Self { | |
// 文字列をUTF-8文字区切りにして配列にする | |
src: src.chars().collect(), | |
row: 0, | |
col: 0, | |
index: 0, | |
preserved: None, | |
is_in_tag: false, | |
has_tagname: false, | |
} | |
} | |
fn next_char(&mut self) -> char { | |
let cur = self.src[self.index]; | |
self.index += 1; | |
if cur == '\n' || cur == '\r' { | |
self.row += 1; | |
self.col = 0; | |
} | |
cur | |
} | |
fn strip(&mut self) -> bool { | |
let mut has_whitespace = false; | |
loop { | |
let cur = self.src[self.index]; | |
match cur { | |
' ' | '\t' => { | |
self.index += 1; | |
has_whitespace = true; | |
}, | |
'\n' | '\r' => { | |
self.index += 1; | |
self.row += 1; | |
self.col = 0; | |
has_whitespace = true; | |
} | |
_ => break | |
} | |
} | |
has_whitespace | |
} | |
fn capture_until(&mut self, terminator: Vec<char>) -> usize { | |
let mut result = 0usize; | |
while self.index+result < self.src.len()-1 { | |
let cur = self.src[self.index+result]; | |
if let Some(_) = terminator.iter().find(|&&ch| ch == cur) { | |
break; | |
} | |
result += 1; | |
} | |
result | |
} | |
fn tokenize(&mut self) -> Vec<HTMLToken> { | |
let mut result = Vec::new(); | |
while self.index < self.src.len() { | |
if self.strip() { | |
result.push(HTMLToken::Text(" ".to_owned())); | |
} | |
let cur = self.src[self.index]; | |
match cur { | |
'<' => { | |
result.push(HTMLToken::TagOpen); | |
self.is_in_tag = true; | |
self.has_tagname = false; | |
self.index += 1; | |
}, | |
'>' => { | |
result.push(HTMLToken::TagClose); | |
self.is_in_tag = false; | |
self.index += 1; | |
}, | |
'!' => { | |
result.push(HTMLToken::Excl); | |
self.index += 1; | |
}, | |
'=' => { | |
result.push(HTMLToken::Equal); | |
self.index += 1; | |
}, | |
'-' => { | |
result.push(HTMLToken::Minus); | |
self.index += 1; | |
}, | |
'&' => { | |
let len = self.capture_until(vec![';']); | |
let tok = self.src[self.index..self.index+len].iter().collect(); | |
result.push(HTMLToken::Amp( | |
tok | |
)); | |
self.index += len; | |
}, | |
'"' => { | |
self.index += 1; | |
let len = self.capture_until(vec!['"']); | |
let tok = self.src[self.index..self.index+len].iter().collect(); | |
result.push(HTMLToken::Value( | |
tok | |
)); | |
self.index += len + 1; | |
}, | |
ch => { | |
if self.is_in_tag { | |
if self.has_tagname { | |
let len = self.capture_until(vec![' ', '\n', '\r', '>', '=']); | |
let tok = self.src[self.index..self.index+len].iter().collect(); | |
result.push(HTMLToken::Key( | |
tok | |
)); | |
self.index += len; | |
} else { | |
let len = self.capture_until(vec![' ', '\n', '\r', '>']); | |
let tok = self.src[self.index..self.index+len].iter().collect(); | |
result.push(HTMLToken::TagName( | |
tok | |
)); | |
self.index += len; | |
self.has_tagname = true; | |
} | |
} else { | |
let len = self.capture_until(vec!['<']); | |
let tok = self.src[self.index..self.index+len].iter().collect(); | |
result.push(HTMLToken::Text( | |
tok | |
)); | |
self.index += len; | |
} | |
} | |
} | |
} | |
result | |
} | |
// fn parse(&mut self) -> HTMLParseResult { | |
// self.strip(); | |
// let cur = self.next_char(); | |
// match self.src[self.index] { | |
// '<' => { | |
// self.parse_tag() | |
// }, | |
// _ => self.parse_text() | |
// } | |
// } | |
// fn parse_tag(&mut self) -> HTMLParseResult { | |
// let cur = self.next_char(); | |
// match cur { | |
// '!' => { | |
// self.col += 1; | |
// self.index += 1; | |
// let _ = self.parse_declaration(); | |
// self.parse() | |
// }, | |
// _ => { | |
// self.col += 1; | |
// self.index += 1; | |
// let tagname = self.parse_tag_name(); | |
// self.strip(); | |
// let props = self.parse_props(); | |
// self.strip(); | |
// if self.src[self.index] == '>' { | |
// self.col += 1; | |
// self.index += 1; | |
// } | |
// } | |
// } | |
// } | |
// fn parse_text(&mut self) -> HTMLParseResult { | |
// unimplemented!() | |
// } | |
} | |
/// Reads the whole file at `path` into a `String`.
///
/// # Panics
/// Panics with a message naming the path if the file cannot be opened or
/// its contents cannot be read as UTF-8 text.
fn read_string_from_file(path: &std::path::Path) -> String {
    let mut file = File::open(path)
        .unwrap_or_else(|why| panic!("couldn't open {}: {}", path.to_string_lossy(), why));
    let mut contents = String::new();
    if let Err(why) = file.read_to_string(&mut contents) {
        panic!("couldn't read {}: {}", path.to_string_lossy(), why);
    }
    contents
}
fn main() { | |
let data = read_string_from_file(Path::new("test.html")); | |
let mut parser = HTMLParser::new(data); | |
println!("{:?}", parser); | |
let tokens = parser.tokenize(); | |
for token in tokens { | |
match token { | |
HTMLToken::TagOpen => {print!("<")} | |
HTMLToken::TagClose => {print!(">")} | |
HTMLToken::TagName(s) => {print!("{}", s)} | |
HTMLToken::Text(s) => {print!("{}", s)} | |
HTMLToken::Value(s) => {print!("\"{}\"", s)} | |
HTMLToken::Amp(s) => {print!("&{};", s)} | |
HTMLToken::Equal => {print!("=")} | |
HTMLToken::Excl => {print!("!")} | |
HTMLToken::Key(s) => {print!("{}", s)} | |
HTMLToken::Minus => {print!("-")} | |
} | |
} | |
//println!("{:?}", tokens); | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment