Skip to content

Instantly share code, notes, and snippets.

@tamamu
Created August 3, 2020 12:28
Show Gist options
  • Save tamamu/c24e614f04374caf0b84d92b3bf5769a to your computer and use it in GitHub Desktop.
Save tamamu/c24e614f04374caf0b84d92b3bf5769a to your computer and use it in GitHub Desktop.
HTMLパーサーを書く
/**
* Writing an HTML parser.
* Declaration: <!DECLARE>
* Start tag: <TAGNAME ATTRNAME=ATTRVAR>
* Text: TEXT
* End tag: </TAGNAME>
*
*
*
*/
use std::io::Read;
use std::fs::File;
use std::path::Path;
/// Tree representation of a parsed document.
///
/// NOTE(review): nothing in the visible code constructs a `DOM` value yet; the type is
/// only named by `HTMLParser::preserved` and `HTMLParseResult`.
#[derive(Debug)]
enum DOM {
/// An element node.
Node {
/// Name of the element's tag.
tagname: String,
/// Presumably the tag's attributes as (name, value) pairs — TODO confirm once a parser fills this in.
props: Vec<(String, String)>,
/// Nodes nested inside this element.
children: Vec<DOM>
},
/// A run of text.
Text(String)
}
/// Lexical tokens produced by `HTMLParser::tokenize`.
#[derive(Debug)]
enum HTMLToken {
/// A `<` character.
TagOpen,
/// A `>` character.
TagClose,
/// A `!` character found at the start of a token.
Excl,
/// A `=` character found at the start of a token.
Equal,
/// A `-` character found at the start of a token.
Minus,
/// A character reference introduced by `&` (for example `&amp;`).
Amp(String),
/// The first run of characters after `<`; for a closing tag this includes the leading `/`.
TagName(String),
/// Text outside of a tag. A run of whitespace is reported as a single `" "` text token.
Text(String),
/// A later run of characters inside a tag, after the tag name (an attribute name).
Key(String),
/// The contents of a double-quoted string, without the quotes.
Value(String),
}
/// Tokenizer state for one HTML source text.
#[derive(Debug)]
struct HTMLParser {
/// The input being tokenized.
src: Vec<char>, // the source text as an array of characters
/// Line counter; `next_char` and `strip` increment it for every `'\n'` or `'\r'` they consume.
row: usize,
/// Column counter; the visible code only ever resets it to 0 on a line break.
col: usize,
/// Cursor: index into `src` of the next character to read.
index: usize,
/// Initialised to `None` and not otherwise touched by the visible code.
preserved: Option<Box<DOM>>,
/// Set when `tokenize` sees `<` and cleared when it sees `>`.
is_in_tag: bool,
/// Cleared on `<`; set once the tag name of the current tag has been tokenized.
has_tagname: bool,
}
/// Result of a parse step: a `DOM` on success or a `String` on failure (presumably an
/// error message). Only the commented-out parser sketch in `impl HTMLParser` names it.
type HTMLParseResult = Result<DOM, String>;
impl HTMLParser {
    /// Creates a tokenizer positioned at the start of `src`.
    ///
    /// The text is split into `char`s up front so the cursor can index per character
    /// rather than per byte.
    fn new(src: String) -> Self {
        Self {
            src: src.chars().collect(),
            row: 0,
            col: 0,
            index: 0,
            preserved: None,
            is_in_tag: false,
            has_tagname: false,
        }
    }

    /// Consumes and returns the character at the cursor.
    ///
    /// `row` is incremented and `col` reset whenever the consumed character is `'\n'`
    /// or `'\r'` (so a CRLF pair counts as two rows).
    ///
    /// # Panics
    ///
    /// Panics if the cursor is already at the end of the input.
    fn next_char(&mut self) -> char {
        let cur = self.src[self.index];
        self.index += 1;
        if cur == '\n' || cur == '\r' {
            self.row += 1;
            self.col = 0;
        }
        cur
    }

    /// Skips spaces, tabs and line breaks at the cursor.
    ///
    /// Returns `true` if at least one character was skipped. Each `'\n'` or `'\r'`
    /// increments `row` and resets `col`. Stops quietly at the end of the input.
    fn strip(&mut self) -> bool {
        let start = self.index;
        // `get` instead of indexing: the whitespace may run up to the very end of the
        // input (a trailing newline is the common case).
        while let Some(&cur) = self.src.get(self.index) {
            match cur {
                ' ' | '\t' => {}
                '\n' | '\r' => {
                    self.row += 1;
                    self.col = 0;
                }
                _ => break,
            }
            self.index += 1;
        }
        self.index > start
    }

    /// Counts the characters from the cursor up to, but not including, the first
    /// character contained in `terminator`.
    ///
    /// If no terminator occurs, the count covers everything up to the end of the input.
    /// The cursor itself is not moved.
    fn capture_until(&mut self, terminator: Vec<char>) -> usize {
        self.src
            .get(self.index..)
            .unwrap_or(&[])
            .iter()
            .take_while(|&&ch| !terminator.contains(&ch))
            .count()
    }

    /// Collects the `len` characters at the cursor into a `String` and moves the cursor
    /// past them. `len` is clamped to the remaining input.
    fn take_run(&mut self, len: usize) -> String {
        let end = (self.index + len).min(self.src.len());
        let run = self.src[self.index..end].iter().collect();
        self.index = end;
        run
    }

    /// Reads everything up to `close`, then steps over `close` itself.
    ///
    /// If the input ends before `close` is found, the rest of the input is returned and
    /// the cursor stops at the end.
    fn take_delimited(&mut self, close: char) -> String {
        let len = self.capture_until(vec![close]);
        let body = self.take_run(len);
        if self.index < self.src.len() {
            self.index += 1;
        }
        body
    }

    /// Splits the remaining input into tokens.
    ///
    /// Rules, applied at the start of every token:
    /// * a run of whitespace (inside or outside a tag) becomes `Text(" ")`;
    /// * `<`, `>`, `!`, `=`, `-` become their single-character tokens;
    /// * `&name;` becomes `Amp("name")` — this only triggers when `&` starts a token,
    ///   an `&` in the middle of a text run stays part of that `Text`;
    /// * `"..."` becomes `Value` holding the text between the quotes;
    /// * inside a tag, the first remaining run is the `TagName`, later runs are `Key`s;
    /// * outside a tag, everything up to the next `<` is `Text`.
    ///
    /// Every iteration consumes at least one character, so the loop always terminates.
    fn tokenize(&mut self) -> Vec<HTMLToken> {
        let mut result = Vec::new();
        while self.index < self.src.len() {
            if self.strip() {
                result.push(HTMLToken::Text(" ".to_owned()));
            }
            // `strip` may have consumed the rest of the input.
            let cur = match self.src.get(self.index) {
                Some(&ch) => ch,
                None => break,
            };
            match cur {
                '<' => {
                    result.push(HTMLToken::TagOpen);
                    self.is_in_tag = true;
                    self.has_tagname = false;
                    self.index += 1;
                }
                '>' => {
                    result.push(HTMLToken::TagClose);
                    self.is_in_tag = false;
                    self.index += 1;
                }
                '!' => {
                    result.push(HTMLToken::Excl);
                    self.index += 1;
                }
                '=' => {
                    result.push(HTMLToken::Equal);
                    self.index += 1;
                }
                '-' => {
                    result.push(HTMLToken::Minus);
                    self.index += 1;
                }
                '&' => {
                    // Skip the '&' so the token holds only the reference name, and
                    // consume the closing ';' so it is not re-read as text.
                    self.index += 1;
                    let name = self.take_delimited(';');
                    result.push(HTMLToken::Amp(name));
                }
                '"' => {
                    self.index += 1;
                    let value = self.take_delimited('"');
                    result.push(HTMLToken::Value(value));
                }
                _ if self.is_in_tag => {
                    if self.has_tagname {
                        let len = self.capture_until(vec![' ', '\t', '\n', '\r', '>', '=']);
                        result.push(HTMLToken::Key(self.take_run(len)));
                    } else {
                        let len = self.capture_until(vec![' ', '\t', '\n', '\r', '>']);
                        result.push(HTMLToken::TagName(self.take_run(len)));
                        self.has_tagname = true;
                    }
                }
                _ => {
                    let len = self.capture_until(vec!['<']);
                    result.push(HTMLToken::Text(self.take_run(len)));
                }
            }
        }
        result
    }

    // Unfinished sketch of a recursive-descent parser, kept for reference. It calls
    // helpers (`parse_declaration`, `parse_tag_name`, `parse_props`) that do not exist yet.
    //
    // fn parse(&mut self) -> HTMLParseResult {
    //     self.strip();
    //     let cur = self.next_char();
    //     match self.src[self.index] {
    //         '<' => {
    //             self.parse_tag()
    //         },
    //         _ => self.parse_text()
    //     }
    // }
    // fn parse_tag(&mut self) -> HTMLParseResult {
    //     let cur = self.next_char();
    //     match cur {
    //         '!' => {
    //             self.col += 1;
    //             self.index += 1;
    //             let _ = self.parse_declaration();
    //             self.parse()
    //         },
    //         _ => {
    //             self.col += 1;
    //             self.index += 1;
    //             let tagname = self.parse_tag_name();
    //             self.strip();
    //             let props = self.parse_props();
    //             self.strip();
    //             if self.src[self.index] == '>' {
    //                 self.col += 1;
    //                 self.index += 1;
    //             }
    //         }
    //     }
    // }
    // fn parse_text(&mut self) -> HTMLParseResult {
    //     unimplemented!()
    // }
}
/// Reads the whole file at `path` into a `String`.
///
/// # Panics
///
/// Panics with a message naming `path` if the file cannot be opened or its contents
/// cannot be read as a string.
fn read_string_from_file(path: &std::path::Path) -> String {
    let mut handle = File::open(path)
        .unwrap_or_else(|why| panic!("couldn't open {}: {}", path.to_string_lossy(), why));
    let mut contents = String::new();
    if let Err(why) = handle.read_to_string(&mut contents) {
        panic!("couldn't read {}: {}", path.to_string_lossy(), why);
    }
    contents
}
/// Tokenizes `test.html` from the current directory and echoes the tokens to stdout.
///
/// The parser's initial state is printed first (debug format, on its own line); each
/// token is then written back in its textual form with no separator.
fn main() {
    let source = read_string_from_file(Path::new("test.html"));
    let mut parser = HTMLParser::new(source);
    println!("{:?}", parser);
    for token in parser.tokenize() {
        let rendered = match token {
            HTMLToken::TagOpen => "<".to_owned(),
            HTMLToken::TagClose => ">".to_owned(),
            HTMLToken::Equal => "=".to_owned(),
            HTMLToken::Excl => "!".to_owned(),
            HTMLToken::Minus => "-".to_owned(),
            HTMLToken::Amp(name) => format!("&{};", name),
            HTMLToken::Value(value) => format!("\"{}\"", value),
            HTMLToken::TagName(s) | HTMLToken::Text(s) | HTMLToken::Key(s) => s,
        };
        print!("{}", rendered);
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment