Last active
March 27, 2021 22:56
-
-
Save rspeer/c5647d6a59ee59619c95bf7fec1769d5 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/* This Rust code scans through the Common Crawl, looking for text that's
 * not English. I suspect I may learn much later that it's terrible,
 * unidiomatic Rust, but it would take me months to learn what good Rust is.
 *
 * We depend on some external libraries:
 *
 * - html5ever: an HTML parser (we only use its low-level tokenizer)
 * - encoding: handles text in all the encodings that WHATWG recognizes
 * - string_cache: interns a bunch of frequently-used strings, like tag names -- necessary to use
 *   the html5ever tokenizer
 * - cld2: our favorite language detector
 */
extern crate html5ever; | |
extern crate encoding; | |
#[macro_use(atom)] | |
extern crate string_cache; | |
extern crate cld2; | |
use std::io; | |
use std::io::prelude::*; | |
use std::default::Default; | |
use std::string::String; | |
use std::str; | |
use html5ever::tokenizer::{Tokenizer, TokenSink, Token, TagToken, CharacterTokens, StartTag, EndTag}; | |
use html5ever::tendril::{Tendril, fmt}; | |
use encoding::{Encoding, EncodingRef, DecoderTrap}; | |
use encoding::label::encoding_from_whatwg_label; | |
use encoding::all::{UTF_8}; | |
use cld2::{detect_language_ext, Format, Reliable, Lang, Hints}; | |
/* The following code is from http://stackoverflow.com/a/34978794/773754. | |
* This clever code adds an as_deref() method to Option objects, which allows converting an | |
* Option<String> into an Option<&str>. | |
*/ | |
use std::ops::Deref; | |
/* Extension trait adding an `as_deref()` method to `Option`, so that an
 * `Option<String>` can be viewed as an `Option<&str>` without taking
 * ownership. (Based on http://stackoverflow.com/a/34978794/773754.)
 *
 * NOTE(review): `Option::as_deref` became part of the standard library in
 * Rust 1.40; on a modern toolchain this trait is redundant. */
trait OptionDeref<T: Deref> {
    /* Borrow the contained value, if any, through its `Deref` impl. */
    fn as_deref(&self) -> Option<&T::Target>;
}

/* Blanket implementation: every `Option<T>` where `T` dereferences
 * (String, Box, Vec, ...) gains `as_deref()`. */
impl<T: Deref> OptionDeref<T> for Option<T> {
    fn as_deref(&self) -> Option<&T::Target> {
        match *self {
            Some(ref inner) => Some(&**inner),
            None => None,
        }
    }
}
/* Which section of a WARC record we are currently reading: the WARC
 * headers, the HTTP headers, or the HTML payload. A fieldless enum like
 * this is trivially copyable and totally equatable, so we derive Clone,
 * Copy and Eq alongside the original PartialEq/Debug. */
#[derive(PartialEq, Eq, Clone, Copy, Debug)]
enum ParserState {
    WARC,
    HTTP,
    HTML,
}
/* The HTML tokenizer works in a streaming way by taking in a reference to a | |
* TokenSink that you implement. Our TokenSink is called the TokenHandler, and | |
* here's what its state looks like. | |
*/ | |
struct TokenHandler {
    // Visible text accumulated for the current language section.
    text: String,
    // Encoding used to decode the HTML payload; starts as UTF-8 and can be
    // overridden by an HTTP Content-Type header or a <meta> tag.
    encoding: EncodingRef,
    // False while inside <script>/<style>, whose contents are discarded.
    active: bool,
    // Language declared by a Content-Language header or a 'lang' attribute,
    // passed to the language detector as a hint.
    language_hint: Option<String>,
    // Tag-stack depth remaining until the current 'lang' attribute expires;
    // 1000 is the sentinel meaning "no 'lang' attribute seen".
    language_depth: i64
}
/* Methods of the TokenHandler that aren't part of the HTML parser. */ | |
impl TokenHandler { | |
fn new() -> TokenHandler { | |
TokenHandler { | |
text: String::with_capacity(65536), | |
encoding: UTF_8, | |
active: true, | |
language_hint: None, | |
// The language_depth is a counter that keeps track of our depth | |
// in the tag stack since we've seen a 'lang' attribute. When it | |
// reaches 0, it drops that 'lang' value. | |
// | |
// This makes us forgetful in a case that has nested 'lang' | |
// attributes, but that seems rare. | |
// | |
// When we haven't seen a 'lang' attribute, the counter is set to | |
// a high number, so it never hits 0. | |
language_depth: 1000 | |
} | |
} | |
/* We've started a new document, so language-detect and possibly output | |
* the text we've accumulated, then reset all the attributes. | |
*/ | |
fn new_document(&mut self) { | |
if self.text.len() > 0 { | |
handle_language(self.text.clone(), self.language_hint.clone()); | |
} | |
self.text.truncate(0); | |
self.encoding = UTF_8; | |
self.active = true; | |
self.language_hint = None; | |
self.language_depth = 1000; | |
} | |
/* We closed a tag with a 'lang' attribute. Send the text on to language | |
* detection, but don't forget about other things such as encoding. | |
*/ | |
fn end_language_section(&mut self) { | |
if self.text.len() > 0 { | |
handle_language(self.text.clone(), self.language_hint.clone()); | |
} | |
self.text.truncate(0); | |
self.language_hint = None; | |
self.language_depth = 1000; | |
} | |
} | |
/* The part of the TokenHandler that makes it a TokenSink, which is the
 * process_token method.
 */
impl TokenSink for TokenHandler {
    /* Receive one token from the html5ever tokenizer and update the text
     * accumulator, encoding, and language-tracking state accordingly. */
    fn process_token(&mut self, token: Token) {
        match token {
            TagToken(tag) => {
                match tag.kind {
                    StartTag => {
                        // We've received an HTML opening tag.
                        match tag.name {
                            // If it's a <script> or <style> tag, start disregarding content.
                            atom!("script") | atom!("style") => {
                                self.active = false;
                            },
                            // If it's a <meta> tag, look for a charset or http-equiv attribute.
                            atom!("meta") => {
                                let mut content_type = false;
                                for attr in &tag.attrs {
                                    // <meta charset="..."> names the encoding directly.
                                    if attr.name.local == atom!("charset") {
                                        match encoding_from_whatwg_label(&attr.value) {
                                            Some(new_encoding) => self.encoding = new_encoding,
                                            None => {}
                                        }
                                    }
                                    // <meta http-equiv="content-type" ...> means the
                                    // encoding hides in the 'content' attribute instead.
                                    if attr.name.local == atom!("http-equiv") && attr.value.to_lowercase() == "content-type" {
                                        content_type = true;
                                    }
                                }
                                // Second pass over the attributes to pull the encoding
                                // out of content="text/html; charset=...".
                                if content_type {
                                    for attr in &tag.attrs {
                                        if attr.name.local == atom!("content") {
                                            match content_type_to_encoding(&attr.value) {
                                                Some(new_encoding) => {
                                                    self.encoding = new_encoding;
                                                },
                                                None => {}
                                            }
                                        }
                                    }
                                }
                            }
                            // Other start tags do nothing in particular.
                            _ => {}
                        };
                        // Now look for a 'lang' attribute on the tag. If it has one, and the tag
                        // doesn't close immediately, start a new section of text that we believe
                        // to be in that language.
                        for attr in tag.attrs {
                            if attr.name.local == atom!("lang") {
                                self.end_language_section();
                                self.language_hint = Some(attr.value.to_string());
                                if !tag.self_closing {
                                    self.language_depth = 0;
                                }
                            }
                        };
                        if tag.self_closing {
                            // This could be a <br> tag or something -- it's a token boundary.
                            self.text.push(' ');
                        }
                        else {
                            // Increase our language_depth based on the fact that we saw a start tag --
                            // this is how we will keep track of where a 'lang' attribute ends.
                            self.language_depth += 1;
                        }
                    },
                    EndTag => {
                        // We've received an HTML closing tag.
                        match tag.name {
                            // If a <script> or <style> tag ended, stop ignoring content.
                            atom!("script") | atom!("style") => {
                                self.active = true;
                            },
                            // Otherwise, the only important thing is that it's a token boundary.
                            _ => {
                                self.text.push(' ');
                            }
                        };
                        // Decrease our language_depth, and end the language section if it reaches
                        // 0.
                        self.language_depth -= 1;
                        if self.language_depth == 0 {
                            self.end_language_section();
                        }
                    }
                };
                // NOTE(review): for a self-closing *start* tag, a separator space was
                // already pushed above, so this pushes a second one -- harmless for
                // language detection, but worth confirming if exact spacing matters.
                if tag.self_closing { self.text.push(' '); }
            },
            CharacterTokens(tendril) => {
                // We've received actual text. It's in the form of a Tendril, which is basically as
                // frightening as it sounds, so convert it to a string, then convert that to a
                // &str, so we can push it onto the text.
                if self.active {
                    self.text.push_str(&tendril.to_string());
                }
            },
            _ => {}
        }
    }
}
/* Language-detect the given text, and output it if it's non-English. */ | |
fn handle_language(text: String, language_hint: Option<String>) { | |
let hint_ref: Option<&str> = language_hint.as_deref(); | |
let hints = Hints { | |
content_language: hint_ref, | |
.. Default::default() | |
}; | |
let detection_result = detect_language_ext(&text, Format::Text, &hints); | |
if detection_result.reliability == Reliable { | |
match detection_result.language { | |
Some(Lang(language)) => { | |
if language != "en" { | |
println!("{}\t{}", language, text.replace("\r", "").replace("\n", " ")) | |
} | |
}, | |
None => {} | |
} | |
} | |
} | |
fn content_type_to_encoding(content_type: &str) -> Option<EncodingRef> { | |
let split1: Vec<&str> = content_type.split("charset=").collect(); | |
if split1.len() < 2 { return None }; | |
let split2: Vec<&str> = split1[1].split(" ").collect(); | |
let encoding_name: &str = split2[0]; | |
return encoding_from_whatwg_label(encoding_name); | |
} | |
fn content_type_bytes_to_encoding(content_type_bytes: &[u8]) -> Option<EncodingRef> { | |
match str::from_utf8(&content_type_bytes) { | |
Ok(content_type) => content_type_to_encoding(&content_type.trim()), | |
Err(_) => None | |
} | |
} | |
fn main() { | |
let stdin = io::stdin(); | |
let mut state = ParserState::WARC; | |
let mut tokenizer = Tokenizer::new(TokenHandler::new(), Default::default()); | |
for line_bytes_opt in stdin.lock().split(b'\n') { | |
let bline: Vec<u8> = line_bytes_opt.unwrap(); | |
if bline == b"WARC/1.0\r" { | |
state = ParserState::WARC; | |
tokenizer.sink_mut().new_document(); | |
} | |
else if bline == b"\r" { | |
// A blank line ends the HTTP headers, transitioning | |
// to HTML. | |
if state == ParserState::HTTP { | |
state = ParserState::HTML; | |
} | |
} | |
else if state == ParserState::WARC && bline.starts_with(b"HTTP/") { | |
state = ParserState::HTTP; | |
} | |
else if state == ParserState::HTML { | |
match tokenizer.sink().encoding.decode(&bline, DecoderTrap::Strict) { | |
Ok(sline) => { | |
let tend: Tendril<fmt::UTF8> = Tendril::from_slice(sline.trim()); | |
tokenizer.feed(tend); | |
}, | |
Err(_) => {} | |
} | |
} | |
else if state == ParserState::HTTP { | |
if bline.starts_with(b"Content-Type:") { | |
let content_type_bytes = &bline[13..]; | |
match content_type_bytes_to_encoding(&content_type_bytes) { | |
Some(new_encoding) => { | |
tokenizer.sink_mut().encoding = new_encoding; | |
}, | |
None => {} | |
} | |
} | |
else if bline.starts_with(b"Content-Language:") { | |
let content_lang_bytes = &bline[17..]; | |
match str::from_utf8(content_lang_bytes) { | |
Ok(content_lang) => { | |
tokenizer.sink_mut().language_hint = Some(content_lang.trim().to_string()); | |
}, | |
Err(_) => {} | |
} | |
} | |
} | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment