/* This Rust code scans through the Common Crawl, looking for text that's
* not English. I suspect I may learn much later that it's terrible,
* unidiomatic Rust, but it would take me months to learn what good Rust is.
*
* We depend on some external libraries:
*
* - html5ever: an HTML parser (we only use its low-level tokenizer)
* - encoding: handles text in all the encodings that WHATWG recognizes
 * - string_cache: interns a bunch of frequently-used strings, like tag names
 *   -- necessary to use the html5ever tokenizer
* - cld2: our favorite language detector
*/
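
/* To run this over a crawl segment, pipe a decompressed WARC file in on
 * stdin; non-English text comes out on stdout, one "language<TAB>text" line
 * per detected section. A hypothetical invocation (the binary name depends
 * on the Cargo project this lives in):
 *
 *     zcat segment.warc.gz | ./scan_languages > non_english.tsv
 */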
extern crate html5ever;
extern crate encoding;
#[macro_use(atom)]
extern crate string_cache;
extern crate cld2;
use std::io;
use std::io::prelude::*;
use std::default::Default;
use std::string::String;
use std::str;
use html5ever::tokenizer::{Tokenizer, TokenSink, Token, TagToken, CharacterTokens, StartTag, EndTag};
use html5ever::tendril::{Tendril, fmt};
use encoding::{Encoding, EncodingRef, DecoderTrap};
use encoding::label::encoding_from_whatwg_label;
use encoding::all::{UTF_8};
use cld2::{detect_language_ext, Format, Reliable, Lang, Hints};
/* The following code is from http://stackoverflow.com/a/34978794/773754.
* This clever code adds an as_deref() method to Option objects, which allows converting an
* Option<String> into an Option<&str>.
*/
use std::ops::Deref;
trait OptionDeref<T: Deref> {
    fn as_deref(&self) -> Option<&T::Target>;
}

impl<T: Deref> OptionDeref<T> for Option<T> {
    fn as_deref(&self) -> Option<&T::Target> {
        self.as_ref().map(Deref::deref)
    }
}
#[derive(PartialEq, Debug)]
enum ParserState {
    WARC,
    HTTP,
    HTML,
}
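
/* These are the three layers of a Common Crawl WARC record that main()
 * steps through. A hypothetical, abbreviated record (header lines end in
 * \r\n, which is why main() matches b"WARC/1.0\r"):
 *
 *     WARC/1.0
 *     WARC-Type: response
 *     ...more WARC headers...
 *     HTTP/1.1 200 OK
 *     Content-Type: text/html; charset=utf-8
 *     Content-Language: en
 *     ...more HTTP headers, a blank line, then the HTML document itself...
 */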
/* The HTML tokenizer works in a streaming way, sending each token to a
 * TokenSink that you implement. Our TokenSink is called TokenHandler, and
 * here's what its state looks like.
 */
struct TokenHandler {
    text: String,
    encoding: EncodingRef,
    active: bool,
    language_hint: Option<String>,
    language_depth: i64,
}
/* Methods of the TokenHandler that aren't part of the HTML parser. */
impl TokenHandler {
    fn new() -> TokenHandler {
        TokenHandler {
            text: String::with_capacity(65536),
            encoding: UTF_8,
            active: true,
            language_hint: None,
            // language_depth is a counter tracking our depth in the tag
            // stack since we last saw a 'lang' attribute. When it reaches 0,
            // we drop that 'lang' value.
            //
            // This makes us forgetful in the case of nested 'lang'
            // attributes, but that seems rare.
            //
            // When we haven't seen a 'lang' attribute, the counter is set to
            // a high number, so it never hits 0.
            language_depth: 1000,
        }
    }
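
    /* A worked example of the language_depth bookkeeping: on <div lang="fr">,
     * end_language_section() runs, the hint becomes "fr", and the depth is
     * set to 0, then immediately bumped to 1 by the <div> start tag itself.
     * A nested <p>...</p> takes it to 2 and back to 1, and the closing
     * </div> brings it back to 0, which flushes the French text to language
     * detection. */
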
    /* We've started a new document, so language-detect and possibly output
     * the text we've accumulated, then reset all the attributes.
     */
    fn new_document(&mut self) {
        if !self.text.is_empty() {
            handle_language(self.text.clone(), self.language_hint.clone());
        }
        self.text.truncate(0);
        self.encoding = UTF_8;
        self.active = true;
        self.language_hint = None;
        self.language_depth = 1000;
    }

    /* We closed a tag with a 'lang' attribute. Send the text on to language
     * detection, but keep other state, such as the encoding.
     */
    fn end_language_section(&mut self) {
        if !self.text.is_empty() {
            handle_language(self.text.clone(), self.language_hint.clone());
        }
        self.text.truncate(0);
        self.language_hint = None;
        self.language_depth = 1000;
    }
}
/* The part of the TokenHandler that makes it a TokenSink, which is the
* process_token method.
*/
impl TokenSink for TokenHandler {
    fn process_token(&mut self, token: Token) {
        match token {
            TagToken(tag) => {
                match tag.kind {
                    StartTag => {
                        // We've received an HTML opening tag.
                        match tag.name {
                            // If it's a <script> or <style> tag, start
                            // disregarding content.
                            atom!("script") | atom!("style") => {
                                self.active = false;
                            },
                            // If it's a <meta> tag, look for a charset or
                            // http-equiv attribute.
                            atom!("meta") => {
                                let mut content_type = false;
                                for attr in &tag.attrs {
                                    if attr.name.local == atom!("charset") {
                                        if let Some(new_encoding) = encoding_from_whatwg_label(&attr.value) {
                                            self.encoding = new_encoding;
                                        }
                                    }
                                    if attr.name.local == atom!("http-equiv")
                                        && attr.value.to_lowercase() == "content-type" {
                                        content_type = true;
                                    }
                                }
                                if content_type {
                                    for attr in &tag.attrs {
                                        if attr.name.local == atom!("content") {
                                            if let Some(new_encoding) = content_type_to_encoding(&attr.value) {
                                                self.encoding = new_encoding;
                                            }
                                        }
                                    }
                                }
                            },
                            // Other start tags do nothing in particular.
                            _ => {}
                        };
                        // Now look for a 'lang' attribute on the tag. If it
                        // has one, and the tag doesn't close immediately,
                        // start a new section of text that we believe to be
                        // in that language.
                        for attr in tag.attrs {
                            if attr.name.local == atom!("lang") {
                                self.end_language_section();
                                self.language_hint = Some(attr.value.to_string());
                                if !tag.self_closing {
                                    self.language_depth = 0;
                                }
                            }
                        }
                        if tag.self_closing {
                            // This could be a <br> tag or something -- either
                            // way, it's a token boundary.
                            self.text.push(' ');
                        } else {
                            // Increase our language_depth based on the fact
                            // that we saw a start tag -- this is how we keep
                            // track of where a 'lang' attribute's scope ends.
                            self.language_depth += 1;
                        }
                    },
                    EndTag => {
                        // We've received an HTML closing tag.
                        match tag.name {
                            // If a <script> or <style> tag ended, stop
                            // ignoring content.
                            atom!("script") | atom!("style") => {
                                self.active = true;
                            },
                            // Otherwise, the only important thing is that
                            // it's a token boundary.
                            _ => {
                                self.text.push(' ');
                            }
                        };
                        // Decrease our language_depth, and end the language
                        // section if it reaches 0.
                        self.language_depth -= 1;
                        if self.language_depth == 0 {
                            self.end_language_section();
                        }
                    }
                }
            },
            CharacterTokens(tendril) => {
                // We've received actual text. It's in the form of a Tendril,
                // which is basically as frightening as it sounds, so convert
                // it to a String that we can push onto the text.
                if self.active {
                    self.text.push_str(&tendril.to_string());
                }
            },
            _ => {}
        }
    }
}
/* Language-detect the given text, and output it if it's non-English. */
fn handle_language(text: String, language_hint: Option<String>) {
    let hint_ref: Option<&str> = language_hint.as_deref();
    let hints = Hints {
        content_language: hint_ref,
        .. Default::default()
    };
    let detection_result = detect_language_ext(&text, Format::Text, &hints);
    if detection_result.reliability == Reliable {
        if let Some(Lang(language)) = detection_result.language {
            if language != "en" {
                println!("{}\t{}", language, text.replace("\r", "").replace("\n", " "));
            }
        }
    }
}
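
/* For instance, a reliably-detected French section would be printed as a
 * single line like "fr<TAB>Bonjour tout le monde" (a made-up example), with
 * any newlines in the text flattened to spaces. */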
fn content_type_to_encoding(content_type: &str) -> Option<EncodingRef> {
    let split1: Vec<&str> = content_type.split("charset=").collect();
    if split1.len() < 2 {
        return None;
    }
    let split2: Vec<&str> = split1[1].split(' ').collect();
    let encoding_name: &str = split2[0];
    encoding_from_whatwg_label(encoding_name)
}
fn content_type_bytes_to_encoding(content_type_bytes: &[u8]) -> Option<EncodingRef> {
    match str::from_utf8(content_type_bytes) {
        Ok(content_type) => content_type_to_encoding(content_type.trim()),
        Err(_) => None,
    }
}
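
/* A minimal sanity check for the charset parsing above; the header values
 * here are hypothetical ones, not taken from any particular crawl record. */
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn finds_charset_in_content_type() {
        // A charset parameter in the Content-Type should be recognized...
        assert!(content_type_to_encoding("text/html; charset=utf-8").is_some());
        // ...its absence should yield None...
        assert!(content_type_to_encoding("text/html").is_none());
        // ...and the byte-level wrapper should behave the same way.
        assert!(content_type_bytes_to_encoding(b"text/html; charset=iso-8859-1").is_some());
    }
}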
fn main() {
    let stdin = io::stdin();
    let mut state = ParserState::WARC;
    let mut tokenizer = Tokenizer::new(TokenHandler::new(), Default::default());
    for line_bytes_opt in stdin.lock().split(b'\n') {
        let bline: Vec<u8> = line_bytes_opt.unwrap();
        if bline == b"WARC/1.0\r" {
            // A new WARC record is starting, so wrap up the previous document.
            state = ParserState::WARC;
            tokenizer.sink_mut().new_document();
        } else if bline == b"\r" {
            // A blank line ends the HTTP headers, transitioning to HTML.
            if state == ParserState::HTTP {
                state = ParserState::HTML;
            }
        } else if state == ParserState::WARC && bline.starts_with(b"HTTP/") {
            state = ParserState::HTTP;
        } else if state == ParserState::HTML {
            // Decode the line in the document's current encoding and feed it
            // to the HTML tokenizer, skipping lines that fail to decode.
            let encoding = tokenizer.sink().encoding;
            if let Ok(sline) = encoding.decode(&bline, DecoderTrap::Strict) {
                let tend: Tendril<fmt::UTF8> = Tendril::from_slice(sline.trim());
                tokenizer.feed(tend);
            }
        } else if state == ParserState::HTTP {
            // Watch the HTTP headers for a Content-Type with a charset, and
            // for a Content-Language to use as a detection hint.
            if bline.starts_with(b"Content-Type:") {
                let content_type_bytes = &bline[13..];
                if let Some(new_encoding) = content_type_bytes_to_encoding(content_type_bytes) {
                    tokenizer.sink_mut().encoding = new_encoding;
                }
            } else if bline.starts_with(b"Content-Language:") {
                let content_lang_bytes = &bline[17..];
                if let Ok(content_lang) = str::from_utf8(content_lang_bytes) {
                    tokenizer.sink_mut().language_hint = Some(content_lang.trim().to_string());
                }
            }
        }
    }
}
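
/* A sketch of the [dependencies] section of Cargo.toml that this expects.
 * The crates are the ones named at the top of the file; I'm leaving the
 * version requirements open, since this code is tied to the crates' APIs as
 * of when it was written:
 *
 *     [dependencies]
 *     html5ever = "*"
 *     encoding = "*"
 *     string_cache = "*"
 *     cld2 = "*"
 */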