Created
March 11, 2022 00:45
-
-
Save adamchalmers/a8217ac40591edb685d57de85f210ce2 to your computer and use it in GitHub Desktop.
Asynchronously read the text of Microsoft Word (.docx) files.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// async_zip = { git = "https://github.com/majored/rs-async-zip.git", branch = "main" }
// quick-xml = { git = "https://github.com/itsgreggreg/quick-xml.git", branch = "async", features = ["asynchronous"]}
use async_zip::read::seek::ZipFileReader; | |
use quick_xml::{events::Event, AsyncReader}; | |
use tokio::io::BufReader; | |
use tokio::sync::mpsc; | |
#[tokio::main] | |
async fn main() -> anyhow::Result<()> { | |
let (tx, mut rx) = mpsc::channel(2); | |
tokio::spawn(async move { | |
for line in rx.recv().await { | |
println!("{line}"); | |
} | |
}); | |
text_in_docx("data/notes.docx", tx).await?; | |
Ok(()) | |
} | |
pub async fn text_in_docx(path: &str, tx: mpsc::Sender<String>) -> anyhow::Result<()> { | |
let file = tokio::fs::File::open(path).await?; | |
// .docx files are ZIP archives. | |
let mut archive = ZipFileReader::new(file) | |
.await | |
.map_err(|e| anyhow::anyhow!("some error: {e:?}"))?; | |
/// There's a file inside the ZIP archive which contains the human-readable text. | |
const TEXTFILE: &str = "word/document.xml"; | |
let (document_index, _document_entry) = archive | |
.entry(TEXTFILE) | |
.ok_or_else(|| anyhow::anyhow!("{TEXTFILE} was not found in the ZIP"))?; | |
let body = archive | |
.entry_reader(document_index) | |
.await | |
.map(BufReader::new) | |
.map_err(|e| anyhow::anyhow!("{TEXTFILE} could not be read from the ZIP: {e:?}"))?; | |
// The textfile is XML, and the text is contained in the XML nodes. | |
// Asynchronously parse the XML and find the text. | |
let mut reader = AsyncReader::from_reader(body); | |
reader.trim_text(true); | |
let mut buf: Vec<u8> = Vec::new(); | |
loop { | |
match reader.read_event(&mut buf).await { | |
Ok(Event::Text(e)) => { | |
let text = e.unescape_and_decode(&reader)?; | |
tx.send(text).await?; | |
} | |
Err(e) => anyhow::bail!("Error at position {}: {:?}", reader.buffer_position(), e), | |
Ok(Event::Eof) => break, | |
_ => continue, | |
} | |
buf.clear(); | |
} | |
Ok(()) | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment