Skip to content

Instantly share code, notes, and snippets.

@adamchalmers
Created March 11, 2022 00:45
Show Gist options
  • Save adamchalmers/a8217ac40591edb685d57de85f210ce2 to your computer and use it in GitHub Desktop.
Save adamchalmers/a8217ac40591edb685d57de85f210ce2 to your computer and use it in GitHub Desktop.
Asynchronously read the text of Microsoft Word (.docx) files.
// async_zip = { git = "https://github.com/majored/rs-async-zip.git", branch = "main" }
// quick-xml = { git = "https://github.com/itsgreggreg/quick-xml.git", branch = "async", features = ["asynchronous"]}
use async_zip::read::seek::ZipFileReader;
use quick_xml::{events::Event, AsyncReader};
use tokio::io::BufReader;
use tokio::sync::mpsc;
/// Streams the text of `data/notes.docx` to stdout.
///
/// A background task prints every string received over a bounded channel,
/// while `text_in_docx` produces those strings from the document.
///
/// # Errors
///
/// Returns the error from `text_in_docx` if extraction fails, or the join
/// error if the printing task panicked.
#[tokio::main]
async fn main() -> anyhow::Result<()> {
    let (tx, mut rx) = mpsc::channel(2);

    // Drain the channel until every sender has been dropped. Iterating over
    // the `Option` returned by a single `recv()` would handle at most one
    // message and then close the channel under the producer.
    let printer = tokio::spawn(async move {
        while let Some(line) = rx.recv().await {
            println!("{line}");
        }
    });

    // `tx` is moved into the call, so it is dropped when extraction finishes
    // (successfully or not), which lets the printer loop terminate.
    let extraction = text_in_docx("data/notes.docx", tx).await;

    // Wait for the printer so buffered lines are written before the runtime
    // shuts down; a detached task could be cancelled mid-output.
    let printed = printer.await;

    extraction?;
    printed?;
    Ok(())
}
/// Extracts the text nodes of a `.docx` file and sends each one over `tx`.
///
/// The file at `path` is opened as a ZIP archive, the `word/document.xml`
/// entry is located, and its XML is parsed asynchronously. Every text event
/// is unescaped, decoded and sent as one `String` on the channel.
///
/// # Errors
///
/// Returns an error if the file cannot be opened, the archive cannot be read,
/// `word/document.xml` is missing or unreadable, the XML is malformed, or
/// sending on `tx` fails.
pub async fn text_in_docx(path: &str, tx: mpsc::Sender<String>) -> anyhow::Result<()> {
    let file = tokio::fs::File::open(path).await?;

    // .docx files are ZIP archives.
    let mut archive = ZipFileReader::new(file)
        .await
        .map_err(|e| anyhow::anyhow!("some error: {e:?}"))?;

    // There's a file inside the ZIP archive which contains the human-readable text.
    const TEXTFILE: &str = "word/document.xml";
    let (document_index, _document_entry) = archive
        .entry(TEXTFILE)
        .ok_or_else(|| anyhow::anyhow!("{TEXTFILE} was not found in the ZIP"))?;
    let body = archive
        .entry_reader(document_index)
        .await
        .map(BufReader::new)
        .map_err(|e| anyhow::anyhow!("{TEXTFILE} could not be read from the ZIP: {e:?}"))?;

    // The textfile is XML, and the text is contained in the XML nodes.
    // Asynchronously parse the XML and find the text.
    let mut reader = AsyncReader::from_reader(body);
    reader.trim_text(true);
    let mut buf: Vec<u8> = Vec::new();
    loop {
        match reader.read_event(&mut buf).await {
            Ok(Event::Text(e)) => {
                let text = e.unescape_and_decode(&reader)?;
                tx.send(text).await?;
            }
            Ok(Event::Eof) => break,
            // Other events (tags, comments, ...) carry no text. Fall through
            // rather than `continue`, so the scratch buffer is still cleared.
            Ok(_) => {}
            Err(e) => anyhow::bail!("Error at position {}: {:?}", reader.buffer_position(), e),
        }
        // Clear after every event so `buf` does not grow with the document.
        buf.clear();
    }
    Ok(())
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment