Skip to content

Instantly share code, notes, and snippets.

@twocity
Created March 12, 2018 02:16
Show Gist options
  • Save twocity/7b2941b12fd813d935025cf33325a11a to your computer and use it in GitHub Desktop.
Save twocity/7b2941b12fd813d935025cf33325a11a to your computer and use it in GitHub Desktop.
Convert an audio file to txt through Google Speech API.
#!/usr/bin/env kscript
@file:DependsOn("com.squareup.okhttp3:okhttp:3.10.0")
@file:DependsOn("com.google.code.gson:gson:2.8.2")
@file:DependsOn("commons-cli:commons-cli:1.4")
import com.google.gson.Gson
import com.google.gson.JsonParser
import okhttp3.Interceptor
import okhttp3.Interceptor.Chain
import okhttp3.MediaType
import okhttp3.OkHttpClient
import okhttp3.Request
import okhttp3.RequestBody
import okhttp3.Response
import okio.Okio
import org.apache.commons.cli.DefaultParser
import org.apache.commons.cli.HelpFormatter
import org.apache.commons.cli.MissingArgumentException
import org.apache.commons.cli.Option
import org.apache.commons.cli.Options
import java.io.File
import java.lang.ProcessBuilder.Redirect
import java.util.concurrent.TimeUnit.MILLISECONDS
/**
* Convert an audio file to txt through Google Speech API.
*/
/**
 * OkHttp interceptor that adds an `Authorization: Bearer <token>` header to every
 * outgoing request. The token is obtained by calling [accessToken] for each request.
 */
val accessTokenInterceptor = Interceptor { chain ->
    val authorized = chain.request()
        .newBuilder()
        .addHeader("Authorization", "Bearer ${accessToken()}")
        .build()
    chain.proceed(authorized)
}
// Shared HTTP client; every call made through it passes through accessTokenInterceptor.
val httpClient: OkHttpClient = OkHttpClient.Builder().run {
    addInterceptor(accessTokenInterceptor)
    build()
}

// Shared Gson instance used to serialize request payloads to JSON.
val gson: Gson = Gson()
// Header text printed above the option list in the usage/help output.
// The raw string is emitted as-is (no trimIndent), so its lines stay unindented on purpose.
val helpHeader = """
A simple tool to convert audio to text by Google Cloud Speech API.
Please make sure Cloud SDK(https://cloud.google.com/sdk/) was installed before you get started.
Steps:
1. Transform audio encoding to LINEAR16, example:
ffmpeg -i input.mp3 -f s16le -acodec pcm_s16le -ac 1 -ar 16k output.raw
see https://cloud.google.com/speech/docs/best-practices
2. Upload output.raw:
gsutil cp output.raw gs://bucket-name
3. Create a long running async transcribe operation:
kscript transcribe.kts -t gs://bucket-name/output.raw
4. Query the result:
kscript transcribe.kts -q name
See https://cloud.google.com/speech/docs/async-recognize for more help.
"""
// Command-line options understood by the script:
//   -help        print help information
//   -t <uri>     audio file to transcribe
//   -q <name>    query an operation result by name
val options = Options().also { cli ->
    cli.addOption("help", false, "print help information")

    val transcribeOption = Option.builder("t")
        .hasArg()
        .argName("uri")
        .desc("audio file")
        .build()
    cli.addOption(transcribeOption)

    val queryOption = Option.builder("q")
        .hasArg()
        .argName("name")
        .desc("query operation result by given name")
        .build()
    cli.addOption(queryOption)
}
// Script entry point: parse the command line and dispatch to transcribe/query,
// falling back to the usage text when neither -t nor -q is given.
val formatter = HelpFormatter()
try {
    // stopAtNonOption = true: parsing stops at the first unrecognized token instead of throwing.
    val command = DefaultParser().parse(options, args, true)
    when {
        command.hasOption("t") -> transcribe(command.getOptionValue("t"))
        command.hasOption("q") -> query(command.getOptionValue("q"))
        // The usage line names this script the same way the help header and the
        // follow-up hints do (transcribe.kts).
        else -> formatter.printHelp("kscript transcribe.kts", helpHeader, options, null, true)
    }
} catch (e: MissingArgumentException) {
    // -t or -q was given without its required value.
    println(e.message)
}
/**
 * Starts a long-running recognition operation for the audio at [uri] and prints
 * the command that queries its result.
 *
 * Only `gs://` URIs are accepted; anything else is reported as invalid and no
 * request is sent. The audio is described as LINEAR16, 16000 Hz, en-US.
 */
fun transcribe(uri: String) {
    if (!uri.startsWith("gs://")) {
        println("invalid uri: $uri")
        return
    }

    val recognitionConfig = Config(encoding = "LINEAR16", sampleRateHertz = 16000, languageCode = "en-US")
    val jsonBody = gson.toJson(Payload(recognitionConfig, Audio(uri)))
    val request = Request.Builder()
        .url("https://speech.googleapis.com/v1/speech:longrunningrecognize")
        .post(RequestBody.create(MediaType.parse("application/json"), jsonBody))
        .build()

    httpClient.newCall(request).execute().use { response ->
        if (!response.isSuccessful) {
            println("bad request: \n${response.body()?.string()}")
            return
        }
        println("create operation successfully, use following command to query the result:")
        val operation = JsonParser().parse(response.body()!!.charStream()).asJsonObject
        val name = operation.get("name").asString
        println("use kscript transcribe.kts -q $name")
    }
}
/**
 * Fetches the long-running operation called [name] and reports its state.
 *
 * - HTTP failure: prints the response body.
 * - Operation still running: prints the progress percentage (0 when not reported).
 * - Operation finished with an `error` object: prints that error.
 * - Operation finished successfully: writes the first alternative's transcript of
 *   every result, one per line, to `<name>.txt` in the working directory.
 *
 * Missing `metadata`, `response`, `results` or `alternatives` members are treated
 * as "nothing there" instead of crashing with a NullPointerException.
 */
fun query(name: String) {
    val request = Request.Builder()
        .url("https://speech.googleapis.com/v1/operations/$name")
        .build()
    println("querying $name...")
    httpClient.newCall(request).execute().use { response ->
        if (!response.isSuccessful) {
            println("failed request: \n${response.body()?.string()}")
            return
        }

        // OkHttp returns a non-null body for responses obtained from Call.execute().
        val json = JsonParser().parse(response.body()!!.charStream()).asJsonObject
        val done = json.get("done")?.asBoolean ?: false
        if (!done) {
            // "metadata" may not be present yet on a freshly created operation.
            val progress = json.getAsJsonObject("metadata")?.get("progressPercent")?.asInt ?: 0
            println("operation not done yet, progress: $progress%")
            return
        }

        // A finished operation carries either "error" or "response"; report the error
        // instead of dereferencing a response that is not there.
        val failure = json.getAsJsonObject("error")
        if (failure != null) {
            println("operation failed: \n$failure")
            return
        }

        // "results" can be absent when nothing was recognized; results without
        // alternatives are skipped.
        val transcripts = json.getAsJsonObject("response")
            ?.getAsJsonArray("results")
            ?.mapNotNull { result ->
                result.asJsonObject.getAsJsonArray("alternatives")
                    ?.firstOrNull()
                    ?.asJsonObject
                    ?.get("transcript")
                    ?.asString
            }
            ?: emptyList()

        Okio.buffer(Okio.sink(File("$name.txt"))).use { sink ->
            transcripts.forEach { line ->
                sink.writeUtf8(line)
                sink.writeUtf8("\n")
            }
        }
        println("transcribe done ^_^, see $name.txt")
    }
}
/** Request body serialized to JSON by [transcribe]: recognition settings plus the audio reference. */
data class Payload(
    val config: Config,
    val audio: Audio
)

/** Recognition settings: audio encoding name, sample rate in hertz, and language code. */
data class Config(
    val encoding: String,
    val sampleRateHertz: Long,
    val languageCode: String
)

/** Reference to the audio to transcribe, identified by its URI. */
data class Audio(
    val uri: String
)
/**
 * Returns the access token printed by
 * `gcloud auth application-default print-access-token`, with surrounding whitespace removed.
 */
fun accessToken(): String {
    val rawOutput = "gcloud auth application-default print-access-token".runThenGet()
    return rawOutput.trim()
}
/**
 * Runs the receiver as a command line (split on single spaces, no shell quoting)
 * and returns whatever the process wrote to standard output, decoded as UTF-8.
 *
 * Standard output is drained on a background thread while waiting, so a child that
 * prints more than the pipe buffer holds cannot stall. If the process has not exited
 * after [timeout] milliseconds it is killed and the output captured so far is returned.
 * Standard error is passed through to this process's stderr.
 *
 * @param timeout maximum time to wait for the process, in milliseconds.
 * @return the captured standard output (possibly partial or empty after a timeout).
 */
fun String.runThenGet(timeout: Long = 3000): String {
    val process = ProcessBuilder()
        .command(split(" "))
        .redirectOutput(Redirect.PIPE)
        // Do not leave stderr as an unread pipe: a chatty child could block on it.
        .redirectError(Redirect.INHERIT)
        .start()

    // ByteArrayOutputStream's write/toString are synchronized, so it is safe to read
    // the buffer from this thread while the drainer may still be appending.
    val captured = java.io.ByteArrayOutputStream()
    val drainer = kotlin.concurrent.thread(isDaemon = true) {
        process.inputStream.use { it.copyTo(captured) }
    }

    val exited = process.waitFor(timeout, MILLISECONDS)
    if (!exited) {
        // Enforce the timeout instead of blocking on the read until the child exits.
        process.destroyForcibly()
    }
    // Bounded join: join(0) would wait forever, hence the lower bound of 1 ms.
    drainer.join(maxOf(timeout, 1L))
    return captured.toString("UTF-8")
}
@twocity
Copy link
Author

twocity commented Mar 12, 2018

Usage:

  1. install kscript
  2. kscript transcribe.kts -help

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment