Created
March 12, 2018 02:16
-
-
Save twocity/7b2941b12fd813d935025cf33325a11a to your computer and use it in GitHub Desktop.
Convert an audio file to text through the Google Cloud Speech API.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env kscript | |
@file:DependsOn("com.squareup.okhttp3:okhttp:3.10.0") | |
@file:DependsOn("com.google.code.gson:gson:2.8.2") | |
@file:DependsOn("commons-cli:commons-cli:1.4") | |
import com.google.gson.Gson | |
import com.google.gson.JsonParser | |
import okhttp3.Interceptor | |
import okhttp3.Interceptor.Chain | |
import okhttp3.MediaType | |
import okhttp3.OkHttpClient | |
import okhttp3.Request | |
import okhttp3.RequestBody | |
import okhttp3.Response | |
import okio.Okio | |
import org.apache.commons.cli.DefaultParser | |
import org.apache.commons.cli.HelpFormatter | |
import org.apache.commons.cli.MissingArgumentException | |
import org.apache.commons.cli.Option | |
import org.apache.commons.cli.Options | |
import java.io.File | |
import java.lang.ProcessBuilder.Redirect | |
import java.util.concurrent.TimeUnit.MILLISECONDS | |
/** | |
 * Convert an audio file to text through the Google Cloud Speech API.
*/ | |
// Interceptor that adds an "Authorization: Bearer <token>" header to each request
// before handing it on down the chain. The token is fetched via accessToken()
// every time a request passes through.
val accessTokenInterceptor = Interceptor { chain ->
    val authorized = chain.request()
        .newBuilder()
        .addHeader("Authorization", "Bearer ${accessToken()}")
        .build()
    chain.proceed(authorized)
}
// Shared HTTP client; all calls made through it pass through accessTokenInterceptor.
val httpClient = OkHttpClient.Builder().run {
    addInterceptor(accessTokenInterceptor)
    build()
}

// Gson instance used to serialize the request payload to JSON.
val gson = Gson()
// Header text passed to HelpFormatter.printHelp; it walks the user through the
// encode -> upload -> transcribe -> query workflow.
// Fix: the product name was misspelled "Cloud SKD"; it is the Cloud SDK.
val helpHeader = """
A simple tool to convert audio to text by Google Cloud Speech API.
Please make sure Cloud SDK(https://cloud.google.com/sdk/) was installed before you get started.
Steps:
1. Transform audio encoding to LINEAR16, example:
ffmpeg -i input.mp3 -f s16le -acodec pcm_s16le -ac 1 -ar 16k output.raw
see https://cloud.google.com/speech/docs/best-practices
2. Upload output.raw:
gsutil cp output.raw gs://bucket-name
3. Create a long running async transcribe operation:
kscript transcribe.kts -t gs://bucket-name/output.raw
4. Query the result:
kscript transcribe.kts -q name
See https://cloud.google.com/speech/docs/async-recognize for more help.
"""
// Command-line options understood by the script:
//   -help       print help information
//   -t <uri>    audio file to transcribe
//   -q <name>   query operation result by given name
val options: Options = Options()
    .addOption("help", false, "print help information")
    .addOption(
        Option.builder("t")
            .hasArg()
            .argName("uri")
            .desc("audio file")
            .build())
    .addOption(
        Option.builder("q")
            .hasArg()
            .argName("name")
            .desc("query operation result by given name")
            .build())
val formatter = HelpFormatter()

// Script entry point: parse the command line and dispatch to the matching action.
// -t starts a transcription, -q queries an operation, anything else prints help.
try {
    // Third argument is Commons CLI's stopAtNonOption flag.
    val command = DefaultParser().parse(options, args, true)
    when {
        command.hasOption("t") -> transcribe(command.getOptionValue("t"))
        command.hasOption("q") -> query(command.getOptionValue("q"))
        // Fix: the usage line used to say "kscript blog.kts", but the help text and the
        // messages printed by transcribe() all call this script transcribe.kts.
        else -> formatter.printHelp("kscript transcribe.kts", helpHeader, options, null, true)
    }
} catch (e: MissingArgumentException) {
    // -t or -q given without its value: report the parser's message instead of a stack trace.
    println(e.message)
}
/**
 * Creates a long-running recognize operation for the audio object at [uri].
 *
 * On success, prints the command to run to query the operation's result; on an
 * unsuccessful HTTP response, prints the response body. A [uri] that does not
 * start with `gs://` is rejected with a message and no request is sent.
 *
 * @param uri location of the audio, in `gs://...` form.
 */
fun transcribe(uri: String) {
    if (!uri.startsWith("gs://")) {
        println("invalid uri: $uri")
        return
    }

    val recognition = Config(encoding = "LINEAR16", sampleRateHertz = 16000, languageCode = "en-US")
    val jsonBody = gson.toJson(Payload(recognition, Audio(uri)))
    val request = Request.Builder()
        .post(RequestBody.create(MediaType.parse("application/json"), jsonBody))
        .url("https://speech.googleapis.com/v1/speech:longrunningrecognize")
        .build()

    httpClient.newCall(request).execute().use { response ->
        if (!response.isSuccessful) {
            println("bad request: \n${response.body()?.string()}")
        } else {
            println("create operation successfully, use following command to query the result:")
            val operation = JsonParser().parse(response.body()!!.charStream()).asJsonObject
            val operationName = operation.get("name").asString
            println("use kscript transcribe.kts -q $operationName")
        }
    }
}
/**
 * Fetches the state of the long-running operation [name] and reports it.
 *
 * - HTTP failure: prints the response body.
 * - Operation not finished: prints the progress percentage (0 when none is reported).
 * - Operation finished with an `error` member: prints that error.
 * - Operation finished normally: writes the first alternative's transcript of every
 *   result to `<name>.txt`, one per line.
 *
 * Missing `metadata`, `response`, `results`, `alternatives` or `transcript` members are
 * tolerated rather than crashing with a NullPointerException.
 *
 * @param name operation name, as printed when the operation was created.
 */
fun query(name: String) {
    val request = Request.Builder()
        .url("https://speech.googleapis.com/v1/operations/$name")
        .build()
    println("querying $name...")
    httpClient.newCall(request).execute().use { response ->
        if (!response.isSuccessful) {
            println("failed request: \n${response.body()?.string()}")
            return
        }

        val json = JsonParser().parse(response.body()!!.charStream()).asJsonObject

        // A missing "done" member is treated as "not done".
        if (json.get("done")?.asBoolean != true) {
            // "metadata" may be absent, so it is read null-safely.
            val progress = json.getAsJsonObject("metadata")?.get("progressPercent")?.asInt ?: 0
            println("operation not done yet, progress: $progress%")
            return
        }

        // A finished operation carries either "error" or "response"; report the former
        // instead of dereferencing a missing "response".
        val failure = json.get("error")
        if (failure != null) {
            println("operation failed: \n$failure")
            return
        }

        // Results without alternatives or a transcript are skipped.
        val transcripts = json.getAsJsonObject("response")
            ?.getAsJsonArray("results")
            ?.mapNotNull { result ->
                result.asJsonObject.getAsJsonArray("alternatives")
                    ?.firstOrNull()
                    ?.asJsonObject
                    ?.get("transcript")
                    ?.asString
            }
            ?: emptyList()

        Okio.buffer(Okio.sink(File("$name.txt"))).use { output ->
            transcripts.forEach { line ->
                output.writeUtf8(line)
                output.writeUtf8("\n")
            }
        }
        println("transcribe done ^_^, see $name.txt")
    }
}
/** Request body for the recognize call: recognition settings plus the audio reference. */
data class Payload(
    val config: Config,
    val audio: Audio
)

/** Recognition settings: audio encoding, sample rate in hertz, and language code. */
data class Config(
    val encoding: String,
    val sampleRateHertz: Long,
    val languageCode: String
)

/** Reference to the audio to transcribe, identified by its URI. */
data class Audio(
    val uri: String
)
/**
 * Obtains an access token by running the gcloud command-line tool.
 *
 * @return the command's standard output with surrounding whitespace removed.
 */
fun accessToken(): String {
    val rawOutput = "gcloud auth application-default print-access-token".runThenGet()
    return rawOutput.trim()
}
/**
 * Runs this string as an external command and returns everything it wrote to standard output.
 *
 * The command line is split on single spaces, so arguments that contain spaces are not supported.
 *
 * Standard output is drained to end-of-stream before waiting for the process. Waiting first
 * (as this function used to) stalls for the whole timeout whenever the child writes more
 * than the pipe buffer holds, because the child blocks on write while the parent blocks in
 * waitFor. The stream is closed once read.
 *
 * @param timeout maximum time, in milliseconds, to wait for the process to exit after its
 *   output has been fully read; a process still alive after that is forcibly terminated.
 * @return the captured standard output, decoded as UTF-8.
 */
fun String.runThenGet(timeout: Long = 3000): String {
    val process = ProcessBuilder()
        .command(split(" "))
        .redirectOutput(Redirect.PIPE)
        .start()

    // Read first so the child can never block on a full stdout pipe.
    val captured = process.inputStream.bufferedReader().use { it.readText() }

    // Wait for the child to exit, and do not leave a lingering process behind.
    if (!process.waitFor(timeout, MILLISECONDS)) {
        process.destroyForcibly()
    }
    return captured
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Usage: