Created
July 31, 2020 19:58
-
-
Save MaPePeR/4bf437e5057f8b9f5a88f2a7b301ef5c to your computer and use it in GitHub Desktop.
FFmpeg filter to trim silence from the beginning and end of an audio file.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
diff --git a/libavfilter/Makefile b/libavfilter/Makefile | |
index 0dc74f8b70..18ac1d989f 100644 | |
--- a/libavfilter/Makefile | |
+++ b/libavfilter/Makefile | |
@@ -133,6 +133,7 @@ OBJS-$(CONFIG_SIDECHAINCOMPRESS_FILTER) += af_sidechaincompress.o | |
OBJS-$(CONFIG_SIDECHAINGATE_FILTER) += af_agate.o | |
OBJS-$(CONFIG_SILENCEDETECT_FILTER) += af_silencedetect.o | |
OBJS-$(CONFIG_SILENCEREMOVE_FILTER) += af_silenceremove.o | |
+OBJS-$(CONFIG_SILENCETRIM_FILTER) += af_silenceremove.o | |
OBJS-$(CONFIG_SOFALIZER_FILTER) += af_sofalizer.o | |
OBJS-$(CONFIG_STEREOTOOLS_FILTER) += af_stereotools.o | |
OBJS-$(CONFIG_STEREOWIDEN_FILTER) += af_stereowiden.o | |
diff --git a/libavfilter/af_silenceremove.c b/libavfilter/af_silenceremove.c | |
index 7dd8c5a1d8..0de1a956b4 100644 | |
--- a/libavfilter/af_silenceremove.c | |
+++ b/libavfilter/af_silenceremove.c | |
@@ -25,6 +25,7 @@ | |
#include "libavutil/opt.h" | |
#include "libavutil/timestamp.h" | |
+#include "libavutil/audio_fifo.h" | |
#include "audio.h" | |
#include "formats.h" | |
#include "avfilter.h" | |
@@ -45,7 +46,10 @@ enum SilenceMode { | |
SILENCE_TRIM_FLUSH, | |
SILENCE_COPY, | |
SILENCE_COPY_FLUSH, | |
- SILENCE_STOP | |
+ SILENCE_STOP, | |
+ SILENCETRIM_START, | |
+ SILENCETRIM_COPY, | |
+ SILENCETRIM_BUFFER, | |
}; | |
typedef struct SilenceRemoveContext { | |
@@ -98,6 +102,8 @@ typedef struct SilenceRemoveContext { | |
int detection; | |
void (*update)(struct SilenceRemoveContext *s, double sample); | |
double(*compute)(struct SilenceRemoveContext *s, double sample); | |
+ | |
+ AVAudioFifo *fifo; | |
} SilenceRemoveContext; | |
#define OFFSET(x) offsetof(SilenceRemoveContext, x) | |
@@ -679,3 +685,214 @@ AVFilter ff_af_silenceremove = { | |
.inputs = silenceremove_inputs, | |
.outputs = silenceremove_outputs, | |
}; | |
+ | |
+ | |
+ | |
+ | |
+/* | |
+ * User-settable options of the silencetrim filter. | |
+ * | |
+ * start_threshold / stop_threshold are the levels a computed sample volume | |
+ * must exceed to count as non-silence at the start / end of the stream. | |
+ * "window" is multiplied by the input sample rate to size the detection window, | |
+ * so it is expressed in seconds. | |
+ * NOTE(review): "detection" (0 = peak, 1 = rms) is presumably consumed by the | |
+ * shared init() to pick the compute/update callbacks -- confirm against init(). | |
+ */ | |
+static const AVOption silencetrim_options[] = { | |
+    { "start_threshold", "set threshold above which the audio is considered to have started", OFFSET(start_threshold), AV_OPT_TYPE_DOUBLE, {.dbl=0}, 0, DBL_MAX, AF }, | |
+    { "stop_threshold", "set threshold below which trailing audio is considered silence", OFFSET(stop_threshold), AV_OPT_TYPE_DOUBLE, {.dbl=0}, 0, DBL_MAX, AF }, | |
+    { "detection", "set how silence is detected", OFFSET(detection), AV_OPT_TYPE_INT, {.i64=1}, 0, 1, AF, "detection" }, | |
+    { "peak", "detect using peak sample values", 0, AV_OPT_TYPE_CONST, {.i64=0}, 0, 0, AF, "detection" }, | |
+    { "rms", "detect using RMS sample values", 0, AV_OPT_TYPE_CONST, {.i64=1}, 0, 0, AF, "detection" }, | |
+    { "window", "set duration of the detection window in seconds", OFFSET(window_ratio), AV_OPT_TYPE_DOUBLE, {.dbl=0.02}, 0, 10, AF }, | |
+    { NULL } | |
+}; | |
+ | |
+AVFILTER_DEFINE_CLASS(silencetrim); | |
+ | |
+ | |
+/** | |
+ * Configure the input link of the silencetrim filter. | |
+ * | |
+ * Sizes and allocates the detection window from the sample rate, window ratio | |
+ * and channel count, resets the state machine to SILENCETRIM_START and | |
+ * allocates the FIFO that holds candidate trailing silence. | |
+ * | |
+ * @param inlink input link being configured | |
+ * @return 0 on success, AVERROR(ENOMEM) if an allocation fails | |
+ */ | |
+static int trim_config_input(AVFilterLink *inlink) | |
+{ | |
+    AVFilterContext *ctx = inlink->dst; | |
+    SilenceRemoveContext *s = ctx->priv; | |
+    AVFilterLink *outlink = ctx->outputs[0]; | |
+ | |
+    /* Drop anything left over from an earlier configuration so that | |
+     * configuring the link again does not leak the previous buffers. */ | |
+    av_freep(&s->window); | |
+    av_audio_fifo_free(s->fifo); | |
+    s->fifo = NULL; | |
+ | |
+    /* At least one sample per channel, even for a zero window ratio. */ | |
+    s->window_size = FFMAX((inlink->sample_rate * s->window_ratio), 1) * inlink->channels; | |
+    s->window = av_malloc_array(s->window_size, sizeof(*s->window)); | |
+    if (!s->window) | |
+        return AVERROR(ENOMEM); | |
+ | |
+    clear_window(s); | |
+ | |
+    s->mode = SILENCETRIM_START; | |
+ | |
+    /* Initial capacity is sample_rate * 4 samples, i.e. four seconds of audio. */ | |
+    s->fifo = av_audio_fifo_alloc(outlink->format, outlink->channels, inlink->sample_rate * 4); | |
+    if (!s->fifo) { | |
+        /* Do not leave a half-configured context behind. */ | |
+        av_freep(&s->window); | |
+        return AVERROR(ENOMEM); | |
+    } | |
+ | |
+    return 0; | |
+} | |
+ | |
+/** | |
+ * Send the samples [start_index, end_index] (inclusive) of an input frame | |
+ * downstream as a new output frame. | |
+ * | |
+ * The data is copied from the first data plane as interleaved doubles, and the | |
+ * output frame is stamped with the running s->next_pts, which is then advanced | |
+ * by the duration of the emitted samples. The input frame is not freed here. | |
+ * | |
+ * @param inlink      input link the frame arrived on | |
+ * @param in          source frame | |
+ * @param start_index first sample to copy | |
+ * @param end_index   last sample to copy (inclusive) | |
+ * @return result of ff_filter_frame(), 0 if the range is empty, | |
+ *         AVERROR(EINVAL) if the range lies outside the frame, | |
+ *         AVERROR(ENOMEM) if the output buffer cannot be allocated | |
+ */ | |
+static int filter_subframe(AVFilterLink *inlink, AVFrame *in, int start_index, int end_index) { | |
+    AVFilterContext *ctx = inlink->dst; | |
+    AVFilterLink *outlink = ctx->outputs[0]; | |
+    SilenceRemoveContext *s = ctx->priv; | |
+    const int nb_samples = end_index - start_index + 1; | |
+    const size_t sample_bytes = inlink->channels * sizeof(double); | |
+    AVFrame *out; | |
+ | |
+    /* Reject ranges that would read outside the input frame. */ | |
+    if (start_index < 0 || end_index >= in->nb_samples) | |
+        return AVERROR(EINVAL); | |
+ | |
+    /* Nothing to emit; requesting a zero-sized buffer would fail. */ | |
+    if (nb_samples <= 0) | |
+        return 0; | |
+ | |
+    out = ff_get_audio_buffer(outlink, nb_samples); | |
+    if (!out) { | |
+        return AVERROR(ENOMEM); | |
+    } | |
+ | |
+    memcpy(out->data[0], in->data[0] + start_index * sample_bytes, nb_samples * sample_bytes); | |
+    out->pts = s->next_pts; | |
+    s->next_pts += av_rescale_q(out->nb_samples, | |
+                                (AVRational){1, outlink->sample_rate}, | |
+                                outlink->time_base); | |
+    return ff_filter_frame(outlink, out); | |
+} | |
+ | |
+/** | |
+ * Append the tail of an input frame, from start_index to the last sample, | |
+ * to the silence FIFO. | |
+ * | |
+ * The frame data is read from the first plane as interleaved doubles. | |
+ * A failed or short write is logged, because this function has no way to | |
+ * report it to the caller. | |
+ * | |
+ * @param inlink      input link the frame arrived on | |
+ * @param in          source frame (not freed here) | |
+ * @param start_index first sample of the tail to buffer | |
+ */ | |
+static void buffer_frame_end(AVFilterLink *inlink, AVFrame *in, int start_index) { | |
+    AVFilterContext *ctx = inlink->dst; | |
+    SilenceRemoveContext *s = ctx->priv; | |
+    int out_samples = in->nb_samples - start_index; | |
+    double *sub_frame_data; | |
+    int ret; | |
+ | |
+    /* Nothing to buffer, or the offset lies outside the frame. */ | |
+    if (start_index < 0 || out_samples <= 0) | |
+        return; | |
+ | |
+    sub_frame_data = (double*)in->data[0] + start_index * inlink->channels; | |
+    ret = av_audio_fifo_write(s->fifo, (void**)&sub_frame_data, out_samples); | |
+    if (ret < out_samples) | |
+        av_log(ctx, AV_LOG_ERROR, "Could only buffer %d of %d trailing samples.\n", | |
+               FFMAX(ret, 0), out_samples); | |
+} | |
+ | |
+/** | |
+ * Emit everything currently held in the silence FIFO as one output frame. | |
+ * | |
+ * @return 0 if the FIFO is empty, otherwise the result of ff_filter_frame() | |
+ *         or a negative AVERROR code on allocation/read failure | |
+ */ | |
+static int flush_buffered_silence(AVFilterLink *outlink, SilenceRemoveContext *s) | |
+{ | |
+    const int buffered = av_audio_fifo_size(s->fifo); | |
+    AVFrame *out; | |
+    int ret; | |
+ | |
+    if (buffered <= 0) | |
+        return 0; | |
+ | |
+    out = ff_get_audio_buffer(outlink, buffered); | |
+    if (!out) | |
+        return AVERROR(ENOMEM); | |
+ | |
+    ret = av_audio_fifo_read(s->fifo, (void**)out->extended_data, out->nb_samples); | |
+    if (ret < 0) { | |
+        av_frame_free(&out); | |
+        return ret; | |
+    } | |
+ | |
+    out->pts = s->next_pts; | |
+    s->next_pts += av_rescale_q(out->nb_samples, | |
+                                (AVRational){1, outlink->sample_rate}, | |
+                                outlink->time_base); | |
+    return ff_filter_frame(outlink, out); | |
+} | |
+ | |
+/** | |
+ * Process one input frame of the silencetrim filter. | |
+ * | |
+ * The frame is scanned once to find the first sample above start_threshold and | |
+ * the last sample above stop_threshold. What happens next depends on s->mode: | |
+ *  - SILENCETRIM_START:  leading silence is dropped until audio starts. | |
+ *  - SILENCETRIM_COPY:   samples up to the last non-silent one are passed on. | |
+ *  - SILENCETRIM_BUFFER: silence is held in the FIFO; it is only emitted if | |
+ *                        audio resumes, so silence at the very end is never output. | |
+ * | |
+ * The input frame is read as interleaved doubles from its first data plane and | |
+ * is freed on every path, including errors. | |
+ * | |
+ * @param inlink input link the frame arrived on | |
+ * @param in     input frame | |
+ * @return 0 on success, a negative AVERROR code on failure | |
+ */ | |
+static int trim_filter_frame(AVFilterLink *inlink, AVFrame *in) | |
+{ | |
+    AVFilterContext *ctx = inlink->dst; | |
+    AVFilterLink *outlink = ctx->outputs[0]; | |
+    SilenceRemoveContext *s = ctx->priv; | |
+    double *ibuf = (double *)in->data[0]; | |
+    int i, j; | |
+    int above_start_threshold; | |
+    int above_stop_threshold; | |
+    double sample_volume; | |
+    int first_non_silence_sample_in_frame = -1; | |
+    int last_non_silence_sample_in_frame = -1; | |
+    int ret = 0; | |
+ | |
+    for (i = 0; i < in->nb_samples; i++) { | |
+        above_start_threshold = 0; | |
+        above_stop_threshold = 0; | |
+        for (j = 0; j < inlink->channels; j++) { | |
+            sample_volume = s->compute(s, ibuf[i * inlink->channels + j]); | |
+            //sample_volume might be NaN; the comparisons are then false, so NaN counts as silence. | |
+            above_start_threshold |= sample_volume > s->start_threshold; | |
+            above_stop_threshold |= sample_volume > s->stop_threshold; | |
+        } | |
+        if (above_start_threshold && first_non_silence_sample_in_frame == -1) { | |
+            first_non_silence_sample_in_frame = i; | |
+            last_non_silence_sample_in_frame = i; | |
+        } | |
+        if (above_stop_threshold) { | |
+            last_non_silence_sample_in_frame = i; | |
+        } | |
+        //The window is only updated after all channels of this sample were evaluated. | |
+        for (j = 0; j < inlink->channels; j++) { | |
+            s->update(s, ibuf[i * inlink->channels + j]); | |
+        } | |
+    } | |
+ | |
+    switch (s->mode) { | |
+    case SILENCETRIM_START: | |
+        //Still trimming silence from the start of the stream: drop the frame. | |
+        if (first_non_silence_sample_in_frame == -1) | |
+            break; | |
+ | |
+        //The audio started in this frame; everything before it is discarded. | |
+        ret = filter_subframe(inlink, in, first_non_silence_sample_in_frame, last_non_silence_sample_in_frame); | |
+        if (ret < 0) | |
+            goto end; | |
+ | |
+        if (last_non_silence_sample_in_frame < in->nb_samples - 1) { | |
+            //Silence also started in this frame: buffer the remaining samples. | |
+            buffer_frame_end(inlink, in, last_non_silence_sample_in_frame + 1); | |
+            s->mode = SILENCETRIM_BUFFER; | |
+        } else { | |
+            s->mode = SILENCETRIM_COPY; | |
+        } | |
+        break; | |
+ | |
+    case SILENCETRIM_COPY: | |
+        //Pass on everything up to the last non-silent sample (possibly the whole frame). | |
+        //A completely silent frame has nothing to pass on and is buffered below instead. | |
+        if (last_non_silence_sample_in_frame >= 0) { | |
+            ret = filter_subframe(inlink, in, 0, last_non_silence_sample_in_frame); | |
+            if (ret < 0) | |
+                goto end; | |
+        } | |
+        if (last_non_silence_sample_in_frame < in->nb_samples - 1) { | |
+            //Silence started in this frame: buffer the remaining samples. | |
+            buffer_frame_end(inlink, in, last_non_silence_sample_in_frame + 1); | |
+            s->mode = SILENCETRIM_BUFFER; | |
+        } | |
+        break; | |
+ | |
+    case SILENCETRIM_BUFFER: | |
+        //Buffering silence that might turn out to be the end of all audio. | |
+        if (first_non_silence_sample_in_frame == -1) { | |
+            //Silence did not end in this frame: buffer the whole frame. | |
+            ret = av_audio_fifo_write(s->fifo, (void**)in->extended_data, in->nb_samples); | |
+            break; | |
+        } | |
+ | |
+        //Silence ended in this frame, so the buffered silence must not be trimmed: | |
+        //emit the buffer first, then this frame up to its last non-silent sample. | |
+        ret = flush_buffered_silence(outlink, s); | |
+        if (ret < 0) | |
+            goto end; | |
+ | |
+        ret = filter_subframe(inlink, in, 0, last_non_silence_sample_in_frame); | |
+        if (ret < 0) | |
+            goto end; | |
+ | |
+        if (last_non_silence_sample_in_frame < in->nb_samples - 1) { | |
+            buffer_frame_end(inlink, in, last_non_silence_sample_in_frame + 1); | |
+            s->mode = SILENCETRIM_BUFFER; | |
+        } else { | |
+            s->mode = SILENCETRIM_COPY; | |
+        } | |
+        break; | |
+ | |
+    default: | |
+        //Modes belonging to other filters sharing this context are not handled here. | |
+        break; | |
+    } | |
+ | |
+end: | |
+    //The input frame is released on every path, including errors. | |
+    av_frame_free(&in); | |
+    return ret < 0 ? ret : 0; | |
+} | |
+ | |
+/** | |
+ * Release the resources owned by a silencetrim instance: the detection window | |
+ * allocated in trim_config_input() and the silence FIFO. | |
+ * | |
+ * Safe to call when configuration never happened or failed part-way, since both | |
+ * release calls accept NULL. | |
+ */ | |
+static av_cold void trim_uninit(AVFilterContext *ctx) | |
+{ | |
+    SilenceRemoveContext *s = ctx->priv; | |
+ | |
+    av_freep(&s->window); | |
+    av_audio_fifo_free(s->fifo); | |
+    s->fifo = NULL; | |
+} | |
+ | |
+/* Input pads of the silencetrim filter: a single audio pad, configured by | |
+ * trim_config_input() and fed frame by frame through trim_filter_frame(). | |
+ * The list ends with a NULL entry. */ | |
+static const AVFilterPad silencetrim_inputs[] = { | |
+ { | |
+ .name = "default", | |
+ .type = AVMEDIA_TYPE_AUDIO, | |
+ .config_props = trim_config_input, | |
+ .filter_frame = trim_filter_frame, | |
+ }, | |
+ { NULL } | |
+}; | |
+ | |
+/* Output pads of the silencetrim filter: a single audio pad with no callbacks | |
+ * of its own. The list ends with a NULL entry. */ | |
+static const AVFilterPad silencetrim_outputs[] = { | |
+ { | |
+ .name = "default", | |
+ .type = AVMEDIA_TYPE_AUDIO, | |
+ }, | |
+ { NULL } | |
+}; | |
+ | |
+/* | |
+ * Filter definition for "silencetrim". | |
+ * | |
+ * It shares the private context, init() and query_formats() with the other | |
+ * filter in this file, but has its own option class (silencetrim_class, built | |
+ * from silencetrim_options) so that only the options this filter honours are | |
+ * exposed to the user. | |
+ */ | |
+AVFilter ff_af_silencetrim = { | |
+    .name = "silencetrim", | |
+    .description = NULL_IF_CONFIG_SMALL("Trim silence from start and end of audio."), | |
+    .priv_size = sizeof(SilenceRemoveContext), | |
+    .priv_class = &silencetrim_class, | |
+    .init = init, | |
+    .uninit = trim_uninit, | |
+    .query_formats = query_formats, | |
+    .inputs = silencetrim_inputs, | |
+    .outputs = silencetrim_outputs, | |
+}; | |
diff --git a/libavfilter/allfilters.c b/libavfilter/allfilters.c | |
index 3f70153986..565c475ef4 100644 | |
--- a/libavfilter/allfilters.c | |
+++ b/libavfilter/allfilters.c | |
@@ -127,6 +127,7 @@ extern AVFilter ff_af_sidechaincompress; | |
extern AVFilter ff_af_sidechaingate; | |
extern AVFilter ff_af_silencedetect; | |
extern AVFilter ff_af_silenceremove; | |
+extern AVFilter ff_af_silencetrim; | |
extern AVFilter ff_af_sofalizer; | |
extern AVFilter ff_af_stereotools; | |
extern AVFilter ff_af_stereowiden; |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment