Created
November 16, 2018 04:13
-
-
Save andreaschandra/5850f82148b762542c3f7d0dcf28b006 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def vad_collector(sample_rate, frame_duration_ms, padding_duration_ms, vad, frames):
    """Yield contiguous runs of voiced audio found in ``frames``.

    A sliding window (bounded deque) of the most recent frames is used to
    smooth the per-frame voice decisions. The collector has two states:

    * NOTTRIGGERED: frames are only buffered in the window. When more than
      90% of the window capacity is voiced, the collector switches to
      TRIGGERED and the buffered frames become the start of the segment.
    * TRIGGERED: every frame is collected. When more than 90% of the window
      capacity is unvoiced, the collected audio is yielded and the collector
      goes back to NOTTRIGGERED.

    Progress is traced on ``sys.stdout``: ``1``/``0`` per frame, ``+(ts)``
    when a segment starts and ``-(ts)`` when it ends.

    Args:
        sample_rate: Passed through unchanged to ``vad.is_speech``.
        frame_duration_ms: Duration of one frame, in milliseconds.
        padding_duration_ms: Length of the smoothing window, in
            milliseconds. The window holds
            ``int(padding_duration_ms / frame_duration_ms)`` frames.
        vad: Object exposing ``is_speech(frame_bytes, sample_rate)`` that
            returns a truthy value for voiced frames.
        frames: Iterable of objects with ``bytes``, ``timestamp`` and
            ``duration`` attributes.

    Yields:
        ``bytes``: the concatenated ``bytes`` of every frame in one segment.

    Note:
        If ``padding_duration_ms`` is smaller than ``frame_duration_ms`` the
        window has capacity 0 and the collector can never trigger.
    """
    # Local import keeps the block self-contained; the window was previously
    # referenced without ever being created (NameError on the first frame).
    import collections

    num_padding_frames = int(padding_duration_ms / frame_duration_ms)
    # Bounded deque: appending beyond maxlen silently drops the oldest entry.
    ring_buffer = collections.deque(maxlen=num_padding_frames)
    # Both state transitions use the same ">90% of capacity" threshold.
    threshold = 0.9 * ring_buffer.maxlen

    triggered = False
    voiced_frames = []
    for frame in frames:
        is_speech = vad.is_speech(frame.bytes, sample_rate)
        sys.stdout.write('1' if is_speech else '0')

        if not triggered:
            ring_buffer.append((frame, is_speech))
            num_voiced = sum(1 for _, speech in ring_buffer if speech)
            # If we're NOTTRIGGERED and more than 90% of the frames in
            # the ring buffer are voiced frames, then enter the
            # TRIGGERED state.
            if num_voiced > threshold:
                triggered = True
                sys.stdout.write('+(%s)' % (ring_buffer[0][0].timestamp,))
                # We want to yield all the audio we see from now until
                # we are NOTTRIGGERED, but we have to start with the
                # audio that's already in the ring buffer.
                voiced_frames.extend(f for f, _ in ring_buffer)
                ring_buffer.clear()
        else:
            # We're in the TRIGGERED state, so collect the audio data
            # and add it to the ring buffer.
            voiced_frames.append(frame)
            ring_buffer.append((frame, is_speech))
            num_unvoiced = sum(1 for _, speech in ring_buffer if not speech)
            # If more than 90% of the frames in the ring buffer are
            # unvoiced, then enter NOTTRIGGERED and yield whatever
            # audio we've collected.
            if num_unvoiced > threshold:
                sys.stdout.write('-(%s)' % (frame.timestamp + frame.duration))
                triggered = False
                yield b''.join([f.bytes for f in voiced_frames])
                ring_buffer.clear()
                voiced_frames = []

    # `triggered` can only be True after at least one frame was seen, so
    # `frame` is always bound here.
    if triggered:
        sys.stdout.write('-(%s)' % (frame.timestamp + frame.duration))
    sys.stdout.write('\n')
    # If we have any leftover voiced audio when we run out of input,
    # yield it.
    if voiced_frames:
        yield b''.join([f.bytes for f in voiced_frames])
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment