-
-
Save Bentroen/4df9b8d5d052f9d14bc1a8531fe49994 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from pydub import AudioSegment
import numpy as np
class Mixer:
    """Mix several pydub ``AudioSegment`` objects into one.

    Sounds are scheduled with :meth:`overlay` (at an explicit position in
    milliseconds) or :meth:`append` (at the current end of the mix), then
    rendered into a single segment with :meth:`to_audio_segment`.
    """

    def __init__(self):
        # List of (position_ms, AudioSegment) pairs; mixing happens lazily.
        self.parts = []

    def __len__(self):
        """Return the total duration of the mix in milliseconds."""
        parts = self._sync()
        seg = parts[0][1]
        # The mix ends where the furthest-reaching part ends.
        frame_count = max(offset + seg.frame_count() for offset, seg in parts)
        return int(1000.0 * frame_count / seg.frame_rate)

    def overlay(self, sound, position=0):
        """Schedule ``sound`` to play at ``position`` milliseconds.

        Returns ``self`` so calls can be chained.
        """
        self.parts.append((position, sound))
        return self

    def _sync(self):
        """Return parts as (frame_offset, segment) pairs in a common format.

        All segments are converted to a common frame rate / sample width /
        channel count via pydub's private ``AudioSegment._sync``.
        Bug fix: offsets are now computed from the *synced* frame rate;
        previously the first segment's rate was used, which produced wrong
        offsets when parts had mixed frame rates.
        """
        positions, segs = zip(*self.parts)
        segs = AudioSegment.empty()._sync(*segs)
        frame_rate = segs[0].frame_rate
        offsets = [int(frame_rate * pos / 1000.0) for pos in positions]
        return list(zip(offsets, segs))

    def append(self, sound):
        """Schedule ``sound`` at the current end of the mix.

        Returns ``self`` so calls can be chained (bug fix: previously
        returned ``None``, unlike :meth:`overlay`).
        """
        return self.overlay(sound, position=len(self))

    def to_audio_segment(self):
        """Render all scheduled parts into a single ``AudioSegment``.

        The result is normalized back to full scale, so clipping caused by
        summing overlapping parts does not lose data.
        """
        parts = self._sync()
        seg = parts[0][1]
        channels = seg.channels
        # After _sync() every part shares the same sample width; derive the
        # dtype from it instead of assuming 16-bit input (bug fix: 32-bit
        # WAV data was previously misread as int16).
        sample_dtype = "int%d" % (8 * seg.sample_width)
        frame_count = max(offset + seg.frame_count() for offset, seg in parts)
        sample_count = int(frame_count * seg.channels)
        # Accumulate in a wider integer type so that summing overlapping
        # parts cannot overflow, even for 32-bit source audio.
        output = np.zeros(sample_count, dtype="int64")
        for offset, seg in parts:
            sample_offset = offset * channels
            samples = np.frombuffer(seg.get_array_of_samples(), dtype=sample_dtype)
            start = sample_offset
            end = start + len(samples)
            output[start:end] += samples
        # pydub supports at most 4-byte samples, so clip the 64-bit sums
        # into int32 range before spawning.  For 16-bit input the sums
        # always fit in int32, so this is lossless and matches the old
        # behaviour exactly.
        limits = np.iinfo("int32")
        output = np.clip(output, limits.min, limits.max).astype("int32")
        # Normalize so the mix occupies the full "height" again.
        return seg._spawn(output, overrides={"sample_width": 4}).normalize(headroom=0.0)
Well, 64-bit output won't work that well with pydub, I think. So one possibility would be to rescale the 32-bit input array to 16-bit (in this case it's also automatically normalized):
samples = np.frombuffer(seg.get_array_of_samples(), dtype="int32")
samples = np.int16(samples/np.max(np.abs(samples)) * 32767)
If normalizing isn't wanted, one could also just use the maximal 32-bit integer value as the divisor:
samples = np.int16(samples/2147483647 * 32767)
16 bit int max value: 32767
32 bit int max value: 2147483647
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Hi @Peda1996! You're absolutely right. The code currently does not account for 32-bit WAV, only 16-bit! I only bothered making it support that because it was enough for what I needed, but I do eventually plan to fix this. :)
Your change is fine! Just keep in mind that the code is using an "oversized" array in order to account for clipping. So if you changed `samples` to 32-bit, you probably should change `output` to be 64-bit; otherwise, clipping may occur as it's adding the samples. Thank you for using the script; glad it got to be useful 😄