Python program to automatically generate onset-detected splices in Morphagene reels
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Usage:
    morphagene_onset.py -w <inputwavfile> -o <outputfile> -s <splicecount>

Use the Superflux onset detection algorithm with backtracking to generate
splice locations, then write those splice locations into a converted WAV
(32-bit float / 48000 Hz) to make Morphagene reels.

This method typically generates a splice on every percussive hit of a sample,
so either choose a sample of appropriate length or use [splicecount] to stay
within the Morphagene's limit of 300 splices.

Uses wavfile code from scipy (1.6.1) and the enhanced wavfile.py by
josephernest.
"""

import librosa
import sys, getopt, os
import struct
import numpy as np
from scipy import interpolate
import warnings
import io
from enum import IntEnum

__all__ = [
    'WavFileWarning',
    'read',
    'write'
]


class WavFileWarning(UserWarning):
    pass


class WAVE_FORMAT(IntEnum):
    """
    WAVE form wFormatTag IDs
    Complete list is in mmreg.h in Windows 10 SDK.  ALAC and OPUS are the
    newest additions, in v10.0.14393 2016-07
    """
    UNKNOWN = 0x0000
    PCM = 0x0001
    ADPCM = 0x0002
    IEEE_FLOAT = 0x0003
    VSELP = 0x0004
    IBM_CVSD = 0x0005
    ALAW = 0x0006
    MULAW = 0x0007
    DTS = 0x0008
    DRM = 0x0009
    WMAVOICE9 = 0x000A
    WMAVOICE10 = 0x000B
    OKI_ADPCM = 0x0010
    DVI_ADPCM = 0x0011
    IMA_ADPCM = 0x0011  # Duplicate
    MEDIASPACE_ADPCM = 0x0012
    SIERRA_ADPCM = 0x0013
    G723_ADPCM = 0x0014
    DIGISTD = 0x0015
    DIGIFIX = 0x0016
    DIALOGIC_OKI_ADPCM = 0x0017
    MEDIAVISION_ADPCM = 0x0018
    CU_CODEC = 0x0019
    HP_DYN_VOICE = 0x001A
    YAMAHA_ADPCM = 0x0020
    SONARC = 0x0021
    DSPGROUP_TRUESPEECH = 0x0022
    ECHOSC1 = 0x0023
    AUDIOFILE_AF36 = 0x0024
    APTX = 0x0025
    AUDIOFILE_AF10 = 0x0026
    PROSODY_1612 = 0x0027
    LRC = 0x0028
    DOLBY_AC2 = 0x0030
    GSM610 = 0x0031
    MSNAUDIO = 0x0032
    ANTEX_ADPCME = 0x0033
    CONTROL_RES_VQLPC = 0x0034
    DIGIREAL = 0x0035
    DIGIADPCM = 0x0036
    CONTROL_RES_CR10 = 0x0037
    NMS_VBXADPCM = 0x0038
    CS_IMAADPCM = 0x0039
    ECHOSC3 = 0x003A
    ROCKWELL_ADPCM = 0x003B
    ROCKWELL_DIGITALK = 0x003C
    XEBEC = 0x003D
    G721_ADPCM = 0x0040
    G728_CELP = 0x0041
    MSG723 = 0x0042
    INTEL_G723_1 = 0x0043
    INTEL_G729 = 0x0044
    SHARP_G726 = 0x0045
    MPEG = 0x0050
    RT24 = 0x0052
    PAC = 0x0053
    MPEGLAYER3 = 0x0055
    LUCENT_G723 = 0x0059
    CIRRUS = 0x0060
    ESPCM = 0x0061
    VOXWARE = 0x0062
    CANOPUS_ATRAC = 0x0063
    G726_ADPCM = 0x0064
    G722_ADPCM = 0x0065
    DSAT = 0x0066
    DSAT_DISPLAY = 0x0067
    VOXWARE_BYTE_ALIGNED = 0x0069
    VOXWARE_AC8 = 0x0070
    VOXWARE_AC10 = 0x0071
    VOXWARE_AC16 = 0x0072
    VOXWARE_AC20 = 0x0073
    VOXWARE_RT24 = 0x0074
    VOXWARE_RT29 = 0x0075
    VOXWARE_RT29HW = 0x0076
    VOXWARE_VR12 = 0x0077
    VOXWARE_VR18 = 0x0078
    VOXWARE_TQ40 = 0x0079
    VOXWARE_SC3 = 0x007A
    VOXWARE_SC3_1 = 0x007B
    SOFTSOUND = 0x0080
    VOXWARE_TQ60 = 0x0081
    MSRT24 = 0x0082
    G729A = 0x0083
    MVI_MVI2 = 0x0084
    DF_G726 = 0x0085
    DF_GSM610 = 0x0086
    ISIAUDIO = 0x0088
    ONLIVE = 0x0089
    MULTITUDE_FT_SX20 = 0x008A
    INFOCOM_ITS_G721_ADPCM = 0x008B
    CONVEDIA_G729 = 0x008C
    CONGRUENCY = 0x008D
    SBC24 = 0x0091
    DOLBY_AC3_SPDIF = 0x0092
    MEDIASONIC_G723 = 0x0093
    PROSODY_8KBPS = 0x0094
    ZYXEL_ADPCM = 0x0097
    PHILIPS_LPCBB = 0x0098
    PACKED = 0x0099
    MALDEN_PHONYTALK = 0x00A0
    RACAL_RECORDER_GSM = 0x00A1
    RACAL_RECORDER_G720_A = 0x00A2
    RACAL_RECORDER_G723_1 = 0x00A3
    RACAL_RECORDER_TETRA_ACELP = 0x00A4
    NEC_AAC = 0x00B0
    RAW_AAC1 = 0x00FF
    RHETOREX_ADPCM = 0x0100
    IRAT = 0x0101
    VIVO_G723 = 0x0111
    VIVO_SIREN = 0x0112
    PHILIPS_CELP = 0x0120
    PHILIPS_GRUNDIG = 0x0121
    DIGITAL_G723 = 0x0123
    SANYO_LD_ADPCM = 0x0125
    SIPROLAB_ACEPLNET = 0x0130
    SIPROLAB_ACELP4800 = 0x0131
    SIPROLAB_ACELP8V3 = 0x0132
    SIPROLAB_G729 = 0x0133
    SIPROLAB_G729A = 0x0134
    SIPROLAB_KELVIN = 0x0135
    VOICEAGE_AMR = 0x0136
    G726ADPCM = 0x0140
    DICTAPHONE_CELP68 = 0x0141
    DICTAPHONE_CELP54 = 0x0142
    QUALCOMM_PUREVOICE = 0x0150
    QUALCOMM_HALFRATE = 0x0151
    TUBGSM = 0x0155
    MSAUDIO1 = 0x0160
    WMAUDIO2 = 0x0161
    WMAUDIO3 = 0x0162
    WMAUDIO_LOSSLESS = 0x0163
    WMASPDIF = 0x0164
    UNISYS_NAP_ADPCM = 0x0170
    UNISYS_NAP_ULAW = 0x0171
    UNISYS_NAP_ALAW = 0x0172
    UNISYS_NAP_16K = 0x0173
    SYCOM_ACM_SYC008 = 0x0174
    SYCOM_ACM_SYC701_G726L = 0x0175
    SYCOM_ACM_SYC701_CELP54 = 0x0176
    SYCOM_ACM_SYC701_CELP68 = 0x0177
    KNOWLEDGE_ADVENTURE_ADPCM = 0x0178
    FRAUNHOFER_IIS_MPEG2_AAC = 0x0180
    DTS_DS = 0x0190
    CREATIVE_ADPCM = 0x0200
    CREATIVE_FASTSPEECH8 = 0x0202
    CREATIVE_FASTSPEECH10 = 0x0203
    UHER_ADPCM = 0x0210
    ULEAD_DV_AUDIO = 0x0215
    ULEAD_DV_AUDIO_1 = 0x0216
    QUARTERDECK = 0x0220
    ILINK_VC = 0x0230
    RAW_SPORT = 0x0240
    ESST_AC3 = 0x0241
    GENERIC_PASSTHRU = 0x0249
    IPI_HSX = 0x0250
    IPI_RPELP = 0x0251
    CS2 = 0x0260
    SONY_SCX = 0x0270
    SONY_SCY = 0x0271
    SONY_ATRAC3 = 0x0272
    SONY_SPC = 0x0273
    TELUM_AUDIO = 0x0280
    TELUM_IA_AUDIO = 0x0281
    NORCOM_VOICE_SYSTEMS_ADPCM = 0x0285
    FM_TOWNS_SND = 0x0300
    MICRONAS = 0x0350
    MICRONAS_CELP833 = 0x0351
    BTV_DIGITAL = 0x0400
    INTEL_MUSIC_CODER = 0x0401
    INDEO_AUDIO = 0x0402
    QDESIGN_MUSIC = 0x0450
    ON2_VP7_AUDIO = 0x0500
    ON2_VP6_AUDIO = 0x0501
    VME_VMPCM = 0x0680
    TPC = 0x0681
    LIGHTWAVE_LOSSLESS = 0x08AE
    OLIGSM = 0x1000
    OLIADPCM = 0x1001
    OLICELP = 0x1002
    OLISBC = 0x1003
    OLIOPR = 0x1004
    LH_CODEC = 0x1100
    LH_CODEC_CELP = 0x1101
    LH_CODEC_SBC8 = 0x1102
    LH_CODEC_SBC12 = 0x1103
    LH_CODEC_SBC16 = 0x1104
    NORRIS = 0x1400
    ISIAUDIO_2 = 0x1401
    SOUNDSPACE_MUSICOMPRESS = 0x1500
    MPEG_ADTS_AAC = 0x1600
    MPEG_RAW_AAC = 0x1601
    MPEG_LOAS = 0x1602
    NOKIA_MPEG_ADTS_AAC = 0x1608
    NOKIA_MPEG_RAW_AAC = 0x1609
    VODAFONE_MPEG_ADTS_AAC = 0x160A
    VODAFONE_MPEG_RAW_AAC = 0x160B
    MPEG_HEAAC = 0x1610
    VOXWARE_RT24_SPEECH = 0x181C
    SONICFOUNDRY_LOSSLESS = 0x1971
    INNINGS_TELECOM_ADPCM = 0x1979
    LUCENT_SX8300P = 0x1C07
    LUCENT_SX5363S = 0x1C0C
    CUSEEME = 0x1F03
    NTCSOFT_ALF2CM_ACM = 0x1FC4
    DVM = 0x2000
    DTS2 = 0x2001
    MAKEAVIS = 0x3313
    DIVIO_MPEG4_AAC = 0x4143
    NOKIA_ADAPTIVE_MULTIRATE = 0x4201
    DIVIO_G726 = 0x4243
    LEAD_SPEECH = 0x434C
    LEAD_VORBIS = 0x564C
    WAVPACK_AUDIO = 0x5756
    OGG_VORBIS_MODE_1 = 0x674F
    OGG_VORBIS_MODE_2 = 0x6750
    OGG_VORBIS_MODE_3 = 0x6751
    OGG_VORBIS_MODE_1_PLUS = 0x676F
    OGG_VORBIS_MODE_2_PLUS = 0x6770
    OGG_VORBIS_MODE_3_PLUS = 0x6771
    ALAC = 0x6C61
    _3COM_NBX = 0x7000  # Can't have leading digit
    OPUS = 0x704F
    FAAD_AAC = 0x706D
    AMR_NB = 0x7361
    AMR_WB = 0x7362
    AMR_WP = 0x7363
    GSM_AMR_CBR = 0x7A21
    GSM_AMR_VBR_SID = 0x7A22
    COMVERSE_INFOSYS_G723_1 = 0xA100
    COMVERSE_INFOSYS_AVQSBC = 0xA101
    COMVERSE_INFOSYS_SBC = 0xA102
    SYMBOL_G729_A = 0xA103
    VOICEAGE_AMR_WB = 0xA104
    INGENIENT_G726 = 0xA105
    MPEG4_AAC = 0xA106
    ENCORE_G726 = 0xA107
    ZOLL_ASAO = 0xA108
    SPEEX_VOICE = 0xA109
    VIANIX_MASC = 0xA10A
    WM9_SPECTRUM_ANALYZER = 0xA10B
    WMF_SPECTRUM_ANAYZER = 0xA10C
    GSM_610 = 0xA10D
    GSM_620 = 0xA10E
    GSM_660 = 0xA10F
    GSM_690 = 0xA110
    GSM_ADAPTIVE_MULTIRATE_WB = 0xA111
    POLYCOM_G722 = 0xA112
    POLYCOM_G728 = 0xA113
    POLYCOM_G729_A = 0xA114
    POLYCOM_SIREN = 0xA115
    GLOBAL_IP_ILBC = 0xA116
    RADIOTIME_TIME_SHIFT_RADIO = 0xA117
    NICE_ACA = 0xA118
    NICE_ADPCM = 0xA119
    VOCORD_G721 = 0xA11A
    VOCORD_G726 = 0xA11B
    VOCORD_G722_1 = 0xA11C
    VOCORD_G728 = 0xA11D
    VOCORD_G729 = 0xA11E
    VOCORD_G729_A = 0xA11F
    VOCORD_G723_1 = 0xA120
    VOCORD_LBC = 0xA121
    NICE_G728 = 0xA122
    FRACE_TELECOM_G729 = 0xA123
    CODIAN = 0xA124
    FLAC = 0xF1AC
    EXTENSIBLE = 0xFFFE
    DEVELOPMENT = 0xFFFF


KNOWN_WAVE_FORMATS = {WAVE_FORMAT.PCM, WAVE_FORMAT.IEEE_FLOAT}
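
# Quick illustration of how the enum and KNOWN_WAVE_FORMATS are used below
# (values taken from the table above):
# >>> WAVE_FORMAT(0x0003).name
# 'IEEE_FLOAT'
# >>> WAVE_FORMAT.PCM in KNOWN_WAVE_FORMATS
# True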


def _raise_bad_format(format_tag):
    try:
        format_name = WAVE_FORMAT(format_tag).name
    except ValueError:
        format_name = f'{format_tag:#06x}'
    raise ValueError(f"Unknown wave file format: {format_name}. Supported "
                     "formats: " +
                     ', '.join(x.name for x in KNOWN_WAVE_FORMATS))


def _read_fmt_chunk(fid, is_big_endian):
    """
    Returns
    -------
    size : int
        size of format subchunk in bytes (minus 8 for "fmt " and itself)
    format_tag : int
        PCM, float, or compressed format
    channels : int
        number of channels
    fs : int
        sampling frequency in samples per second
    bytes_per_second : int
        overall byte rate for the file
    block_align : int
        bytes per sample, including all channels
    bit_depth : int
        bits per sample

    Notes
    -----
    Assumes file pointer is immediately after the 'fmt ' id
    """
    if is_big_endian:
        fmt = '>'
    else:
        fmt = '<'

    size = struct.unpack(fmt+'I', fid.read(4))[0]

    if size < 16:
        raise ValueError("Binary structure of wave file is not compliant")

    res = struct.unpack(fmt+'HHIIHH', fid.read(16))
    bytes_read = 16

    format_tag, channels, fs, bytes_per_second, block_align, bit_depth = res

    if format_tag == WAVE_FORMAT.EXTENSIBLE and size >= (16+2):
        ext_chunk_size = struct.unpack(fmt+'H', fid.read(2))[0]
        bytes_read += 2
        if ext_chunk_size >= 22:
            extensible_chunk_data = fid.read(22)
            bytes_read += 22
            raw_guid = extensible_chunk_data[2+4:2+4+16]
            # GUID template {XXXXXXXX-0000-0010-8000-00AA00389B71} (RFC-2361)
            # MS GUID byte order: first three groups are native byte order,
            # rest is Big Endian
            if is_big_endian:
                tail = b'\x00\x00\x00\x10\x80\x00\x00\xAA\x00\x38\x9B\x71'
            else:
                tail = b'\x00\x00\x10\x00\x80\x00\x00\xAA\x00\x38\x9B\x71'
            if raw_guid.endswith(tail):
                format_tag = struct.unpack(fmt+'I', raw_guid[:4])[0]
        else:
            raise ValueError("Binary structure of wave file is not compliant")

    if format_tag not in KNOWN_WAVE_FORMATS:
        _raise_bad_format(format_tag)

    # move file pointer to next chunk
    if size > bytes_read:
        fid.read(size - bytes_read)

    # fmt should always be 16, 18 or 40, but handle it just in case
    _handle_pad_byte(fid, size)

    return (size, format_tag, channels, fs, bytes_per_second, block_align,
            bit_depth)
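
# Sketch of what _read_fmt_chunk parses, using a hand-built 16-byte PCM fmt
# body (stereo, 48 kHz, 32-bit); the leading 4 bytes are the chunk size:
# >>> body = struct.pack('<IHHIIHH', 16, 1, 2, 48000, 48000 * 8, 8, 32)
# >>> _read_fmt_chunk(io.BytesIO(body), is_big_endian=False)
# (16, 1, 2, 48000, 384000, 8, 32)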


def _read_data_chunk(fid, format_tag, channels, bit_depth, is_big_endian,
                     block_align, mmap=False):
    """
    Notes
    -----
    Assumes file pointer is immediately after the 'data' id

    It's possible to not use all available bits in a container, or to store
    samples in a container bigger than necessary, so bytes_per_sample uses
    the actual reported container size (nBlockAlign / nChannels). Real-world
    examples:

    Adobe Audition's "24-bit packed int (type 1, 20-bit)"
        nChannels = 2, nBlockAlign = 6, wBitsPerSample = 20

    http://www-mmsp.ece.mcgill.ca/Documents/AudioFormats/WAVE/Samples/AFsp/M1F1-int12-AFsp.wav
    is:
        nChannels = 2, nBlockAlign = 4, wBitsPerSample = 12

    http://www-mmsp.ece.mcgill.ca/Documents/AudioFormats/WAVE/Docs/multichaudP.pdf
    gives an example of:
        nChannels = 2, nBlockAlign = 8, wBitsPerSample = 20
    """
    if is_big_endian:
        fmt = '>'
    else:
        fmt = '<'

    # Size of the data subchunk in bytes
    size = struct.unpack(fmt+'I', fid.read(4))[0]

    # Number of bytes per sample (sample container size)
    bytes_per_sample = block_align // channels
    n_samples = size // bytes_per_sample

    if format_tag == WAVE_FORMAT.PCM:
        if 1 <= bit_depth <= 8:
            dtype = 'u1'  # WAV of 8-bit integer or less are unsigned
        elif bytes_per_sample in {3, 5, 6, 7}:
            # No compatible dtype.  Load as raw bytes for reshaping later.
            dtype = 'V1'
        elif bit_depth <= 64:
            # Remaining bit depths can map directly to signed numpy dtypes
            dtype = f'{fmt}i{bytes_per_sample}'
        else:
            raise ValueError("Unsupported bit depth: the WAV file "
                             f"has {bit_depth}-bit integer data.")
    elif format_tag == WAVE_FORMAT.IEEE_FLOAT:
        if bit_depth in {32, 64}:
            dtype = f'{fmt}f{bytes_per_sample}'
        else:
            raise ValueError("Unsupported bit depth: the WAV file "
                             f"has {bit_depth}-bit floating-point data.")
    else:
        _raise_bad_format(format_tag)

    start = fid.tell()
    if not mmap:
        try:
            count = size if dtype == 'V1' else n_samples
            data = np.fromfile(fid, dtype=dtype, count=count)
        except io.UnsupportedOperation:  # not a C-like file
            fid.seek(start, 0)  # just in case it seeked, though it shouldn't
            data = np.frombuffer(fid.read(size), dtype=dtype)

        if dtype == 'V1':
            # Rearrange raw bytes into smallest compatible numpy dtype
            dt = f'{fmt}i4' if bytes_per_sample == 3 else f'{fmt}i8'
            a = np.zeros((len(data) // bytes_per_sample, np.dtype(dt).itemsize),
                         dtype='V1')
            if is_big_endian:
                a[:, :bytes_per_sample] = data.reshape((-1, bytes_per_sample))
            else:
                a[:, -bytes_per_sample:] = data.reshape((-1, bytes_per_sample))
            data = a.view(dt).reshape(a.shape[:-1])
    else:
        if bytes_per_sample in {1, 2, 4, 8}:
            start = fid.tell()
            data = np.memmap(fid, dtype=dtype, mode='c', offset=start,
                             shape=(n_samples,))
            fid.seek(start + size)
        else:
            raise ValueError("mmap=True not compatible with "
                             f"{bytes_per_sample}-byte container size.")

    _handle_pad_byte(fid, size)

    if channels > 1:
        data = data.reshape(-1, channels)
    return data
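
# Note on the 'V1' branch above: 24-bit (and other odd-sized) samples have no
# native numpy dtype, so each raw group of bytes is copied into the
# most-significant bytes of a 4- or 8-byte container (the slice used depends on
# endianness). The result is left-justified integer data, e.g. a little-endian
# 24-bit sample of 1 reads back as 1 << 8 = 256, matching the behaviour
# described in the read() docstring below.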


def _skip_unknown_chunk(fid, is_big_endian):
    if is_big_endian:
        fmt = '>I'
    else:
        fmt = '<I'

    data = fid.read(4)
    # call unpack() and seek() only if we have really read data from file
    # otherwise empty read at the end of the file would trigger
    # unnecessary exception at unpack() call
    # in case data equals somehow to 0, there is no need for seek() anyway
    if data:
        size = struct.unpack(fmt, data)[0]
        fid.seek(size, 1)
        _handle_pad_byte(fid, size)


def _read_riff_chunk(fid):
    str1 = fid.read(4)  # File signature
    if str1 == b'RIFF':
        is_big_endian = False
        fmt = '<I'
    elif str1 == b'RIFX':
        is_big_endian = True
        fmt = '>I'
    else:
        # There are also .wav files with "FFIR" or "XFIR" signatures?
        raise ValueError(f"File format {repr(str1)} not understood. Only "
                         "'RIFF' and 'RIFX' supported.")

    # Size of entire file
    file_size = struct.unpack(fmt, fid.read(4))[0] + 8

    str2 = fid.read(4)
    if str2 != b'WAVE':
        raise ValueError(f"Not a WAV file. RIFF form type is {repr(str2)}.")

    return file_size, is_big_endian


def _handle_pad_byte(fid, size):
    # "If the chunk size is an odd number of bytes, a pad byte with value zero
    # is written after ckData." So we need to seek past this after each chunk.
    if size % 2:
        fid.seek(1, 1)


def read(filename, mmap=False):
    """
    Open a WAV file.

    Return the sample rate (in samples/sec) and data from an LPCM WAV file.

    Parameters
    ----------
    filename : string or open file handle
        Input WAV file.
    mmap : bool, optional
        Whether to read data as memory-mapped (default: False).  Not
        compatible with some bit depths; see Notes.  Only to be used on
        real files.

        .. versionadded:: 0.12.0

    Returns
    -------
    rate : int
        Sample rate of WAV file.
    data : numpy array
        Data read from WAV file. Data-type is determined from the file;
        see Notes.  Data is 1-D for 1-channel WAV, or 2-D of shape
        (Nsamples, Nchannels) otherwise. If a file-like input without a
        C-like file descriptor (e.g., :class:`python:io.BytesIO`) is
        passed, this will not be writeable.

    Notes
    -----
    Common data types: [1]_

    =====================  ===========  ===========  =============
         WAV format            Min          Max       NumPy dtype
    =====================  ===========  ===========  =============
    32-bit floating-point  -1.0         +1.0         float32
    32-bit integer PCM     -2147483648  +2147483647  int32
    24-bit integer PCM     -2147483648  +2147483392  int32
    16-bit integer PCM     -32768       +32767       int16
    8-bit integer PCM      0            255          uint8
    =====================  ===========  ===========  =============

    WAV files can specify arbitrary bit depth, and this function supports
    reading any integer PCM depth from 1 to 64 bits.  Data is returned in the
    smallest compatible numpy int type, in left-justified format.  8-bit and
    lower is unsigned, while 9-bit and higher is signed.

    For example, 24-bit data will be stored as int32, with the MSB of the
    24-bit data stored at the MSB of the int32, and typically the least
    significant byte is 0x00.  (However, if a file actually contains data past
    its specified bit depth, those bits will be read and output, too. [2]_)

    This bit justification and sign matches WAV's native internal format, which
    allows memory mapping of WAV files that use 1, 2, 4, or 8 bytes per sample
    (so 24-bit files cannot be memory-mapped, but 32-bit can).

    IEEE float PCM in 32- or 64-bit format is supported, with or without mmap.
    Values exceeding [-1, +1] are not clipped.

    Non-linear PCM (mu-law, A-law) is not supported.

    References
    ----------
    .. [1] IBM Corporation and Microsoft Corporation, "Multimedia Programming
       Interface and Data Specifications 1.0", section "Data Format of the
       Samples", August 1991
       http://www.tactilemedia.com/info/MCI_Control_Info.html
    .. [2] Adobe Systems Incorporated, "Adobe Audition 3 User Guide", section
       "Audio file formats: 24-bit Packed Int (type 1, 20-bit)", 2007

    Examples
    --------
    >>> from os.path import dirname, join as pjoin
    >>> from scipy.io import wavfile
    >>> import scipy.io

    Get the filename for an example .wav file from the tests/data directory.

    >>> data_dir = pjoin(dirname(scipy.io.__file__), 'tests', 'data')
    >>> wav_fname = pjoin(data_dir, 'test-44100Hz-2ch-32bit-float-be.wav')

    Load the .wav file contents.

    >>> samplerate, data = wavfile.read(wav_fname)
    >>> print(f"number of channels = {data.shape[1]}")
    number of channels = 2
    >>> length = data.shape[0] / samplerate
    >>> print(f"length = {length}s")
    length = 0.01s

    Plot the waveform.

    >>> import matplotlib.pyplot as plt
    >>> import numpy as np
    >>> time = np.linspace(0., length, data.shape[0])
    >>> plt.plot(time, data[:, 0], label="Left channel")
    >>> plt.plot(time, data[:, 1], label="Right channel")
    >>> plt.legend()
    >>> plt.xlabel("Time [s]")
    >>> plt.ylabel("Amplitude")
    >>> plt.show()

    """
    if hasattr(filename, 'read'):
        fid = filename
        mmap = False
    else:
        fid = open(filename, 'rb')

    try:
        file_size, is_big_endian = _read_riff_chunk(fid)
        fmt_chunk_received = False
        data_chunk_received = False
        while fid.tell() < file_size:
            # read the next chunk
            chunk_id = fid.read(4)

            if not chunk_id:
                if data_chunk_received:
                    # End of file but data successfully read
                    warnings.warn(
                        "Reached EOF prematurely; finished at {:d} bytes, "
                        "expected {:d} bytes from header."
                        .format(fid.tell(), file_size),
                        WavFileWarning, stacklevel=2)
                    break
                else:
                    raise ValueError("Unexpected end of file.")
            elif len(chunk_id) < 4:
                msg = f"Incomplete chunk ID: {repr(chunk_id)}"
                # If we have the data, ignore the broken chunk
                if fmt_chunk_received and data_chunk_received:
                    warnings.warn(msg + ", ignoring it.", WavFileWarning,
                                  stacklevel=2)
                else:
                    raise ValueError(msg)

            if chunk_id == b'fmt ':
                fmt_chunk_received = True
                fmt_chunk = _read_fmt_chunk(fid, is_big_endian)
                format_tag, channels, fs = fmt_chunk[1:4]
                bit_depth = fmt_chunk[6]
                block_align = fmt_chunk[5]
            elif chunk_id == b'fact':
                _skip_unknown_chunk(fid, is_big_endian)
            elif chunk_id == b'data':
                data_chunk_received = True
                if not fmt_chunk_received:
                    raise ValueError("No fmt chunk before data")
                data = _read_data_chunk(fid, format_tag, channels, bit_depth,
                                        is_big_endian, block_align, mmap)
            elif chunk_id == b'LIST':
                # Someday this could be handled properly but for now skip it
                _skip_unknown_chunk(fid, is_big_endian)
            elif chunk_id in {b'JUNK', b'Fake'}:
                # Skip alignment chunks without warning
                _skip_unknown_chunk(fid, is_big_endian)
            else:
                warnings.warn("Chunk (non-data) not understood, skipping it.",
                              WavFileWarning, stacklevel=2)
                _skip_unknown_chunk(fid, is_big_endian)
    finally:
        if not hasattr(filename, 'read'):
            fid.close()
        else:
            fid.seek(0)

    return fs, data


def write(filename, rate, data, markers=False, verbose=True):
    """
    Write a NumPy array as a WAV file.

    Parameters
    ----------
    filename : string or open file handle
        Output wav file.
    rate : int
        The sample rate (in samples/sec).
    data : ndarray
        A 1-D or 2-D NumPy array of either integer or float data-type.

    Notes
    -----
    * Writes a simple uncompressed WAV file.
    * To write multiple-channels, use a 2-D array of shape
      (Nsamples, Nchannels).
    * The bits-per-sample and PCM/float will be determined by the data-type.

    Common data types: [1]_

    =====================  ===========  ===========  =============
         WAV format            Min          Max       NumPy dtype
    =====================  ===========  ===========  =============
    32-bit floating-point  -1.0         +1.0         float32
    32-bit PCM             -2147483648  +2147483647  int32
    16-bit PCM             -32768       +32767       int16
    8-bit PCM              0            255          uint8
    =====================  ===========  ===========  =============

    Note that 8-bit PCM is unsigned.

    References
    ----------
    .. [1] IBM Corporation and Microsoft Corporation, "Multimedia Programming
       Interface and Data Specifications 1.0", section "Data Format of the
       Samples", August 1991
       http://www.tactilemedia.com/info/MCI_Control_Info.html

    Examples
    --------
    Create a 100Hz sine wave, sampled at 44100Hz.
    Write to 16-bit PCM, Mono.

    >>> from scipy.io.wavfile import write
    >>> samplerate = 44100; fs = 100
    >>> t = np.linspace(0., 1., samplerate)
    >>> amplitude = np.iinfo(np.int16).max
    >>> data = amplitude * np.sin(2. * np.pi * fs * t)
    >>> write("example.wav", samplerate, data.astype(np.int16))

    """
    if hasattr(filename, 'write'):
        fid = filename
    else:
        fid = open(filename, 'wb')

    fs = rate

    try:
        dkind = data.dtype.kind
        if not (dkind == 'i' or dkind == 'f' or (dkind == 'u' and
                                                 data.dtype.itemsize == 1)):
            raise ValueError("Unsupported data type '%s'" % data.dtype)

        header_data = b''

        header_data += b'RIFF'
        header_data += b'\x00\x00\x00\x00'
        header_data += b'WAVE'

        # fmt chunk
        header_data += b'fmt '
        if dkind == 'f':
            format_tag = WAVE_FORMAT.IEEE_FLOAT
        else:
            format_tag = WAVE_FORMAT.PCM
        if data.ndim == 1:
            channels = 1
        else:
            channels = data.shape[1]
        bit_depth = data.dtype.itemsize * 8
        bytes_per_second = fs*(bit_depth // 8)*channels
        block_align = channels * (bit_depth // 8)

        fmt_chunk_data = struct.pack('<HHIIHH', format_tag, channels, fs,
                                     bytes_per_second, block_align, bit_depth)
        if not (dkind == 'i' or dkind == 'u'):
            # add cbSize field for non-PCM files
            fmt_chunk_data += b'\x00\x00'

        header_data += struct.pack('<I', len(fmt_chunk_data))
        header_data += fmt_chunk_data

        # fact chunk (non-PCM files)
        if not (dkind == 'i' or dkind == 'u'):
            header_data += b'fact'
            header_data += struct.pack('<II', 4, data.shape[0])

        # check data size (needs to be immediately before the data chunk)
        if ((len(header_data)-4-4) + (4+4+data.nbytes)) > 0xFFFFFFFF:
            raise ValueError("Data exceeds wave file size limit")

        fid.write(header_data)

        # data chunk
        fid.write(b'data')
        fid.write(struct.pack('<I', data.nbytes))
        if data.dtype.byteorder == '>' or (data.dtype.byteorder == '=' and
                                           sys.byteorder == 'big'):
            data = data.byteswap()
        _array_tofile(fid, data)

        # cue/marker chunk
        if markers:  # != None and != []
            if verbose:
                print("Saving cue markers...")
            if isinstance(markers[0], dict):
                # markers like [{'position': 100, 'label': 'marker1'}, ...]
                labels = [m['label'] for m in markers]
                markers = [m['position'] for m in markers]
            else:
                # bare positions; use empty str labels so the ascii
                # encoding below still works
                labels = ['' for m in markers]

            fid.write(b'cue ')
            size = 4 + len(markers) * 24
            fid.write(struct.pack('<ii', size, len(markers)))
            for i, c in enumerate(markers):
                # 1635017060 is struct.unpack('<i', b'data')
                s = struct.pack('<iiiiii', i + 1, c, 1635017060, 0, 0, c)
                fid.write(s)

            lbls = b''
            for i, lbl in enumerate(labels):
                lbls += b'labl'
                label = lbl + ('\x00' if len(lbl) % 2 == 1 else '\x00\x00')
                size = len(lbl) + 1 + 4  # 4-byte cue ID + label + terminating \x00
                lbls += struct.pack('<ii', size, i + 1)
                lbls += bytes(label, encoding='ascii')

            fid.write(b'LIST')
            size = len(lbls) + 4
            fid.write(struct.pack('<i', size))
            fid.write(b'adtl')
            fid.write(lbls)
        # Determine file size and place it in correct
        # position at start of the file.
        size = fid.tell()
        fid.seek(4)
        fid.write(struct.pack('<I', size-8))

    finally:
        if not hasattr(filename, 'write'):
            fid.close()
        else:
            fid.seek(0)


def _array_tofile(fid, data):
    # ravel gives a c-contiguous buffer
    fid.write(data.ravel().view('b').data)
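
# Illustrative round trip with cue markers, as used by main() below (the file
# name 'demo.wav' is hypothetical; read() will warn about and skip the 'cue '
# chunk it does not parse):
# >>> sr = 48000
# >>> audio = np.zeros((sr, 2), dtype=np.float32)   # 1 s of stereo silence
# >>> cues = [{'position': 0, 'label': 'marker1'},
# ...         {'position': sr // 2, 'label': 'marker2'}]
# >>> write('demo.wav', sr, audio, markers=cues, verbose=False)
# >>> rate, data = read('demo.wav')
# >>> rate, data.shape
# (48000, (48000, 2))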


def test_normalized(array):
    '''
    Determine whether every value of an array lies within [-1, 1], i.e.
    whether the array already looks like normalized float audio.
    '''
    return (array >= -1).all() and (array <= 1).all()
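
# For example:
# >>> test_normalized(np.array([0.0, 0.25, -0.99]))
# True
# >>> test_normalized(np.array([0.0, 1.5]))
# False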


def norm_to_32float(array):
    '''
    Convert a variety of audio sample types to float32, normalizing to the
    [-1, 1) range where needed.
    '''
    if array.dtype == 'int16':
        bits = 16
        normfactor = 2 ** (bits-1)
        data = np.float32(array) / normfactor
    elif array.dtype == 'int32':
        bits = 32
        normfactor = 2 ** (bits-1)
        data = np.float32(array) / normfactor
    elif array.dtype == 'float32':
        if test_normalized(array):
            data = np.float32(array)  # nothing needed
        else:
            # float32 data holding int32-range values
            bits = 32
            normfactor = 2 ** (bits-1)
            data = np.float32(array) / normfactor
    elif array.dtype == 'float64':
        if test_normalized(array):
            data = np.float32(array)  # already in [-1, 1], just downcast
        else:
            bits = 64
            normfactor = 2 ** (bits-1)
            data = np.float32(array) / normfactor
    elif array.dtype == 'uint8':
        # handle uint8 data by shifting to center at 0
        bits = 8
        normfactor = 2 ** (bits-1)
        data = (np.float32(array) / normfactor) - 1.0
    else:
        raise ValueError(f'Unsupported input dtype: {array.dtype}')
    return data
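
# For example, int16 full scale maps onto [-1, 1):
# >>> norm_to_32float(np.array([0, 16384, -32768], dtype=np.int16)).tolist()
# [0.0, 0.5, -1.0]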


def onset_splice_superflux(audiofile):
    '''
    Superflux onset detection method of Boeck and Widmer [2013], modified to
    use backtracking to get accurate splice location.
    From:
    https://librosa.github.io/librosa/auto_examples/plot_superflux.html#sphx-glr-auto-examples-plot-superflux-py
    '''
    # recommended constants directly from paper
    y, sr = librosa.load(audiofile, sr=44100)
    n_fft = 1024
    hop_length = int(librosa.time_to_samples(1./200, sr=sr))
    lag = 2           # number of frames
    n_mels = 138      # number of bins
    fmin = 27.5       # lowest frequency
    fmax = 16000.     # highest frequency
    max_size = 3

    # Mel spectrogram
    S = librosa.feature.melspectrogram(y=y, sr=sr, n_fft=n_fft,
                                       hop_length=hop_length,
                                       fmin=fmin,
                                       fmax=fmax,
                                       n_mels=n_mels)
    # Onset Strength Function
    odf_sf = librosa.onset.onset_strength(S=librosa.power_to_db(S, ref=np.max),
                                          sr=sr,
                                          hop_length=hop_length,
                                          lag=lag, max_size=max_size)
    # Onset locations in time
    onset_sf = librosa.onset.onset_detect(onset_envelope=odf_sf,
                                          sr=sr,
                                          hop_length=hop_length,
                                          units='time',
                                          backtrack=True)
    return onset_sf
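
# Typical use (the file name is hypothetical); the return value is a 1-D numpy
# array of onset times in seconds, which feeds retain_n_splice_markers() below:
# >>> onsets = onset_splice_superflux('drum_loop.wav')
# >>> onsets[:5]   # first few onset times in seconds; values depend on the audio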


def retain_n_splice_markers(onset_sf, splicecount):
    '''
    Modified from @w-winter on GitHub.
    Take the larger set of generated splice points and select [splicecount]
    of them. Useful when the automatically generated number of splices
    exceeds the Morphagene limit (300).
    '''
    if splicecount < len(onset_sf):
        k, m = divmod(len(onset_sf), splicecount)
        splice_markers = list(onset_sf[i * k + min(i, m):(i + 1) * k + min(i + 1, m)]
                              for i in range(splicecount))
        splice_markers[0] = [0.0]
        return np.array([x[0] for x in splice_markers])
    else:
        print('More desired splices than available splices, defaulting to librosa output')
        return onset_sf
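
# Worked example: ten onset times thinned to four splices (the first splice is
# always forced to 0.0):
# >>> retain_n_splice_markers(np.arange(10, dtype=float), 4)
# array([0., 3., 6., 8.])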


def change_samplerate_interp(old_audio, old_rate, new_rate):
    '''
    Change sample rate to new sample rate by simple interpolation.
    If old_rate > new_rate, there may be aliasing / data loss.
    Input should be in column format, as the interpolation will be completed
    on each channel this way.
    Modified from:
    https://stackoverflow.com/questions/33682490/how-to-read-a-wav-file-using-scipy-at-a-different-sampling-rate
    '''
    if old_rate != new_rate:
        # duration of audio
        duration = old_audio.shape[0] / old_rate
        # length of old and new audio
        time_old = np.linspace(0, duration, old_audio.shape[0])
        time_new = np.linspace(0, duration,
                               int(old_audio.shape[0] * new_rate / old_rate))
        # fit old_audio into new_audio length by interpolation
        interpolator = interpolate.interp1d(time_old, old_audio.T)
        new_audio = interpolator(time_new).T
        return new_audio
    else:
        print('Conversion not needed, old and new rates match')
        return old_audio  # conversion not needed
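
# Shape check: one second of stereo at 44.1 kHz interpolated up to 48 kHz, with
# both input and output in column format (Nsamples, Nchannels):
# >>> change_samplerate_interp(np.zeros((44100, 2)), 44100.0, 48000.0).shape
# (48000, 2)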


def main(argv):
    inputwavefile = ''
    outputfile = ''
    splicecount = []
    try:
        opts, args = getopt.getopt(argv, "hw:o:s:",
                                   ["wavfile=", "outputfile=", "splicecount="])
    except getopt.GetoptError:
        print('Error in usage, correct format:\n' +
              'morphagene_onset.py -w <inputwavfile> -o <outputfile> -s <splicecount>')
        sys.exit(2)
    for opt, arg in opts:
        if opt == '-h':
            print('Morphagene reel creation using Superflux onset detection:\n' +
                  'morphagene_onset.py -w <inputwavfile> -o <outputfile> -s <splicecount>\n' +
                  '"-s" is useful for avoiding the 300-splice limit of the Morphagene.\n' +
                  'If you would rather bypass this, use a number >300 here.')
            sys.exit()
        elif opt in ("-w", "--wavfile"):
            inputwavefile = arg
        elif opt in ("-o", "--outputfile"):
            outputfile = arg
        elif opt in ("-s", "--splicecount"):
            splicecount = int(arg)
    print(f'Input wave file: {inputwavefile}')
    print(f'Output Morphagene reel: {outputfile}')
    print(f'Number of selected splices: {splicecount}')

    ###########################################################################
    '''
    Write a single file, with splice locations generated by the Superflux
    onset detection algorithm with backtracking for optimal splice placement.
    '''
    ###########################################################################
    morph_srate = 48000  # required sample rate for Morphagene

    # generate splice times in seconds using librosa
    librosa_sec = retain_n_splice_markers(np.unique(onset_splice_superflux(inputwavefile)),
                                          splicecount)

    # read pertinent info from audio file, convert to 32-bit float,
    # exit if the input wave file is broken
    try:
        sample_rate, array = read(inputwavefile)
        array = norm_to_32float(array)
    except Exception:
        print(f'Input file {inputwavefile} is poorly formatted, exiting')
        sys.exit()

    if array.ndim < 2:  # correct mono to stereo
        print('Correcting mono to stereo')
        array = np.vstack((array, array)).T

    # check if the input wav has a different rate than the desired Morphagene
    # rate, and correct by interpolation
    if sample_rate != morph_srate:
        print(f"Correcting input sample rate {sample_rate}Hz to Morphagene rate {morph_srate}Hz")
        # interpolate each channel; the result stays in column format
        array = change_samplerate_interp(array, float(sample_rate), float(morph_srate))
        # convert labels in seconds to labels in frames, adjusting for the
        # change in rate
        sc = float(morph_srate) / float(sample_rate)
        frame_labs = (librosa_sec * sample_rate * sc).astype(int)
    else:
        frame_labs = (librosa_sec * sample_rate).astype(int)
    frame_dict = [{'position': l, 'label': 'marker%i' % (i+1)}
                  for i, l in enumerate(frame_labs)]

    # warnings about Morphagene limitations
    if len(frame_dict) > 300 or (array.shape[0]/morph_srate)/60. > 2.9:
        raise ValueError(f'Number of splices ({len(frame_dict)}) and/or audio' +
                         f' length ({(array.shape[0]/morph_srate)/60.} minutes)' +
                         ' exceed Morphagene limits [300 splices / 2.9 minutes]')

    # write wav file with additional cue markers from labels
    write(outputfile, morph_srate, array.astype('float32'), markers=frame_dict)
    print(f'Saved Morphagene reel with {len(frame_labs)} splices: {outputfile}')
    name = os.path.splitext(inputwavefile)[0]
    np.savetxt(f'{name}.txt', librosa_sec, fmt='%03.6f', delimiter='\t')


if __name__ == "__main__":
    main(sys.argv[1:])
First, thank you for making this tool, it's an awesome piece of code. Would there be any way to automatically detect the number of beats or silences and make the splices for you, then fall back to the manual value if it exceeds a particular amount? Ultimately, it would be awesome to point this utility at a directory of wav files and have them chopped up without all of them having the same number of splices, and this is the idea I came up with :).