Skip to content

Instantly share code, notes, and snippets.

@NickLarsen
Created November 8, 2019 20:43
Show Gist options
  • Save NickLarsen/1826b3aad5f72e8f6f84d62e138fab84 to your computer and use it in GitHub Desktop.
Save NickLarsen/1826b3aad5f72e8f6f84d62e138fab84 to your computer and use it in GitHub Desktop.
# inspiration from https://github.com/devicehive/devicehive-audio-analysis
import sys
import pyaudio
import numpy as np
import wave
import os
import tensorflow as tf
import vggish_input
import vggish_params
import vggish_postprocess
import vggish_slim
from datetime import datetime
def log(message):
    """Print ``message`` to stdout prefixed with the current local time.

    The output has the form ``YYYY-MM-DD HH:MM:SS: <message>``.

    Args:
        message: Text to print; it is concatenated to the prefix, so it
            must be a ``str``.
    """
    stamp = "{:%Y-%m-%d %H:%M:%S}".format(datetime.now())
    print(stamp + ": " + message)
class ListenerDevice:
    """Capture audio from a PyAudio input device and fan it out to listeners.

    A listener is any object exposing ``start()``, ``stop()`` and
    ``process(samples)``. Samples are delivered as a 1-D ``numpy.int16``
    array, one buffer per PyAudio callback.
    """

    def __init__(self, device_index):
        """Open PyAudio and record the properties of the selected device.

        Args:
            device_index: PyAudio device index, or ``None`` to use the
                default input device reported by PyAudio.
        """
        self.audio = pyaudio.PyAudio()
        self.listeners = []
        if device_index is None:
            device_info = self.audio.get_default_input_device_info()
        else:
            device_info = self.audio.get_device_info_by_index(device_index)
        self.audio_properties = {
            "index": device_info["index"],
            "name": device_info["name"],
            "sample_width": self.audio.get_sample_size(pyaudio.paInt16),
            "sample_rate": int(device_info["defaultSampleRate"]),
            "channels": 1,
        }
        print("selected audio device: ", self.audio_properties)

    def __del__(self):
        # ``audio`` is missing when PyAudio() itself failed in __init__;
        # guard so garbage collection does not raise a second error.
        audio = getattr(self, "audio", None)
        if audio is not None:
            audio.terminate()

    def list_devices(self):
        """Print every device that has input channels, marking the default."""
        audio = self.audio
        try:
            default_index = audio.get_default_input_device_info()["index"]
        except IOError:
            # No default input device: still list what is available.
            default_index = None
        for position in range(audio.get_device_count()):
            # Query each device once instead of once to filter and once to keep.
            device = audio.get_device_info_by_index(position)
            if device.get("maxInputChannels", 0) <= 0:
                continue
            marker = " (default)" if device["index"] == default_index else ""
            print(str(device["index"]) + marker + ": " + device["name"])

    def add_listener(self, listener):
        """Register ``listener`` to receive every captured audio buffer."""
        self.listeners.append(listener)

    def listen(self, device_index = None):
        """Record until the user presses Enter, forwarding audio to listeners.

        Args:
            device_index: Unused; kept so existing callers keep working. The
                device chosen in ``__init__`` is the one that is opened.
        """
        sample_rate = self.audio_properties["sample_rate"]
        record_stream = self.audio.open(
            format=pyaudio.paInt16,
            channels=self.audio_properties["channels"],
            rate=sample_rate,
            # One buffer per second of audio.
            frames_per_buffer=sample_rate,
            input=True,
            stream_callback=self._forward_audio_data,
            input_device_index=self.audio_properties["index"],
            # Do not start capturing until every listener has been started,
            # otherwise a callback could reach a listener before start().
            start=False,
        )
        started = []
        try:
            for listener in self.listeners:
                listener.start()
                started.append(listener)
            record_stream.start_stream()
            input("Press Enter to stop recording...")
        finally:
            # Release the stream and stop listeners even when input() or a
            # listener raises (e.g. Ctrl-C / EOF on stdin).
            try:
                if not record_stream.is_stopped():
                    record_stream.stop_stream()
                record_stream.close()
            finally:
                for listener in started:
                    listener.stop()

    def _forward_audio_data(self, in_data, frame_count, time_info, status):
        """PyAudio stream callback: decode the buffer and pass it to listeners.

        Returns:
            ``(in_data, pyaudio.paContinue)`` so that capture keeps running.
        """
        # The stream is opened with one channel of 16-bit samples, so
        # ``frame_count`` frames are exactly ``frame_count`` int16 values.
        data = np.frombuffer(in_data, dtype=np.int16, count=frame_count)
        for listener in self.listeners:
            listener.process(data)
        return (in_data, pyaudio.paContinue)
class ListenerWavSave:
    """Listener that writes every audio buffer it receives to a WAV file."""

    def __init__(self, audio_properties, filename):
        """Remember where to write and which audio format to declare.

        Args:
            audio_properties: Mapping providing ``"channels"``,
                ``"sample_width"`` and ``"sample_rate"``.
            filename: Path of the WAV file created by ``start()``.
        """
        self.filename = filename
        self.audio_properties = audio_properties

    def start(self):
        """Open the WAV file for writing and set its header fields."""
        props = self.audio_properties
        writer = wave.open(self.filename, "wb")
        writer.setnchannels(props["channels"])
        writer.setsampwidth(props["sample_width"])
        writer.setframerate(props["sample_rate"])
        self.output_file = writer

    def stop(self):
        """Close the WAV file opened by ``start()``."""
        self.output_file.close()

    def process(self, audio_data):
        """Append one buffer of audio frames to the WAV file."""
        self.output_file.writeframes(audio_data)
class ListenerDogMoodDetector:
    """Listener that classifies each audio buffer and logs dog-related sounds.

    Each buffer is turned into VGGish embeddings, the embeddings are fed to a
    second checkpointed model, and the highest-scoring labels are inspected
    for the dog vocalisation classes.

    Can be used as a context manager; leaving the ``with`` block closes both
    TensorFlow sessions.
    """

    # Number of frames the classifier input is padded / truncated to.
    MAX_FRAMES = 300
    # How many of the highest-scoring labels are inspected per buffer.
    TOP_K = 5
    # Inclusive label-index range reported as dog noise
    # ("Bark" through "Whimper (dog)" in LABELS).
    DOG_LABEL_FIRST = 75
    DOG_LABEL_LAST = 80

    # Index -> human-readable name for the 527 classes the model predicts.
    LABELS = {
        0: "Speech",
        1: "Male speech, man speaking",
        2: "Female speech, woman speaking",
        3: "Child speech, kid speaking",
        4: "Conversation",
        5: "Narration, monologue",
        6: "Babbling",
        7: "Speech synthesizer",
        8: "Shout",
        9: "Bellow",
        10: "Whoop",
        11: "Yell",
        12: "Battle cry",
        13: "Children shouting",
        14: "Screaming",
        15: "Whispering",
        16: "Laughter",
        17: "Baby laughter",
        18: "Giggle",
        19: "Snicker",
        20: "Belly laugh",
        21: "Chuckle, chortle",
        22: "Crying, sobbing",
        23: "Baby cry, infant cry",
        24: "Whimper",
        25: "Wail, moan",
        26: "Sigh",
        27: "Singing",
        28: "Choir",
        29: "Yodeling",
        30: "Chant",
        31: "Mantra",
        32: "Male singing",
        33: "Female singing",
        34: "Child singing",
        35: "Synthetic singing",
        36: "Rapping",
        37: "Humming",
        38: "Groan",
        39: "Grunt",
        40: "Whistling",
        41: "Breathing",
        42: "Wheeze",
        43: "Snoring",
        44: "Gasp",
        45: "Pant",
        46: "Snort",
        47: "Cough",
        48: "Throat clearing",
        49: "Sneeze",
        50: "Sniff",
        51: "Run",
        52: "Shuffle",
        53: "Walk, footsteps",
        54: "Chewing, mastication",
        55: "Biting",
        56: "Gargling",
        57: "Stomach rumble",
        58: "Burping, eructation",
        59: "Hiccup",
        60: "Fart",
        61: "Hands",
        62: "Finger snapping",
        63: "Clapping",
        64: "Heart sounds, heartbeat",
        65: "Heart murmur",
        66: "Cheering",
        67: "Applause",
        68: "Chatter",
        69: "Crowd",
        70: "Hubbub, speech noise, speech babble",
        71: "Children playing",
        72: "Animal",
        73: "Domestic animals, pets",
        74: "Dog",
        75: "Bark",
        76: "Yip",
        77: "Howl",
        78: "Bow-wow",
        79: "Growling",
        80: "Whimper (dog)",
        81: "Cat",
        82: "Purr",
        83: "Meow",
        84: "Hiss",
        85: "Caterwaul",
        86: "Livestock, farm animals, working animals",
        87: "Horse",
        88: "Clip-clop",
        89: "Neigh, whinny",
        90: "Cattle, bovinae",
        91: "Moo",
        92: "Cowbell",
        93: "Pig",
        94: "Oink",
        95: "Goat",
        96: "Bleat",
        97: "Sheep",
        98: "Fowl",
        99: "Chicken, rooster",
        100: "Cluck",
        101: "Crowing, cock-a-doodle-doo",
        102: "Turkey",
        103: "Gobble",
        104: "Duck",
        105: "Quack",
        106: "Goose",
        107: "Honk",
        108: "Wild animals",
        109: "Roaring cats (lions, tigers)",
        110: "Roar",
        111: "Bird",
        112: "Bird vocalization, bird call, bird song",
        113: "Chirp, tweet",
        114: "Squawk",
        115: "Pigeon, dove",
        116: "Coo",
        117: "Crow",
        118: "Caw",
        119: "Owl",
        120: "Hoot",
        121: "Bird flight, flapping wings",
        122: "Canidae, dogs, wolves",
        123: "Rodents, rats, mice",
        124: "Mouse",
        125: "Patter",
        126: "Insect",
        127: "Cricket",
        128: "Mosquito",
        129: "Fly, housefly",
        130: "Buzz",
        131: "Bee, wasp, etc.",
        132: "Frog",
        133: "Croak",
        134: "Snake",
        135: "Rattle",
        136: "Whale vocalization",
        137: "Music",
        138: "Musical instrument",
        139: "Plucked string instrument",
        140: "Guitar",
        141: "Electric guitar",
        142: "Bass guitar",
        143: "Acoustic guitar",
        144: "Steel guitar, slide guitar",
        145: "Tapping (guitar technique)",
        146: "Strum",
        147: "Banjo",
        148: "Sitar",
        149: "Mandolin",
        150: "Zither",
        151: "Ukulele",
        152: "Keyboard (musical)",
        153: "Piano",
        154: "Electric piano",
        155: "Organ",
        156: "Electronic organ",
        157: "Hammond organ",
        158: "Synthesizer",
        159: "Sampler",
        160: "Harpsichord",
        161: "Percussion",
        162: "Drum kit",
        163: "Drum machine",
        164: "Drum",
        165: "Snare drum",
        166: "Rimshot",
        167: "Drum roll",
        168: "Bass drum",
        169: "Timpani",
        170: "Tabla",
        171: "Cymbal",
        172: "Hi-hat",
        173: "Wood block",
        174: "Tambourine",
        175: "Rattle (instrument)",
        176: "Maraca",
        177: "Gong",
        178: "Tubular bells",
        179: "Mallet percussion",
        180: "Marimba, xylophone",
        181: "Glockenspiel",
        182: "Vibraphone",
        183: "Steelpan",
        184: "Orchestra",
        185: "Brass instrument",
        186: "French horn",
        187: "Trumpet",
        188: "Trombone",
        189: "Bowed string instrument",
        190: "String section",
        191: "Violin, fiddle",
        192: "Pizzicato",
        193: "Cello",
        194: "Double bass",
        195: "Wind instrument, woodwind instrument",
        196: "Flute",
        197: "Saxophone",
        198: "Clarinet",
        199: "Harp",
        200: "Bell",
        201: "Church bell",
        202: "Jingle bell",
        203: "Bicycle bell",
        204: "Tuning fork",
        205: "Chime",
        206: "Wind chime",
        207: "Change ringing (campanology)",
        208: "Harmonica",
        209: "Accordion",
        210: "Bagpipes",
        211: "Didgeridoo",
        212: "Shofar",
        213: "Theremin",
        214: "Singing bowl",
        215: "Scratching (performance technique)",
        216: "Pop music",
        217: "Hip hop music",
        218: "Beatboxing",
        219: "Rock music",
        220: "Heavy metal",
        221: "Punk rock",
        222: "Grunge",
        223: "Progressive rock",
        224: "Rock and roll",
        225: "Psychedelic rock",
        226: "Rhythm and blues",
        227: "Soul music",
        228: "Reggae",
        229: "Country",
        230: "Swing music",
        231: "Bluegrass",
        232: "Funk",
        233: "Folk music",
        234: "Middle Eastern music",
        235: "Jazz",
        236: "Disco",
        237: "Classical music",
        238: "Opera",
        239: "Electronic music",
        240: "House music",
        241: "Techno",
        242: "Dubstep",
        243: "Drum and bass",
        244: "Electronica",
        245: "Electronic dance music",
        246: "Ambient music",
        247: "Trance music",
        248: "Music of Latin America",
        249: "Salsa music",
        250: "Flamenco",
        251: "Blues",
        252: "Music for children",
        253: "New-age music",
        254: "Vocal music",
        255: "A capella",
        256: "Music of Africa",
        257: "Afrobeat",
        258: "Christian music",
        259: "Gospel music",
        260: "Music of Asia",
        261: "Carnatic music",
        262: "Music of Bollywood",
        263: "Ska",
        264: "Traditional music",
        265: "Independent music",
        266: "Song",
        267: "Background music",
        268: "Theme music",
        269: "Jingle (music)",
        270: "Soundtrack music",
        271: "Lullaby",
        272: "Video game music",
        273: "Christmas music",
        274: "Dance music",
        275: "Wedding music",
        276: "Happy music",
        277: "Funny music",
        278: "Sad music",
        279: "Tender music",
        280: "Exciting music",
        281: "Angry music",
        282: "Scary music",
        283: "Wind",
        284: "Rustling leaves",
        285: "Wind noise (microphone)",
        286: "Thunderstorm",
        287: "Thunder",
        288: "Water",
        289: "Rain",
        290: "Raindrop",
        291: "Rain on surface",
        292: "Stream",
        293: "Waterfall",
        294: "Ocean",
        295: "Waves, surf",
        296: "Steam",
        297: "Gurgling",
        298: "Fire",
        299: "Crackle",
        300: "Vehicle",
        301: "Boat, Water vehicle",
        302: "Sailboat, sailing ship",
        303: "Rowboat, canoe, kayak",
        304: "Motorboat, speedboat",
        305: "Ship",
        306: "Motor vehicle (road)",
        307: "Car",
        308: "Vehicle horn, car horn, honking",
        309: "Toot",
        310: "Car alarm",
        311: "Power windows, electric windows",
        312: "Skidding",
        313: "Tire squeal",
        314: "Car passing by",
        315: "Race car, auto racing",
        316: "Truck",
        317: "Air brake",
        318: "Air horn, truck horn",
        319: "Reversing beeps",
        320: "Ice cream truck, ice cream van",
        321: "Bus",
        322: "Emergency vehicle",
        323: "Police car (siren)",
        324: "Ambulance (siren)",
        325: "Fire engine, fire truck (siren)",
        326: "Motorcycle",
        327: "Traffic noise, roadway noise",
        328: "Rail transport",
        329: "Train",
        330: "Train whistle",
        331: "Train horn",
        332: "Railroad car, train wagon",
        333: "Train wheels squealing",
        334: "Subway, metro, underground",
        335: "Aircraft",
        336: "Aircraft engine",
        337: "Jet engine",
        338: "Propeller, airscrew",
        339: "Helicopter",
        340: "Fixed-wing aircraft, airplane",
        341: "Bicycle",
        342: "Skateboard",
        343: "Engine",
        344: "Light engine (high frequency)",
        345: "Dental drill, dentist's drill",
        346: "Lawn mower",
        347: "Chainsaw",
        348: "Medium engine (mid frequency)",
        349: "Heavy engine (low frequency)",
        350: "Engine knocking",
        351: "Engine starting",
        352: "Idling",
        353: "Accelerating, revving, vroom",
        354: "Door",
        355: "Doorbell",
        356: "Ding-dong",
        357: "Sliding door",
        358: "Slam",
        359: "Knock",
        360: "Tap",
        361: "Squeak",
        362: "Cupboard open or close",
        363: "Drawer open or close",
        364: "Dishes, pots, and pans",
        365: "Cutlery, silverware",
        366: "Chopping (food)",
        367: "Frying (food)",
        368: "Microwave oven",
        369: "Blender",
        370: "Water tap, faucet",
        371: "Sink (filling or washing)",
        372: "Bathtub (filling or washing)",
        373: "Hair dryer",
        374: "Toilet flush",
        375: "Toothbrush",
        376: "Electric toothbrush",
        377: "Vacuum cleaner",
        378: "Zipper (clothing)",
        379: "Keys jangling",
        380: "Coin (dropping)",
        381: "Scissors",
        382: "Electric shaver, electric razor",
        383: "Shuffling cards",
        384: "Typing",
        385: "Typewriter",
        386: "Computer keyboard",
        387: "Writing",
        388: "Alarm",
        389: "Telephone",
        390: "Telephone bell ringing",
        391: "Ringtone",
        392: "Telephone dialing, DTMF",
        393: "Dial tone",
        394: "Busy signal",
        395: "Alarm clock",
        396: "Siren",
        397: "Civil defense siren",
        398: "Buzzer",
        399: "Smoke detector, smoke alarm",
        400: "Fire alarm",
        401: "Foghorn",
        402: "Whistle",
        403: "Steam whistle",
        404: "Mechanisms",
        405: "Ratchet, pawl",
        406: "Clock",
        407: "Tick",
        408: "Tick-tock",
        409: "Gears",
        410: "Pulleys",
        411: "Sewing machine",
        412: "Mechanical fan",
        413: "Air conditioning",
        414: "Cash register",
        415: "Printer",
        416: "Camera",
        417: "Single-lens reflex camera",
        418: "Tools",
        419: "Hammer",
        420: "Jackhammer",
        421: "Sawing",
        422: "Filing (rasp)",
        423: "Sanding",
        424: "Power tool",
        425: "Drill",
        426: "Explosion",
        427: "Gunshot, gunfire",
        428: "Machine gun",
        429: "Fusillade",
        430: "Artillery fire",
        431: "Cap gun",
        432: "Fireworks",
        433: "Firecracker",
        434: "Burst, pop",
        435: "Eruption",
        436: "Boom",
        437: "Wood",
        438: "Chop",
        439: "Splinter",
        440: "Crack",
        441: "Glass",
        442: "Chink, clink",
        443: "Shatter",
        444: "Liquid",
        445: "Splash, splatter",
        446: "Slosh",
        447: "Squish",
        448: "Drip",
        449: "Pour",
        450: "Trickle, dribble",
        451: "Gush",
        452: "Fill (with liquid)",
        453: "Spray",
        454: "Pump (liquid)",
        455: "Stir",
        456: "Boiling",
        457: "Sonar",
        458: "Arrow",
        459: "Whoosh, swoosh, swish",
        460: "Thump, thud",
        461: "Thunk",
        462: "Electronic tuner",
        463: "Effects unit",
        464: "Chorus effect",
        465: "Basketball bounce",
        466: "Bang",
        467: "Slap, smack",
        468: "Whack, thwack",
        469: "Smash, crash",
        470: "Breaking",
        471: "Bouncing",
        472: "Whip",
        473: "Flap",
        474: "Scratch",
        475: "Scrape",
        476: "Rub",
        477: "Roll",
        478: "Crushing",
        479: "Crumpling, crinkling",
        480: "Tearing",
        481: "Beep, bleep",
        482: "Ping",
        483: "Ding",
        484: "Clang",
        485: "Squeal",
        486: "Creak",
        487: "Rustle",
        488: "Whir",
        489: "Clatter",
        490: "Sizzle",
        491: "Clicking",
        492: "Clickety-clack",
        493: "Rumble",
        494: "Plop",
        495: "Jingle, tinkle",
        496: "Hum",
        497: "Zing",
        498: "Boing",
        499: "Crunch",
        500: "Silence",
        501: "Sine wave",
        502: "Harmonic",
        503: "Chirp tone",
        504: "Sound effect",
        505: "Pulse",
        506: "Inside, small room",
        507: "Inside, large room or hall",
        508: "Inside, public space",
        509: "Outside, urban or manmade",
        510: "Outside, rural or natural",
        511: "Reverberation",
        512: "Echo",
        513: "Noise",
        514: "Environmental noise",
        515: "Static",
        516: "Mains hum",
        517: "Distortion",
        518: "Sidetone",
        519: "Cacophony",
        520: "White noise",
        521: "Pink noise",
        522: "Throbbing",
        523: "Vibration",
        524: "Television",
        525: "Radio",
        526: "Field recording",
    }

    def __init__(self, audio_properties, model_path = ""):
        """Load the VGGish network and the classifier checkpoint.

        Args:
            audio_properties: Mapping providing at least ``"sample_rate"``
                for the incoming audio.
            model_path: Directory containing ``vggish_pca_params.npz``,
                ``vggish_model.ckpt`` and ``model.ckpt`` (+ ``.meta``).
                Defaults to the current working directory.
        """
        self.audio_properties = audio_properties
        self.pproc = vggish_postprocess.Postprocessor(
            os.path.join(model_path, "vggish_pca_params.npz")
        )

        # VGGish embedding network, kept in its own graph and session.
        graph_vgg = tf.Graph()
        with graph_vgg.as_default():
            self.vgg_sess = tf.Session()
            vggish_slim.define_vggish_slim(training=False)
            vggish_slim.load_vggish_slim_checkpoint(
                self.vgg_sess, os.path.join(model_path, "vggish_model.ckpt")
            )
        self.features_tensor = self.vgg_sess.graph.get_tensor_by_name(
            vggish_params.INPUT_TENSOR_NAME
        )
        self.embedding_tensor = self.vgg_sess.graph.get_tensor_by_name(
            vggish_params.OUTPUT_TENSOR_NAME
        )

        # Classifier restored from a meta graph, in a second graph/session.
        graph_yt = tf.Graph()
        with graph_yt.as_default():
            self.model_sess = tf.Session()
            model_file = os.path.join(model_path, "model.ckpt")
            saver = tf.train.import_meta_graph(
                model_file + ".meta", clear_devices=True, import_scope='m2'
            )
            saver.restore(self.model_sess, model_file)
            # Local variables whose name contains "train_input" are set to 1
            # and taken out of the collection; all remaining local variables
            # are initialised normally.
            init_op_list = []
            variables = tf.get_collection_ref(tf.GraphKeys.LOCAL_VARIABLES)
            for variable in list(variables):
                if "train_input" in variable.name:
                    init_op_list.append(tf.assign(variable, 1))
                    variables.remove(variable)
            init_op_list.append(tf.variables_initializer(variables))
            self.model_sess.run(init_op_list)
        model_graph = self.model_sess.graph
        self.input_batch_raw_tensor = model_graph.get_collection("input_batch_raw")[0]
        self.num_frames_tensor = model_graph.get_collection("num_frames")[0]
        self.predictions_tensor = model_graph.get_collection("predictions")[0]

        # Per-instance copy so callers mutating ``labels`` do not affect
        # other detectors.
        self.labels = dict(self.LABELS)

    def __enter__(self):
        return self

    def __exit__(self, *args, **kwargs):
        self.close()

    def close(self):
        """Close both TensorFlow sessions; safe to call more than once."""
        for attr in ("vgg_sess", "model_sess"):
            session = getattr(self, attr, None)
            if session is not None:
                session.close()
                setattr(self, attr, None)

    def start(self):
        """Listener hook; nothing to prepare."""
        pass

    def stop(self):
        """Listener hook; nothing to tear down."""
        pass

    def process(self, audio_data):
        """Classify one buffer of int16 samples and report the result."""
        # convert input data to vggish embedding
        embeddings = self._generate_embeddings(audio_data)
        # pass embedding to model
        label_weights = self._make_prediction(embeddings)
        # output prediction
        self._announce_prediction(label_weights)

    def _generate_embeddings(self, audio_data):
        """Return post-processed VGGish embeddings for ``audio_data``."""
        # Scale 16-bit integer samples to floats in [-1.0, 1.0).
        waveform = audio_data / 32768.0
        examples_batch = vggish_input.waveform_to_examples(
            waveform, self.audio_properties["sample_rate"]
        )
        [embedding_batch] = self.vgg_sess.run(
            [self.embedding_tensor],
            feed_dict={self.features_tensor: examples_batch}
        )
        return self.pproc.postprocess(embedding_batch)

    def _make_prediction(self, embedding):
        """Run the classifier on ``embedding`` and return per-label scores.

        Args:
            embedding: 2-D array with one row per embedding frame.

        Returns:
            The prediction row for the single example in the batch.
        """
        # Report the real number of frames instead of a hard-coded 1; clamp
        # to at least 1 so an empty embedding behaves as it did before.
        frame_count = max(1, min(embedding.shape[0], self.MAX_FRAMES))
        data = self._resize_input(embedding, 0, self.MAX_FRAMES)
        data = np.expand_dims(data, 0)
        num_frames = np.expand_dims(frame_count, 0)
        predictions, = self.model_sess.run(
            [self.predictions_tensor],
            feed_dict={
                self.input_batch_raw_tensor: data,
                self.num_frames_tensor: num_frames
            }
        )
        return predictions[0]

    def _resize_input(self, data, axis, new_size):
        """Zero-pad or truncate ``data`` to ``new_size`` entries along ``axis``.

        Args:
            data: Input array.
            axis: Axis to resize.
            new_size: Required length of ``axis`` in the result.

        Returns:
            A new array whose shape equals ``data.shape`` except that
            ``axis`` has length ``new_size``.
        """
        data = np.asarray(data)
        current = data.shape[axis]
        # Index with a tuple: NumPy no longer accepts a list of slices.
        keep = [slice(None)] * data.ndim
        keep[axis] = slice(0, min(current, new_size))
        pad_shape = list(data.shape)
        pad_shape[axis] = max(0, new_size - current)
        # The padding is float64 (np.zeros default), so the result dtype is
        # promoted exactly as in the original concatenate-based code.
        return np.concatenate([data[tuple(keep)], np.zeros(pad_shape)], axis)

    def _announce_prediction(self, label_weights):
        """Log any dog labels found among the ``TOP_K`` highest scores."""
        # modify this to hook into whatever notification mechanism is applicable for the scenario
        top_k = self.TOP_K
        top_indices = np.argpartition(label_weights, -top_k)[-top_k:]
        top_indices = sorted(top_indices, key=lambda a: label_weights[a], reverse=True)
        dog_noise = False
        for i in top_indices:
            if self.DOG_LABEL_FIRST <= i <= self.DOG_LABEL_LAST:
                log(str(i) + ": " + self.labels[i] + " - " + str(label_weights[i]))
                dog_noise = True
        if not dog_noise:
            log("no dog noises")
if __name__ == "__main__":
    # Usage: <script> [<device index> | list]
    # Only a single command-line argument is interpreted; anything else
    # selects the default input device.
    argv = sys.argv
    sole_arg = argv[1] if len(argv) == 2 else None
    if sole_arg is not None and sole_arg.isdigit():
        deviceIndex = int(sole_arg)
    else:
        deviceIndex = None
    device = ListenerDevice(deviceIndex)
    if sole_arg == "list":
        device.list_devices()
    else:
        # uncomment this listener to save the audio to a wav file as you speak, good for testing that it's working
        #device.add_listener(ListenerWavSave(device.audio_properties, "recording.wav"))
        # this is the listener which does the dog mood prediction
        detector = ListenerDogMoodDetector(device.audio_properties)
        device.add_listener(detector)
        device.listen(deviceIndex)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment