# Source: GitHub gist NickLarsen/1826b3aad5f72e8f6f84d62e138fab84, created November 8, 2019 20:43.
# Note carried over from the GitHub page: this file contains bidirectional Unicode text
# that may be interpreted or compiled differently than it appears. To review, open the
# file in an editor that reveals hidden Unicode characters.
# Inspired by https://github.com/devicehive/devicehive-audio-analysis
import sys | |
import pyaudio | |
import numpy as np | |
import wave | |
import os | |
import tensorflow as tf | |
import vggish_input | |
import vggish_params | |
import vggish_postprocess | |
import vggish_slim | |
from datetime import datetime | |
def log(message):
    """Print ``message`` to stdout prefixed with the current local time.

    The output format is ``YYYY-MM-DD HH:MM:SS: <message>``.

    Args:
        message: The text to log. Any object is accepted; it is rendered
            with ``str()`` semantics, so callers no longer need to convert
            numbers or arrays themselves.
    """
    time_str = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    # str.format instead of "+" so a non-string message cannot raise TypeError.
    print("{}: {}".format(time_str, message))
class ListenerDevice:
    """Capture audio from one input device and fan it out to listeners.

    A listener is any object exposing ``start()``, ``stop()`` and
    ``process(samples)``. ``process`` is called from the PyAudio stream
    callback with each captured buffer as a one-dimensional ``numpy.int16``
    array.
    """

    def __init__(self, device_index):
        """Initialise PyAudio and record the properties of the input device.

        Args:
            device_index: Index of the device to record from, or ``None`` to
                use the default input device reported by PyAudio.
        """
        self.audio = pyaudio.PyAudio()
        self.listeners = []
        if device_index is None:
            device_info = self.audio.get_default_input_device_info()
        else:
            device_info = self.audio.get_device_info_by_index(device_index)
        self.audio_properties = {
            "index": device_info["index"],
            "name": device_info["name"],
            "sample_width": self.audio.get_sample_size(pyaudio.paInt16),
            "sample_rate": int(device_info["defaultSampleRate"]),
            "channels": 1,
        }
        print("selected audio device: ", self.audio_properties)

    def __del__(self):
        """Release PyAudio, if it was ever created."""
        # __del__ also runs when __init__ raised before self.audio was set.
        audio = getattr(self, "audio", None)
        if audio is not None:
            audio.terminate()

    def list_devices(self):
        """Print every device that has input channels, marking the default."""
        audio = self.audio
        default_index = audio.get_default_input_device_info()["index"]
        for i in range(audio.get_device_count()):
            # Query each device once; a missing channel count means "no input".
            device = audio.get_device_info_by_index(i)
            if not device.get("maxInputChannels", 0) > 0:
                continue
            if device["index"] == default_index:
                print(str(device["index"]) + " (default): " + device["name"])
            else:
                print(str(device["index"]) + ": " + device["name"])

    def add_listener(self, listener):
        """Register ``listener`` to receive audio during :meth:`listen`."""
        self.listeners.append(listener)

    def listen(self, device_index=None):
        """Record until the user presses Enter, feeding all listeners.

        The stream and every listener that was started are shut down even if
        the wait is interrupted (for example by Ctrl-C), so listeners get the
        chance to close any files they hold.

        Args:
            device_index: Unused; kept for backward compatibility. The device
                selected at construction time is always the one recorded.
        """
        record_stream = self.audio.open(
            format=pyaudio.paInt16,
            channels=self.audio_properties["channels"],
            rate=self.audio_properties["sample_rate"],
            # One buffer per second of audio.
            frames_per_buffer=self.audio_properties["sample_rate"],
            input=True,
            # Do not start capturing until every listener is ready.
            start=False,
            stream_callback=self._forward_audio_data,
            input_device_index=self.audio_properties["index"],
        )
        started = []
        try:
            for listener in self.listeners:
                listener.start()
                started.append(listener)
            record_stream.start_stream()
            input("Press Enter to stop recording...")
        finally:
            try:
                record_stream.stop_stream()
                record_stream.close()
            finally:
                # Only stop what was started; the stream is already silent.
                for listener in started:
                    listener.stop()

    def _forward_audio_data(self, in_data, frame_count, time_info, status):
        """PyAudio stream callback: hand the captured buffer to each listener.

        Returns:
            ``(in_data, pyaudio.paContinue)`` so the stream keeps running.
        """
        data = np.frombuffer(in_data, dtype=np.int16, count=frame_count)
        for listener in self.listeners:
            listener.process(data)
        return (in_data, pyaudio.paContinue)
class ListenerWavSave:
    """Listener that writes every buffer it receives to a WAV file."""

    def __init__(self, audio_properties, filename):
        """Remember the audio format and the destination path.

        Args:
            audio_properties: Mapping with the keys ``"channels"``,
                ``"sample_width"`` and ``"sample_rate"`` used for the WAV
                header.
            filename: Path of the WAV file created by :meth:`start`.
        """
        self.audio_properties = audio_properties
        self.filename = filename
        # None while no recording is in progress.
        self.output_file = None

    def start(self):
        """Open the WAV file for writing and set up its header fields."""
        # Finish any recording left open by an earlier start().
        self.stop()
        output_file = wave.open(self.filename, "wb")
        try:
            output_file.setnchannels(self.audio_properties["channels"])
            output_file.setsampwidth(self.audio_properties["sample_width"])
            output_file.setframerate(self.audio_properties["sample_rate"])
        except Exception:
            # Do not leak the handle when the properties are unusable.
            output_file.close()
            raise
        self.output_file = output_file

    def stop(self):
        """Close the WAV file; a no-op when nothing is being recorded."""
        output_file, self.output_file = self.output_file, None
        if output_file is not None:
            output_file.close()

    def process(self, audio_data):
        """Append ``audio_data`` (a bytes-like buffer of frames) to the file.

        Raises:
            AttributeError: If called while no recording is in progress.
        """
        self.output_file.writeframes(audio_data)
class ListenerDogMoodDetector:
    """Listener that classifies each audio buffer and logs dog sounds.

    Every buffer is converted to VGGish embeddings, post-processed, and fed
    to a second TensorFlow model whose per-label scores are inspected for the
    dog-related labels. Instances can be used as context managers; leaving
    the ``with`` block closes both TensorFlow sessions.
    """

    # Names for the classifier's output indices (index -> label).
    # NOTE(review): these look like AudioSet class names; confirm against the
    # model that produced model.ckpt.
    LABELS = {
        0: "Speech",
        1: "Male speech, man speaking",
        2: "Female speech, woman speaking",
        3: "Child speech, kid speaking",
        4: "Conversation",
        5: "Narration, monologue",
        6: "Babbling",
        7: "Speech synthesizer",
        8: "Shout",
        9: "Bellow",
        10: "Whoop",
        11: "Yell",
        12: "Battle cry",
        13: "Children shouting",
        14: "Screaming",
        15: "Whispering",
        16: "Laughter",
        17: "Baby laughter",
        18: "Giggle",
        19: "Snicker",
        20: "Belly laugh",
        21: "Chuckle, chortle",
        22: "Crying, sobbing",
        23: "Baby cry, infant cry",
        24: "Whimper",
        25: "Wail, moan",
        26: "Sigh",
        27: "Singing",
        28: "Choir",
        29: "Yodeling",
        30: "Chant",
        31: "Mantra",
        32: "Male singing",
        33: "Female singing",
        34: "Child singing",
        35: "Synthetic singing",
        36: "Rapping",
        37: "Humming",
        38: "Groan",
        39: "Grunt",
        40: "Whistling",
        41: "Breathing",
        42: "Wheeze",
        43: "Snoring",
        44: "Gasp",
        45: "Pant",
        46: "Snort",
        47: "Cough",
        48: "Throat clearing",
        49: "Sneeze",
        50: "Sniff",
        51: "Run",
        52: "Shuffle",
        53: "Walk, footsteps",
        54: "Chewing, mastication",
        55: "Biting",
        56: "Gargling",
        57: "Stomach rumble",
        58: "Burping, eructation",
        59: "Hiccup",
        60: "Fart",
        61: "Hands",
        62: "Finger snapping",
        63: "Clapping",
        64: "Heart sounds, heartbeat",
        65: "Heart murmur",
        66: "Cheering",
        67: "Applause",
        68: "Chatter",
        69: "Crowd",
        70: "Hubbub, speech noise, speech babble",
        71: "Children playing",
        72: "Animal",
        73: "Domestic animals, pets",
        74: "Dog",
        75: "Bark",
        76: "Yip",
        77: "Howl",
        78: "Bow-wow",
        79: "Growling",
        80: "Whimper (dog)",
        81: "Cat",
        82: "Purr",
        83: "Meow",
        84: "Hiss",
        85: "Caterwaul",
        86: "Livestock, farm animals, working animals",
        87: "Horse",
        88: "Clip-clop",
        89: "Neigh, whinny",
        90: "Cattle, bovinae",
        91: "Moo",
        92: "Cowbell",
        93: "Pig",
        94: "Oink",
        95: "Goat",
        96: "Bleat",
        97: "Sheep",
        98: "Fowl",
        99: "Chicken, rooster",
        100: "Cluck",
        101: "Crowing, cock-a-doodle-doo",
        102: "Turkey",
        103: "Gobble",
        104: "Duck",
        105: "Quack",
        106: "Goose",
        107: "Honk",
        108: "Wild animals",
        109: "Roaring cats (lions, tigers)",
        110: "Roar",
        111: "Bird",
        112: "Bird vocalization, bird call, bird song",
        113: "Chirp, tweet",
        114: "Squawk",
        115: "Pigeon, dove",
        116: "Coo",
        117: "Crow",
        118: "Caw",
        119: "Owl",
        120: "Hoot",
        121: "Bird flight, flapping wings",
        122: "Canidae, dogs, wolves",
        123: "Rodents, rats, mice",
        124: "Mouse",
        125: "Patter",
        126: "Insect",
        127: "Cricket",
        128: "Mosquito",
        129: "Fly, housefly",
        130: "Buzz",
        131: "Bee, wasp, etc.",
        132: "Frog",
        133: "Croak",
        134: "Snake",
        135: "Rattle",
        136: "Whale vocalization",
        137: "Music",
        138: "Musical instrument",
        139: "Plucked string instrument",
        140: "Guitar",
        141: "Electric guitar",
        142: "Bass guitar",
        143: "Acoustic guitar",
        144: "Steel guitar, slide guitar",
        145: "Tapping (guitar technique)",
        146: "Strum",
        147: "Banjo",
        148: "Sitar",
        149: "Mandolin",
        150: "Zither",
        151: "Ukulele",
        152: "Keyboard (musical)",
        153: "Piano",
        154: "Electric piano",
        155: "Organ",
        156: "Electronic organ",
        157: "Hammond organ",
        158: "Synthesizer",
        159: "Sampler",
        160: "Harpsichord",
        161: "Percussion",
        162: "Drum kit",
        163: "Drum machine",
        164: "Drum",
        165: "Snare drum",
        166: "Rimshot",
        167: "Drum roll",
        168: "Bass drum",
        169: "Timpani",
        170: "Tabla",
        171: "Cymbal",
        172: "Hi-hat",
        173: "Wood block",
        174: "Tambourine",
        175: "Rattle (instrument)",
        176: "Maraca",
        177: "Gong",
        178: "Tubular bells",
        179: "Mallet percussion",
        180: "Marimba, xylophone",
        181: "Glockenspiel",
        182: "Vibraphone",
        183: "Steelpan",
        184: "Orchestra",
        185: "Brass instrument",
        186: "French horn",
        187: "Trumpet",
        188: "Trombone",
        189: "Bowed string instrument",
        190: "String section",
        191: "Violin, fiddle",
        192: "Pizzicato",
        193: "Cello",
        194: "Double bass",
        195: "Wind instrument, woodwind instrument",
        196: "Flute",
        197: "Saxophone",
        198: "Clarinet",
        199: "Harp",
        200: "Bell",
        201: "Church bell",
        202: "Jingle bell",
        203: "Bicycle bell",
        204: "Tuning fork",
        205: "Chime",
        206: "Wind chime",
        207: "Change ringing (campanology)",
        208: "Harmonica",
        209: "Accordion",
        210: "Bagpipes",
        211: "Didgeridoo",
        212: "Shofar",
        213: "Theremin",
        214: "Singing bowl",
        215: "Scratching (performance technique)",
        216: "Pop music",
        217: "Hip hop music",
        218: "Beatboxing",
        219: "Rock music",
        220: "Heavy metal",
        221: "Punk rock",
        222: "Grunge",
        223: "Progressive rock",
        224: "Rock and roll",
        225: "Psychedelic rock",
        226: "Rhythm and blues",
        227: "Soul music",
        228: "Reggae",
        229: "Country",
        230: "Swing music",
        231: "Bluegrass",
        232: "Funk",
        233: "Folk music",
        234: "Middle Eastern music",
        235: "Jazz",
        236: "Disco",
        237: "Classical music",
        238: "Opera",
        239: "Electronic music",
        240: "House music",
        241: "Techno",
        242: "Dubstep",
        243: "Drum and bass",
        244: "Electronica",
        245: "Electronic dance music",
        246: "Ambient music",
        247: "Trance music",
        248: "Music of Latin America",
        249: "Salsa music",
        250: "Flamenco",
        251: "Blues",
        252: "Music for children",
        253: "New-age music",
        254: "Vocal music",
        255: "A capella",
        256: "Music of Africa",
        257: "Afrobeat",
        258: "Christian music",
        259: "Gospel music",
        260: "Music of Asia",
        261: "Carnatic music",
        262: "Music of Bollywood",
        263: "Ska",
        264: "Traditional music",
        265: "Independent music",
        266: "Song",
        267: "Background music",
        268: "Theme music",
        269: "Jingle (music)",
        270: "Soundtrack music",
        271: "Lullaby",
        272: "Video game music",
        273: "Christmas music",
        274: "Dance music",
        275: "Wedding music",
        276: "Happy music",
        277: "Funny music",
        278: "Sad music",
        279: "Tender music",
        280: "Exciting music",
        281: "Angry music",
        282: "Scary music",
        283: "Wind",
        284: "Rustling leaves",
        285: "Wind noise (microphone)",
        286: "Thunderstorm",
        287: "Thunder",
        288: "Water",
        289: "Rain",
        290: "Raindrop",
        291: "Rain on surface",
        292: "Stream",
        293: "Waterfall",
        294: "Ocean",
        295: "Waves, surf",
        296: "Steam",
        297: "Gurgling",
        298: "Fire",
        299: "Crackle",
        300: "Vehicle",
        301: "Boat, Water vehicle",
        302: "Sailboat, sailing ship",
        303: "Rowboat, canoe, kayak",
        304: "Motorboat, speedboat",
        305: "Ship",
        306: "Motor vehicle (road)",
        307: "Car",
        308: "Vehicle horn, car horn, honking",
        309: "Toot",
        310: "Car alarm",
        311: "Power windows, electric windows",
        312: "Skidding",
        313: "Tire squeal",
        314: "Car passing by",
        315: "Race car, auto racing",
        316: "Truck",
        317: "Air brake",
        318: "Air horn, truck horn",
        319: "Reversing beeps",
        320: "Ice cream truck, ice cream van",
        321: "Bus",
        322: "Emergency vehicle",
        323: "Police car (siren)",
        324: "Ambulance (siren)",
        325: "Fire engine, fire truck (siren)",
        326: "Motorcycle",
        327: "Traffic noise, roadway noise",
        328: "Rail transport",
        329: "Train",
        330: "Train whistle",
        331: "Train horn",
        332: "Railroad car, train wagon",
        333: "Train wheels squealing",
        334: "Subway, metro, underground",
        335: "Aircraft",
        336: "Aircraft engine",
        337: "Jet engine",
        338: "Propeller, airscrew",
        339: "Helicopter",
        340: "Fixed-wing aircraft, airplane",
        341: "Bicycle",
        342: "Skateboard",
        343: "Engine",
        344: "Light engine (high frequency)",
        345: "Dental drill, dentist's drill",
        346: "Lawn mower",
        347: "Chainsaw",
        348: "Medium engine (mid frequency)",
        349: "Heavy engine (low frequency)",
        350: "Engine knocking",
        351: "Engine starting",
        352: "Idling",
        353: "Accelerating, revving, vroom",
        354: "Door",
        355: "Doorbell",
        356: "Ding-dong",
        357: "Sliding door",
        358: "Slam",
        359: "Knock",
        360: "Tap",
        361: "Squeak",
        362: "Cupboard open or close",
        363: "Drawer open or close",
        364: "Dishes, pots, and pans",
        365: "Cutlery, silverware",
        366: "Chopping (food)",
        367: "Frying (food)",
        368: "Microwave oven",
        369: "Blender",
        370: "Water tap, faucet",
        371: "Sink (filling or washing)",
        372: "Bathtub (filling or washing)",
        373: "Hair dryer",
        374: "Toilet flush",
        375: "Toothbrush",
        376: "Electric toothbrush",
        377: "Vacuum cleaner",
        378: "Zipper (clothing)",
        379: "Keys jangling",
        380: "Coin (dropping)",
        381: "Scissors",
        382: "Electric shaver, electric razor",
        383: "Shuffling cards",
        384: "Typing",
        385: "Typewriter",
        386: "Computer keyboard",
        387: "Writing",
        388: "Alarm",
        389: "Telephone",
        390: "Telephone bell ringing",
        391: "Ringtone",
        392: "Telephone dialing, DTMF",
        393: "Dial tone",
        394: "Busy signal",
        395: "Alarm clock",
        396: "Siren",
        397: "Civil defense siren",
        398: "Buzzer",
        399: "Smoke detector, smoke alarm",
        400: "Fire alarm",
        401: "Foghorn",
        402: "Whistle",
        403: "Steam whistle",
        404: "Mechanisms",
        405: "Ratchet, pawl",
        406: "Clock",
        407: "Tick",
        408: "Tick-tock",
        409: "Gears",
        410: "Pulleys",
        411: "Sewing machine",
        412: "Mechanical fan",
        413: "Air conditioning",
        414: "Cash register",
        415: "Printer",
        416: "Camera",
        417: "Single-lens reflex camera",
        418: "Tools",
        419: "Hammer",
        420: "Jackhammer",
        421: "Sawing",
        422: "Filing (rasp)",
        423: "Sanding",
        424: "Power tool",
        425: "Drill",
        426: "Explosion",
        427: "Gunshot, gunfire",
        428: "Machine gun",
        429: "Fusillade",
        430: "Artillery fire",
        431: "Cap gun",
        432: "Fireworks",
        433: "Firecracker",
        434: "Burst, pop",
        435: "Eruption",
        436: "Boom",
        437: "Wood",
        438: "Chop",
        439: "Splinter",
        440: "Crack",
        441: "Glass",
        442: "Chink, clink",
        443: "Shatter",
        444: "Liquid",
        445: "Splash, splatter",
        446: "Slosh",
        447: "Squish",
        448: "Drip",
        449: "Pour",
        450: "Trickle, dribble",
        451: "Gush",
        452: "Fill (with liquid)",
        453: "Spray",
        454: "Pump (liquid)",
        455: "Stir",
        456: "Boiling",
        457: "Sonar",
        458: "Arrow",
        459: "Whoosh, swoosh, swish",
        460: "Thump, thud",
        461: "Thunk",
        462: "Electronic tuner",
        463: "Effects unit",
        464: "Chorus effect",
        465: "Basketball bounce",
        466: "Bang",
        467: "Slap, smack",
        468: "Whack, thwack",
        469: "Smash, crash",
        470: "Breaking",
        471: "Bouncing",
        472: "Whip",
        473: "Flap",
        474: "Scratch",
        475: "Scrape",
        476: "Rub",
        477: "Roll",
        478: "Crushing",
        479: "Crumpling, crinkling",
        480: "Tearing",
        481: "Beep, bleep",
        482: "Ping",
        483: "Ding",
        484: "Clang",
        485: "Squeal",
        486: "Creak",
        487: "Rustle",
        488: "Whir",
        489: "Clatter",
        490: "Sizzle",
        491: "Clicking",
        492: "Clickety-clack",
        493: "Rumble",
        494: "Plop",
        495: "Jingle, tinkle",
        496: "Hum",
        497: "Zing",
        498: "Boing",
        499: "Crunch",
        500: "Silence",
        501: "Sine wave",
        502: "Harmonic",
        503: "Chirp tone",
        504: "Sound effect",
        505: "Pulse",
        506: "Inside, small room",
        507: "Inside, large room or hall",
        508: "Inside, public space",
        509: "Outside, urban or manmade",
        510: "Outside, rural or natural",
        511: "Reverberation",
        512: "Echo",
        513: "Noise",
        514: "Environmental noise",
        515: "Static",
        516: "Mains hum",
        517: "Distortion",
        518: "Sidetone",
        519: "Cacophony",
        520: "White noise",
        521: "Pink noise",
        522: "Throbbing",
        523: "Vibration",
        524: "Television",
        525: "Radio",
        526: "Field recording",
    }

    def __init__(self, audio_properties, model_path=""):
        """Load the VGGish feature extractor and the classifier model.

        Args:
            audio_properties: Mapping describing the incoming audio; only
                ``"sample_rate"`` is read by this class.
            model_path: Directory holding ``vggish_pca_params.npz``,
                ``vggish_model.ckpt`` and ``model.ckpt`` (plus its ``.meta``
                file). Defaults to the current working directory.
        """
        self.audio_properties = audio_properties
        self.pproc = vggish_postprocess.Postprocessor(
            os.path.join(model_path, "vggish_pca_params.npz"))

        # vggish: lives in its own graph/session so it cannot clash with the
        # classifier graph imported below.
        graph_vgg = tf.Graph()
        with graph_vgg.as_default():
            self.vgg_sess = tf.Session()
            vggish_slim.define_vggish_slim(training=False)
            vggish_slim.load_vggish_slim_checkpoint(
                self.vgg_sess, os.path.join(model_path, "vggish_model.ckpt"))
            self.features_tensor = self.vgg_sess.graph.get_tensor_by_name(
                vggish_params.INPUT_TENSOR_NAME)
            self.embedding_tensor = self.vgg_sess.graph.get_tensor_by_name(
                vggish_params.OUTPUT_TENSOR_NAME)

        # Classifier restored from a checkpoint into a second graph.
        graph_yt = tf.Graph()
        with graph_yt.as_default():
            self.model_sess = tf.Session()
            model_file = os.path.join(model_path, "model.ckpt")
            saver = tf.train.import_meta_graph(
                model_file + ".meta", clear_devices=True, import_scope='m2')
            saver.restore(self.model_sess, model_file)

            # Local variables named "train_input" are forced to 1 and taken
            # out of the collection; the remaining ones are initialised
            # normally.
            init_op_list = []
            variables = tf.get_collection_ref(tf.GraphKeys.LOCAL_VARIABLES)
            for variable in list(variables):
                if "train_input" in variable.name:
                    init_op_list.append(tf.assign(variable, 1))
                    variables.remove(variable)
            init_op_list.append(tf.variables_initializer(variables))
            self.model_sess.run(init_op_list)

            self.input_batch_raw_tensor = self.model_sess.graph.get_collection("input_batch_raw")[0]
            self.num_frames_tensor = self.model_sess.graph.get_collection("num_frames")[0]
            self.predictions_tensor = self.model_sess.graph.get_collection("predictions")[0]

        # Per-instance copy so callers may customise names safely.
        self.labels = dict(self.LABELS)

    def __enter__(self):
        return self

    def __exit__(self, *args, **kwargs):
        self.close()

    def close(self):
        """Close both TensorFlow sessions; safe to call more than once."""
        for attr in ("vgg_sess", "model_sess"):
            session = getattr(self, attr, None)
            if session is not None:
                session.close()
                setattr(self, attr, None)

    def start(self):
        """Listener hook; nothing to prepare."""
        pass

    def stop(self):
        """Listener hook; nothing to tear down."""
        pass

    def process(self, audio_data):
        """Classify one buffer of ``int16`` samples and announce the result."""
        # convert input data to vggish embedding
        embeddings = self._generate_embeddings(audio_data)
        # pass embedding to model
        label_weights = self._make_prediction(embeddings)
        # output prediction
        self._announce_prediction(label_weights)

    def _generate_embeddings(self, audio_data):
        """Return post-processed VGGish embeddings for ``audio_data``.

        Samples are divided by 32768.0 before being handed to VGGish, which
        maps the ``int16`` range onto [-1.0, 1.0).
        """
        examples_batch = vggish_input.waveform_to_examples(
            audio_data / 32768.0, self.audio_properties["sample_rate"])
        [embedding_batch] = self.vgg_sess.run(
            [self.embedding_tensor],
            feed_dict={self.features_tensor: examples_batch}
        )
        return self.pproc.postprocess(embedding_batch)

    def _make_prediction(self, embedding):
        """Run the classifier and return the scores for a single example."""
        # The model input is padded/truncated to 300 entries along axis 0 and
        # given a leading batch dimension of 1.
        data = self._resize_input(embedding, 0, 300)
        data = np.expand_dims(data, 0)
        # NOTE(review): num_frames is always fed as [1]; this presumably
        # relies on one embedding per buffer - confirm if buffers grow.
        num_frames = np.expand_dims(1, 0)
        predictions, = self.model_sess.run(
            [self.predictions_tensor],
            feed_dict={
                self.input_batch_raw_tensor: data,
                self.num_frames_tensor: num_frames
            }
        )
        return predictions[0]

    def _resize_input(self, data, axis, new_size):
        """Truncate or zero-pad ``data`` to ``new_size`` along ``axis``.

        Args:
            data: Input ``numpy`` array.
            axis: Axis whose length is changed.
            new_size: Desired length of ``axis``.

        Returns:
            A new array equal to ``data`` on every other axis. The padding is
            float64 zeros, so the result dtype follows NumPy's promotion
            rules for concatenation.
        """
        keep = min(data.shape[axis], new_size)
        # Index with a tuple: NumPy no longer accepts a list of slices.
        index = [slice(None)] * data.ndim
        index[axis] = slice(0, keep)
        pad_shape = list(data.shape)
        pad_shape[axis] = new_size - keep
        return np.concatenate([data[tuple(index)], np.zeros(pad_shape)], axis)

    def _announce_prediction(self, label_weights):
        """Log the dog-related labels found among the five best scores."""
        # modify this to hook into whatever notification mechanism is applicable for the scenario
        top_indices = np.argpartition(label_weights, -5)[-5:]
        top_indices = sorted(top_indices, key=lambda a: label_weights[a], reverse=True)
        dog_noise = False
        for i in top_indices:
            # Labels 75..80 are Bark, Yip, Howl, Bow-wow, Growling, Whimper (dog).
            if 75 <= i <= 80:
                log(str(i) + ": " + self.labels[i] + " - " + str(label_weights[i]))
                dog_noise = True
        if not dog_noise:
            log("no dog noises")
if __name__ == "__main__":
    # Usage: <script>          -> record from the default input device
    #        <script> <index>  -> record from the device with that index
    #        <script> list     -> print the available input devices and exit
    cli_args = sys.argv
    only_arg = cli_args[1] if len(cli_args) == 2 else None

    if only_arg is not None and only_arg.isdigit():
        deviceIndex = int(only_arg)
    else:
        deviceIndex = None

    device = ListenerDevice(deviceIndex)
    if only_arg == "list":
        device.list_devices()
    else:
        # uncomment this listener to save the audio to a wav file as you speak, good for testing that it's working
        #device.add_listener(ListenerWavSave(device.audio_properties, "recording.wav"))
        # this is the listener which does the dog mood prediction
        detector = ListenerDogMoodDetector(device.audio_properties)
        device.add_listener(detector)
        device.listen(deviceIndex)
# (GitHub page footer, not part of the script: "Sign up for free to join this
# conversation on GitHub. Already have an account? Sign in to comment.")