NickLarsen · November 8, 2019 20:43
diff --git a/dog-bark-detector.py b/dog-bark-detector.py
 # inspiration from https://github.com/devicehive/devicehive-audio-analysis

 import sys
 import pyaudio
 import numpy as np
 import wave
 import os
 import tensorflow as tf

 import vggish_input
 import vggish_params
 import vggish_postprocess
 import vggish_slim


 from datetime import datetime
 def log(message):
    """Print ``message`` to stdout, prefixed with the current local time.

    The prefix has the form ``YYYY-MM-DD HH:MM:SS: ``. ``message`` must be a
    string because it is concatenated onto the prefix.
    """
    stamp = format(datetime.now(), "%Y-%m-%d %H:%M:%S")
    line = stamp + ": " + message
    print(line)


 class ListenerDevice:
    """Capture audio from a PyAudio input device and fan it out to listeners.

    A listener is any object exposing ``start()``, ``stop()`` and
    ``process(data)``. ``process`` receives every captured buffer as a
    one-dimensional ``numpy.int16`` array.
    """

    def __init__(self, device_index):
        """Open PyAudio and record the properties of the chosen input device.

        Args:
            device_index: PyAudio device index to record from, or ``None``
                to use the default input device.
        """
        self.audio = pyaudio.PyAudio()
        self.listeners = []

        if device_index is None:
            device_info = self.audio.get_default_input_device_info()
        else:
            device_info = self.audio.get_device_info_by_index(device_index)

        # Capture is requested as 16-bit mono at the device's default rate.
        self.audio_properties = {
            "index": device_info["index"],
            "name": device_info["name"],
            "sample_width": self.audio.get_sample_size(pyaudio.paInt16),
            "sample_rate": int(device_info["defaultSampleRate"]),
            "channels": 1,
        }
        print("selected audio device: ", self.audio_properties)

    def __del__(self):
        """Release PyAudio, tolerating a partially constructed instance."""
        # __del__ also runs when __init__ raised before self.audio was set.
        audio = getattr(self, "audio", None)
        if audio is not None:
            audio.terminate()

    def list_devices(self):
        """Print every device that has input channels, marking the default."""
        audio = self.audio
        default_index = audio.get_default_input_device_info()["index"]
        # Query each device exactly once, then keep those able to record.
        all_devices = (
            audio.get_device_info_by_index(i)
            for i in range(audio.get_device_count())
        )
        inputs = [d for d in all_devices if d.get('maxInputChannels', 0) > 0]
        for device in inputs:
            if device["index"] == default_index:
                separator = " (default): "
            else:
                separator = ": "
            print(str(device["index"]) + separator + device["name"])

    def add_listener(self, listener):
        """Register ``listener`` to receive captured audio buffers."""
        self.listeners.append(listener)

    def listen(self, device_index = None):
        """Record until the user presses Enter, feeding all listeners.

        The stream and every started listener are shut down even when a
        listener fails to start or the prompt is interrupted; the triggering
        exception is then re-raised.

        Args:
            device_index: unused; the device selected in ``__init__`` is
                recorded. Kept so existing callers keep working.
        """
        record_stream = self.audio.open(
            format=pyaudio.paInt16,
            channels=self.audio_properties["channels"],
            rate=self.audio_properties["sample_rate"],
            # One callback per ``sample_rate`` frames, i.e. per second of audio.
            frames_per_buffer=self.audio_properties["sample_rate"],
            input=True,
            stream_callback=self._forward_audio_data,
            input_device_index=self.audio_properties["index"],
            # Do not capture until the listeners have been started below.
            start=False,
        )

        started = []
        try:
            for listener in self.listeners:
                listener.start()
                started.append(listener)

            record_stream.start_stream()

            input("Press Enter to stop recording...")
        finally:
            try:
                record_stream.stop_stream()
                record_stream.close()
            finally:
                # Only listeners whose start() succeeded are stopped.
                for listener in started:
                    listener.stop()

    def _forward_audio_data(self, in_data, frame_count, time_info, status):
        """Stream callback: decode a buffer and pass it to every listener.

        Args:
            in_data: raw captured bytes, read as ``frame_count`` int16 values.
            frame_count: number of frames in ``in_data``.
            time_info: unused.
            status: unused.

        Returns:
            ``(in_data, pyaudio.paContinue)`` so the stream keeps running.
        """
        data = np.frombuffer(in_data, dtype=np.int16, count=frame_count)
        for listener in self.listeners:
            listener.process(data)
        return (in_data, pyaudio.paContinue)



 class ListenerWavSave:
    """Listener that writes every audio buffer it receives to a WAV file."""

    def __init__(self, audio_properties, filename):
        """Remember the capture format and the output path.

        Args:
            audio_properties: dict providing "channels", "sample_width" and
                "sample_rate" for the WAV header.
            filename: path of the WAV file created by ``start``.
        """
        self.filename = filename
        self.audio_properties = audio_properties

    def start(self):
        """Open the output file for writing and configure its header."""
        props = self.audio_properties
        writer = wave.open(self.filename, "wb")
        for configure, key in (
            (writer.setnchannels, "channels"),
            (writer.setsampwidth, "sample_width"),
            (writer.setframerate, "sample_rate"),
        ):
            configure(props[key])
        # Publish the writer only once it is fully configured.
        self.output_file = writer

    def stop(self):
        """Close the output file."""
        writer = self.output_file
        writer.close()

    def process(self, audio_data):
        """Append ``audio_data`` to the open WAV file."""
        writer = self.output_file
        writer.writeframes(audio_data)



 class ListenerDogMoodDetector:
    """Listener that classifies audio buffers and logs dog-related sounds.

    Each buffer is converted to VGGish embeddings, run through a second
    checkpointed model, and the five highest-scoring labels are scanned for
    indices 75-80 ("Bark" through "Whimper (dog)" in ``self.labels``).

    The instance owns two TensorFlow sessions; call ``close()`` or use the
    instance in a ``with`` block to release them.
    """

    def __init__(self, audio_properties, model_path = ""):
        """Load the VGGish feature extractor and the classifier checkpoint.

        Args:
            audio_properties: dict describing the capture format; this class
                reads its "sample_rate" entry.
            model_path: directory holding "vggish_pca_params.npz",
                "vggish_model.ckpt" and "model.ckpt" (with its ".meta" file).
                The default "" resolves them against the working directory.
        """
        self.audio_properties = audio_properties
        self.pproc = vggish_postprocess.Postprocessor(os.path.join(model_path, "vggish_pca_params.npz"))
        #vggish
        # The feature extractor lives in its own graph and session.
        graph_vgg = tf.Graph()
        with graph_vgg.as_default():
            self.vgg_sess = tf.Session()
            vggish_slim.define_vggish_slim(training=False)
            vggish_slim.load_vggish_slim_checkpoint(self.vgg_sess, os.path.join(model_path, "vggish_model.ckpt"))
            self.features_tensor = self.vgg_sess.graph.get_tensor_by_name(vggish_params.INPUT_TENSOR_NAME)
            self.embedding_tensor = self.vgg_sess.graph.get_tensor_by_name(vggish_params.OUTPUT_TENSOR_NAME)

        # The classifier is restored from its meta graph into a second graph.
        graph_yt = tf.Graph()
        with graph_yt.as_default():
            self.model_sess = tf.Session()
            model_file = os.path.join(model_path, "model.ckpt")
            saver = tf.train.import_meta_graph(model_file + ".meta", clear_devices=True, import_scope='m2')
            saver.restore(self.model_sess, model_file)
            init_op_list = []
            variables = tf.get_collection_ref(tf.GraphKeys.LOCAL_VARIABLES)
            # Local variables named "train_input" are assigned 1 and taken out
            # of the collection; the remaining ones get their initializer.
            for variable in list(variables):
                if "train_input" in variable.name:
                    init_op_list.append(tf.assign(variable, 1))
                    variables.remove(variable)
            init_op_list.append(tf.variables_initializer(variables))
            self.model_sess.run(init_op_list)
            self.input_batch_raw_tensor = self.model_sess.graph.get_collection("input_batch_raw")[0]
            self.num_frames_tensor = self.model_sess.graph.get_collection("num_frames")[0]
            self.predictions_tensor = self.model_sess.graph.get_collection("predictions")[0]

        # Human-readable name for every index of the prediction vector.
        self.labels = {
            0: "Speech",
            1: "Male speech, man speaking",
            2: "Female speech, woman speaking",
            3: "Child speech, kid speaking",
            4: "Conversation",
            5: "Narration, monologue",
            6: "Babbling",
            7: "Speech synthesizer",
            8: "Shout",
            9: "Bellow",
            10: "Whoop",
            11: "Yell",
            12: "Battle cry",
            13: "Children shouting",
            14: "Screaming",
            15: "Whispering",
            16: "Laughter",
            17: "Baby laughter",
            18: "Giggle",
            19: "Snicker",
            20: "Belly laugh",
            21: "Chuckle, chortle",
            22: "Crying, sobbing",
            23: "Baby cry, infant cry",
            24: "Whimper",
            25: "Wail, moan",
            26: "Sigh",
            27: "Singing",
            28: "Choir",
            29: "Yodeling",
            30: "Chant",
            31: "Mantra",
            32: "Male singing",
            33: "Female singing",
            34: "Child singing",
            35: "Synthetic singing",
            36: "Rapping",
            37: "Humming",
            38: "Groan",
            39: "Grunt",
            40: "Whistling",
            41: "Breathing",
            42: "Wheeze",
            43: "Snoring",
            44: "Gasp",
            45: "Pant",
            46: "Snort",
            47: "Cough",
            48: "Throat clearing",
            49: "Sneeze",
            50: "Sniff",
            51: "Run",
            52: "Shuffle",
            53: "Walk, footsteps",
            54: "Chewing, mastication",
            55: "Biting",
            56: "Gargling",
            57: "Stomach rumble",
            58: "Burping, eructation",
            59: "Hiccup",
            60: "Fart",
            61: "Hands",
            62: "Finger snapping",
            63: "Clapping",
            64: "Heart sounds, heartbeat",
            65: "Heart murmur",
            66: "Cheering",
            67: "Applause",
            68: "Chatter",
            69: "Crowd",
            70: "Hubbub, speech noise, speech babble",
            71: "Children playing",
            72: "Animal",
            73: "Domestic animals, pets",
            74: "Dog",
            75: "Bark",
            76: "Yip",
            77: "Howl",
            78: "Bow-wow",
            79: "Growling",
            80: "Whimper (dog)",
            81: "Cat",
            82: "Purr",
            83: "Meow",
            84: "Hiss",
            85: "Caterwaul",
            86: "Livestock, farm animals, working animals",
            87: "Horse",
            88: "Clip-clop",
            89: "Neigh, whinny",
            90: "Cattle, bovinae",
            91: "Moo",
            92: "Cowbell",
            93: "Pig",
            94: "Oink",
            95: "Goat",
            96: "Bleat",
            97: "Sheep",
            98: "Fowl",
            99: "Chicken, rooster",
            100: "Cluck",
            101: "Crowing, cock-a-doodle-doo",
            102: "Turkey",
            103: "Gobble",
            104: "Duck",
            105: "Quack",
            106: "Goose",
            107: "Honk",
            108: "Wild animals",
            109: "Roaring cats (lions, tigers)",
            110: "Roar",
            111: "Bird",
            112: "Bird vocalization, bird call, bird song",
            113: "Chirp, tweet",
            114: "Squawk",
            115: "Pigeon, dove",
            116: "Coo",
            117: "Crow",
            118: "Caw",
            119: "Owl",
            120: "Hoot",
            121: "Bird flight, flapping wings",
            122: "Canidae, dogs, wolves",
            123: "Rodents, rats, mice",
            124: "Mouse",
            125: "Patter",
            126: "Insect",
            127: "Cricket",
            128: "Mosquito",
            129: "Fly, housefly",
            130: "Buzz",
            131: "Bee, wasp, etc.",
            132: "Frog",
            133: "Croak",
            134: "Snake",
            135: "Rattle",
            136: "Whale vocalization",
            137: "Music",
            138: "Musical instrument",
            139: "Plucked string instrument",
            140: "Guitar",
            141: "Electric guitar",
            142: "Bass guitar",
            143: "Acoustic guitar",
            144: "Steel guitar, slide guitar",
            145: "Tapping (guitar technique)",
            146: "Strum",
            147: "Banjo",
            148: "Sitar",
            149: "Mandolin",
            150: "Zither",
            151: "Ukulele",
            152: "Keyboard (musical)",
            153: "Piano",
            154: "Electric piano",
            155: "Organ",
            156: "Electronic organ",
            157: "Hammond organ",
            158: "Synthesizer",
            159: "Sampler",
            160: "Harpsichord",
            161: "Percussion",
            162: "Drum kit",
            163: "Drum machine",
            164: "Drum",
            165: "Snare drum",
            166: "Rimshot",
            167: "Drum roll",
            168: "Bass drum",
            169: "Timpani",
            170: "Tabla",
            171: "Cymbal",
            172: "Hi-hat",
            173: "Wood block",
            174: "Tambourine",
            175: "Rattle (instrument)",
            176: "Maraca",
            177: "Gong",
            178: "Tubular bells",
            179: "Mallet percussion",
            180: "Marimba, xylophone",
            181: "Glockenspiel",
            182: "Vibraphone",
            183: "Steelpan",
            184: "Orchestra",
            185: "Brass instrument",
            186: "French horn",
            187: "Trumpet",
            188: "Trombone",
            189: "Bowed string instrument",
            190: "String section",
            191: "Violin, fiddle",
            192: "Pizzicato",
            193: "Cello",
            194: "Double bass",
            195: "Wind instrument, woodwind instrument",
            196: "Flute",
            197: "Saxophone",
            198: "Clarinet",
            199: "Harp",
            200: "Bell",
            201: "Church bell",
            202: "Jingle bell",
            203: "Bicycle bell",
            204: "Tuning fork",
            205: "Chime",
            206: "Wind chime",
            207: "Change ringing (campanology)",
            208: "Harmonica",
            209: "Accordion",
            210: "Bagpipes",
            211: "Didgeridoo",
            212: "Shofar",
            213: "Theremin",
            214: "Singing bowl",
            215: "Scratching (performance technique)",
            216: "Pop music",
            217: "Hip hop music",
            218: "Beatboxing",
            219: "Rock music",
            220: "Heavy metal",
            221: "Punk rock",
            222: "Grunge",
            223: "Progressive rock",
            224: "Rock and roll",
            225: "Psychedelic rock",
            226: "Rhythm and blues",
            227: "Soul music",
            228: "Reggae",
            229: "Country",
            230: "Swing music",
            231: "Bluegrass",
            232: "Funk",
            233: "Folk music",
            234: "Middle Eastern music",
            235: "Jazz",
            236: "Disco",
            237: "Classical music",
            238: "Opera",
            239: "Electronic music",
            240: "House music",
            241: "Techno",
            242: "Dubstep",
            243: "Drum and bass",
            244: "Electronica",
            245: "Electronic dance music",
            246: "Ambient music",
            247: "Trance music",
            248: "Music of Latin America",
            249: "Salsa music",
            250: "Flamenco",
            251: "Blues",
            252: "Music for children",
            253: "New-age music",
            254: "Vocal music",
            255: "A capella",
            256: "Music of Africa",
            257: "Afrobeat",
            258: "Christian music",
            259: "Gospel music",
            260: "Music of Asia",
            261: "Carnatic music",
            262: "Music of Bollywood",
            263: "Ska",
            264: "Traditional music",
            265: "Independent music",
            266: "Song",
            267: "Background music",
            268: "Theme music",
            269: "Jingle (music)",
            270: "Soundtrack music",
            271: "Lullaby",
            272: "Video game music",
            273: "Christmas music",
            274: "Dance music",
            275: "Wedding music",
            276: "Happy music",
            277: "Funny music",
            278: "Sad music",
            279: "Tender music",
            280: "Exciting music",
            281: "Angry music",
            282: "Scary music",
            283: "Wind",
            284: "Rustling leaves",
            285: "Wind noise (microphone)",
            286: "Thunderstorm",
            287: "Thunder",
            288: "Water",
            289: "Rain",
            290: "Raindrop",
            291: "Rain on surface",
            292: "Stream",
            293: "Waterfall",
            294: "Ocean",
            295: "Waves, surf",
            296: "Steam",
            297: "Gurgling",
            298: "Fire",
            299: "Crackle",
            300: "Vehicle",
            301: "Boat, Water vehicle",
            302: "Sailboat, sailing ship",
            303: "Rowboat, canoe, kayak",
            304: "Motorboat, speedboat",
            305: "Ship",
            306: "Motor vehicle (road)",
            307: "Car",
            308: "Vehicle horn, car horn, honking",
            309: "Toot",
            310: "Car alarm",
            311: "Power windows, electric windows",
            312: "Skidding",
            313: "Tire squeal",
            314: "Car passing by",
            315: "Race car, auto racing",
            316: "Truck",
            317: "Air brake",
            318: "Air horn, truck horn",
            319: "Reversing beeps",
            320: "Ice cream truck, ice cream van",
            321: "Bus",
            322: "Emergency vehicle",
            323: "Police car (siren)",
            324: "Ambulance (siren)",
            325: "Fire engine, fire truck (siren)",
            326: "Motorcycle",
            327: "Traffic noise, roadway noise",
            328: "Rail transport",
            329: "Train",
            330: "Train whistle",
            331: "Train horn",
            332: "Railroad car, train wagon",
            333: "Train wheels squealing",
            334: "Subway, metro, underground",
            335: "Aircraft",
            336: "Aircraft engine",
            337: "Jet engine",
            338: "Propeller, airscrew",
            339: "Helicopter",
            340: "Fixed-wing aircraft, airplane",
            341: "Bicycle",
            342: "Skateboard",
            343: "Engine",
            344: "Light engine (high frequency)",
            345: "Dental drill, dentist's drill",
            346: "Lawn mower",
            347: "Chainsaw",
            348: "Medium engine (mid frequency)",
            349: "Heavy engine (low frequency)",
            350: "Engine knocking",
            351: "Engine starting",
            352: "Idling",
            353: "Accelerating, revving, vroom",
            354: "Door",
            355: "Doorbell",
            356: "Ding-dong",
            357: "Sliding door",
            358: "Slam",
            359: "Knock",
            360: "Tap",
            361: "Squeak",
            362: "Cupboard open or close",
            363: "Drawer open or close",
            364: "Dishes, pots, and pans",
            365: "Cutlery, silverware",
            366: "Chopping (food)",
            367: "Frying (food)",
            368: "Microwave oven",
            369: "Blender",
            370: "Water tap, faucet",
            371: "Sink (filling or washing)",
            372: "Bathtub (filling or washing)",
            373: "Hair dryer",
            374: "Toilet flush",
            375: "Toothbrush",
            376: "Electric toothbrush",
            377: "Vacuum cleaner",
            378: "Zipper (clothing)",
            379: "Keys jangling",
            380: "Coin (dropping)",
            381: "Scissors",
            382: "Electric shaver, electric razor",
            383: "Shuffling cards",
            384: "Typing",
            385: "Typewriter",
            386: "Computer keyboard",
            387: "Writing",
            388: "Alarm",
            389: "Telephone",
            390: "Telephone bell ringing",
            391: "Ringtone",
            392: "Telephone dialing, DTMF",
            393: "Dial tone",
            394: "Busy signal",
            395: "Alarm clock",
            396: "Siren",
            397: "Civil defense siren",
            398: "Buzzer",
            399: "Smoke detector, smoke alarm",
            400: "Fire alarm",
            401: "Foghorn",
            402: "Whistle",
            403: "Steam whistle",
            404: "Mechanisms",
            405: "Ratchet, pawl",
            406: "Clock",
            407: "Tick",
            408: "Tick-tock",
            409: "Gears",
            410: "Pulleys",
            411: "Sewing machine",
            412: "Mechanical fan",
            413: "Air conditioning",
            414: "Cash register",
            415: "Printer",
            416: "Camera",
            417: "Single-lens reflex camera",
            418: "Tools",
            419: "Hammer",
            420: "Jackhammer",
            421: "Sawing",
            422: "Filing (rasp)",
            423: "Sanding",
            424: "Power tool",
            425: "Drill",
            426: "Explosion",
            427: "Gunshot, gunfire",
            428: "Machine gun",
            429: "Fusillade",
            430: "Artillery fire",
            431: "Cap gun",
            432: "Fireworks",
            433: "Firecracker",
            434: "Burst, pop",
            435: "Eruption",
            436: "Boom",
            437: "Wood",
            438: "Chop",
            439: "Splinter",
            440: "Crack",
            441: "Glass",
            442: "Chink, clink",
            443: "Shatter",
            444: "Liquid",
            445: "Splash, splatter",
            446: "Slosh",
            447: "Squish",
            448: "Drip",
            449: "Pour",
            450: "Trickle, dribble",
            451: "Gush",
            452: "Fill (with liquid)",
            453: "Spray",
            454: "Pump (liquid)",
            455: "Stir",
            456: "Boiling",
            457: "Sonar",
            458: "Arrow",
            459: "Whoosh, swoosh, swish",
            460: "Thump, thud",
            461: "Thunk",
            462: "Electronic tuner",
            463: "Effects unit",
            464: "Chorus effect",
            465: "Basketball bounce",
            466: "Bang",
            467: "Slap, smack",
            468: "Whack, thwack",
            469: "Smash, crash",
            470: "Breaking",
            471: "Bouncing",
            472: "Whip",
            473: "Flap",
            474: "Scratch",
            475: "Scrape",
            476: "Rub",
            477: "Roll",
            478: "Crushing",
            479: "Crumpling, crinkling",
            480: "Tearing",
            481: "Beep, bleep",
            482: "Ping",
            483: "Ding",
            484: "Clang",
            485: "Squeal",
            486: "Creak",
            487: "Rustle",
            488: "Whir",
            489: "Clatter",
            490: "Sizzle",
            491: "Clicking",
            492: "Clickety-clack",
            493: "Rumble",
            494: "Plop",
            495: "Jingle, tinkle",
            496: "Hum",
            497: "Zing",
            498: "Boing",
            499: "Crunch",
            500: "Silence",
            501: "Sine wave",
            502: "Harmonic",
            503: "Chirp tone",
            504: "Sound effect",
            505: "Pulse",
            506: "Inside, small room",
            507: "Inside, large room or hall",
            508: "Inside, public space",
            509: "Outside, urban or manmade",
            510: "Outside, rural or natural",
            511: "Reverberation",
            512: "Echo",
            513: "Noise",
            514: "Environmental noise",
            515: "Static",
            516: "Mains hum",
            517: "Distortion",
            518: "Sidetone",
            519: "Cacophony",
            520: "White noise",
            521: "Pink noise",
            522: "Throbbing",
            523: "Vibration",
            524: "Television",
            525: "Radio",
            526: "Field recording",
        }

    def __enter__(self):
        """Return the detector itself for use in a ``with`` block."""
        return self

    def __exit__(self, *args, **kwargs):
        """Close both sessions on leaving the ``with`` block."""
        self.close()

    def close(self):
        """Close the VGGish and classifier sessions.

        Safe to call more than once and on an instance whose ``__init__``
        did not finish: sessions that are missing or already closed are
        skipped.
        """
        for attr in ("vgg_sess", "model_sess"):
            session = getattr(self, attr, None)
            if session is not None:
                session.close()
                # Drop the reference so a second close() is a no-op.
                setattr(self, attr, None)

    def start(self):
        """Listener hook; nothing to prepare."""
        pass

    def stop(self):
        """Listener hook; nothing to tear down (see ``close``)."""
        pass

    def process(self, audio_data):
        """Classify one audio buffer and log the outcome.

        Args:
            audio_data: samples for one captured buffer.
        """
        # convert input data to vggish embedding
        embeddings = self._generate_embeddings(audio_data)
        # pass embedding to model
        label_weights = self._make_prediction(embeddings)
        # output prediction
        self._announce_prediction(label_weights)

    def _generate_embeddings(self, audio_data):
        """Return post-processed VGGish embeddings for ``audio_data``.

        Samples are divided by 32768.0 before feature extraction
        (assumes int16 PCM input -- TODO confirm against the caller).
        """
        examples_batch = vggish_input.waveform_to_examples(audio_data/32768.0, self.audio_properties["sample_rate"])
        [embedding_batch] = self.vgg_sess.run(
            [self.embedding_tensor],
            feed_dict={self.features_tensor: examples_batch}
        )
        return self.pproc.postprocess(embedding_batch)

    def _make_prediction(self, embedding):
        """Run the classifier on ``embedding`` and return its label weights.

        The embedding is padded or truncated to 300 rows and fed as a batch
        of one.
        """
        data = self._resize_input(embedding, 0, 300)
        data = np.expand_dims(data, 0)
        # NOTE(review): the frame count is hard-coded to 1 regardless of how
        # many rows ``embedding`` has -- confirm this is intended.
        num_frames = np.expand_dims(1, 0)
        predictions, = self.model_sess.run(
            [self.predictions_tensor],
            feed_dict={
                self.input_batch_raw_tensor: data,
                self.num_frames_tensor: num_frames
            }
        )
        return predictions[0]

    def _resize_input(self, data, axis, new_size):
        """Truncate or zero-pad ``data`` to exactly ``new_size`` along ``axis``.

        Args:
            data: numpy array to resize.
            axis: axis whose length is changed.
            new_size: required length along ``axis``.

        Returns:
            A new array whose other dimensions match ``data``. Because the
            padding block is float64, the result dtype is ``data``'s dtype
            promoted with float64, even when no padding is needed.
        """
        keep = min(data.shape[axis], new_size)
        # NumPy requires a tuple (not a list) of slices for multi-axis indexing.
        index = [slice(None)] * data.ndim
        index[axis] = slice(0, keep)
        pad_shape = list(data.shape)
        pad_shape[axis] = new_size - keep
        return np.concatenate([data[tuple(index)], np.zeros(pad_shape)], axis)

    def _announce_prediction(self, label_weights):
        """Log each dog sound among the five strongest labels.

        Logs "no dog noises" when none of the top five indices falls in
        75-80.
        """
        # modify this to hook into whatever notification mechanism is applicable for the scenario
        top_indices = np.argpartition(label_weights, -5)[-5:]
        top_indices = sorted(top_indices, key=lambda a: label_weights[a], reverse=True)
        dog_noise = False
        for i in top_indices:
            # 75-80 are "Bark" .. "Whimper (dog)" in self.labels.
            if 75 <= i <= 80:
                log(str(i) + ": " + self.labels[i] + " - " + str(label_weights[i]))
                dog_noise = True
        if not dog_noise:
            log("no dog noises")

 if __name__ == "__main__":
    # Command line: no argument -> default input device, "<digits>" -> that
    # device index, "list" -> print the available input devices instead of
    # recording.
    args = sys.argv
    only_arg = args[1] if len(args) == 2 else None
    if only_arg is not None and only_arg.isdigit():
        deviceIndex = int(only_arg)
    else:
        deviceIndex = None
    device = ListenerDevice(deviceIndex)

    if only_arg != "list":
        # uncomment this listener to save the audio to a wav file as you speak, good for testing that it's working
        #device.add_listener(ListenerWavSave(device.audio_properties, "recording.wav"))
        # this is the listener which does the dog mood prediction
        device.add_listener(ListenerDogMoodDetector(device.audio_properties))
        device.listen(deviceIndex)
    else:
        device.list_devices()
	# inspiration from https://github.com/devicehive/devicehive-audio-analysis

	import sys
	import pyaudio
	import numpy as np
	import wave
	import os
	import tensorflow as tf

	import vggish_input
	import vggish_params
	import vggish_postprocess
	import vggish_slim


	from datetime import datetime
	def log(message):
	    """Print ``message`` to stdout, prefixed with the current local time.

	    The prefix has the form ``YYYY-MM-DD HH:MM:SS: ``. ``message`` must be
	    a string because it is concatenated onto the prefix.
	    """
	    time_str = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
	    print(time_str + ": " + message)


	class ListenerDevice:
	    """Capture audio from a PyAudio input device and fan it out to listeners.

	    A listener is any object exposing ``start()``, ``stop()`` and
	    ``process(data)``. ``process`` receives every captured buffer as a
	    one-dimensional ``numpy.int16`` array.
	    """

	    def __init__(self, device_index):
	        """Open PyAudio and record the properties of the chosen input device.

	        Args:
	            device_index: PyAudio device index to record from, or ``None``
	                to use the default input device.
	        """
	        self.audio = pyaudio.PyAudio()
	        self.listeners = []

	        if device_index is None:
	            device_info = self.audio.get_default_input_device_info()
	        else:
	            device_info = self.audio.get_device_info_by_index(device_index)

	        # Capture is requested as 16-bit mono at the device's default rate.
	        self.audio_properties = {
	            "index": device_info["index"],
	            "name": device_info["name"],
	            "sample_width": self.audio.get_sample_size(pyaudio.paInt16),
	            "sample_rate": int(device_info["defaultSampleRate"]),
	            "channels": 1,
	        }
	        print("selected audio device: ", self.audio_properties)

	    def __del__(self):
	        """Release PyAudio, tolerating a partially constructed instance."""
	        # __del__ also runs when __init__ raised before self.audio was set.
	        audio = getattr(self, "audio", None)
	        if audio is not None:
	            audio.terminate()

	    def list_devices(self):
	        """Print every device that has input channels, marking the default."""
	        audio = self.audio
	        default_index = audio.get_default_input_device_info()["index"]
	        # Query each device exactly once, then keep those able to record.
	        all_devices = (
	            audio.get_device_info_by_index(i)
	            for i in range(audio.get_device_count())
	        )
	        inputs = [d for d in all_devices if d.get('maxInputChannels', 0) > 0]
	        for device in inputs:
	            if device["index"] == default_index:
	                separator = " (default): "
	            else:
	                separator = ": "
	            print(str(device["index"]) + separator + device["name"])

	    def add_listener(self, listener):
	        """Register ``listener`` to receive captured audio buffers."""
	        self.listeners.append(listener)

	    def listen(self, device_index = None):
	        """Record until the user presses Enter, feeding all listeners.

	        The stream and every started listener are shut down even when a
	        listener fails to start or the prompt is interrupted; the
	        triggering exception is then re-raised.

	        Args:
	            device_index: unused; the device selected in ``__init__`` is
	                recorded. Kept so existing callers keep working.
	        """
	        record_stream = self.audio.open(
	            format=pyaudio.paInt16,
	            channels=self.audio_properties["channels"],
	            rate=self.audio_properties["sample_rate"],
	            # One callback per ``sample_rate`` frames, i.e. per second.
	            frames_per_buffer=self.audio_properties["sample_rate"],
	            input=True,
	            stream_callback=self._forward_audio_data,
	            input_device_index=self.audio_properties["index"],
	            # Do not capture until the listeners have been started below.
	            start=False,
	        )

	        started = []
	        try:
	            for listener in self.listeners:
	                listener.start()
	                started.append(listener)

	            record_stream.start_stream()

	            input("Press Enter to stop recording...")
	        finally:
	            try:
	                record_stream.stop_stream()
	                record_stream.close()
	            finally:
	                # Only listeners whose start() succeeded are stopped.
	                for listener in started:
	                    listener.stop()

	    def _forward_audio_data(self, in_data, frame_count, time_info, status):
	        """Stream callback: decode a buffer and pass it to every listener.

	        Args:
	            in_data: raw captured bytes, read as ``frame_count`` int16 values.
	            frame_count: number of frames in ``in_data``.
	            time_info: unused.
	            status: unused.

	        Returns:
	            ``(in_data, pyaudio.paContinue)`` so the stream keeps running.
	        """
	        data = np.frombuffer(in_data, dtype=np.int16, count=frame_count)
	        for listener in self.listeners:
	            listener.process(data)
	        return (in_data, pyaudio.paContinue)



	class ListenerWavSave:
	    """Listener that writes every audio buffer it receives to a WAV file."""

	    def __init__(self, audio_properties, filename):
	        """Remember the capture format and the output path.

	        Args:
	            audio_properties: dict providing "channels", "sample_width"
	                and "sample_rate" for the WAV header.
	            filename: path of the WAV file created by ``start``.
	        """
	        self.audio_properties = audio_properties
	        self.filename = filename

	    def start(self):
	        """Open the output file for writing and configure its header."""
	        output_file = wave.open(self.filename, "wb")
	        output_file.setnchannels(self.audio_properties["channels"])
	        output_file.setsampwidth(self.audio_properties["sample_width"])
	        output_file.setframerate(self.audio_properties["sample_rate"])
	        # Publish the writer only once it is fully configured.
	        self.output_file = output_file

	    def stop(self):
	        """Close the output file."""
	        self.output_file.close()

	    def process(self, audio_data):
	        """Append ``audio_data`` to the open WAV file."""
	        self.output_file.writeframes(audio_data)



	class ListenerDogMoodDetector:
	def __init__(self, audio_properties, model_path = ""):
	self.audio_properties = audio_properties
	self.pproc = vggish_postprocess.Postprocessor(os.path.join(model_path, "vggish_pca_params.npz"))
	#vggish
	graph_vgg = tf.Graph()
	with graph_vgg.as_default():
	self.vgg_sess = tf.Session()
	vggish_slim.define_vggish_slim(training=False)
	vggish_slim.load_vggish_slim_checkpoint(self.vgg_sess, os.path.join(model_path, "vggish_model.ckpt"))
	self.features_tensor = self.vgg_sess.graph.get_tensor_by_name(vggish_params.INPUT_TENSOR_NAME)
	self.embedding_tensor = self.vgg_sess.graph.get_tensor_by_name(vggish_params.OUTPUT_TENSOR_NAME)

	graph_yt = tf.Graph()
	with graph_yt.as_default():
	self.model_sess = tf.Session()
	model_file = os.path.join(model_path, "model.ckpt")
	saver = tf.train.import_meta_graph(model_file + ".meta", clear_devices=True, import_scope='m2')
	saver.restore(self.model_sess, model_file)
	init_op_list = []
	variables = tf.get_collection_ref(tf.GraphKeys.LOCAL_VARIABLES)
	for variable in list(variables):
	if "train_input" in variable.name:
	init_op_list.append(tf.assign(variable, 1))
	variables.remove(variable)
	init_op_list.append(tf.variables_initializer(variables))
	self.model_sess.run(init_op_list)
	self.input_batch_raw_tensor = self.model_sess.graph.get_collection("input_batch_raw")[0]
	self.num_frames_tensor = self.model_sess.graph.get_collection("num_frames")[0]
	self.predictions_tensor = self.model_sess.graph.get_collection("predictions")[0]

	self.labels = {
	0: "Speech",
	1: "Male speech, man speaking",
	2: "Female speech, woman speaking",
	3: "Child speech, kid speaking",
	4: "Conversation",
	5: "Narration, monologue",
	6: "Babbling",
	7: "Speech synthesizer",
	8: "Shout",
	9: "Bellow",
	10: "Whoop",
	11: "Yell",
	12: "Battle cry",
	13: "Children shouting",
	14: "Screaming",
	15: "Whispering",
	16: "Laughter",
	17: "Baby laughter",
	18: "Giggle",
	19: "Snicker",
	20: "Belly laugh",
	21: "Chuckle, chortle",
	22: "Crying, sobbing",
	23: "Baby cry, infant cry",
	24: "Whimper",
	25: "Wail, moan",
	26: "Sigh",
	27: "Singing",
	28: "Choir",
	29: "Yodeling",
	30: "Chant",
	31: "Mantra",
	32: "Male singing",
	33: "Female singing",
	34: "Child singing",
	35: "Synthetic singing",
	36: "Rapping",
	37: "Humming",
	38: "Groan",
	39: "Grunt",
	40: "Whistling",
	41: "Breathing",
	42: "Wheeze",
	43: "Snoring",
	44: "Gasp",
	45: "Pant",
	46: "Snort",
	47: "Cough",
	48: "Throat clearing",
	49: "Sneeze",
	50: "Sniff",
	51: "Run",
	52: "Shuffle",
	53: "Walk, footsteps",
	54: "Chewing, mastication",
	55: "Biting",
	56: "Gargling",
	57: "Stomach rumble",
	58: "Burping, eructation",
	59: "Hiccup",
	60: "Fart",
	61: "Hands",
	62: "Finger snapping",
	63: "Clapping",
	64: "Heart sounds, heartbeat",
	65: "Heart murmur",
	66: "Cheering",
	67: "Applause",
	68: "Chatter",
	69: "Crowd",
	70: "Hubbub, speech noise, speech babble",
	71: "Children playing",
	72: "Animal",
	73: "Domestic animals, pets",
	74: "Dog",
	75: "Bark",
	76: "Yip",
	77: "Howl",
	78: "Bow-wow",
	79: "Growling",
	80: "Whimper (dog)",
	81: "Cat",
	82: "Purr",
	83: "Meow",
	84: "Hiss",
	85: "Caterwaul",
	86: "Livestock, farm animals, working animals",
	87: "Horse",
	88: "Clip-clop",
	89: "Neigh, whinny",
	90: "Cattle, bovinae",
	91: "Moo",
	92: "Cowbell",
	93: "Pig",
	94: "Oink",
	95: "Goat",
	96: "Bleat",
	97: "Sheep",
	98: "Fowl",
	99: "Chicken, rooster",
	100: "Cluck",
	101: "Crowing, cock-a-doodle-doo",
	102: "Turkey",
	103: "Gobble",
	104: "Duck",
	105: "Quack",
	106: "Goose",
	107: "Honk",
	108: "Wild animals",
	109: "Roaring cats (lions, tigers)",
	110: "Roar",
	111: "Bird",
	112: "Bird vocalization, bird call, bird song",
	113: "Chirp, tweet",
	114: "Squawk",
	115: "Pigeon, dove",
	116: "Coo",
	117: "Crow",
	118: "Caw",
	119: "Owl",
	120: "Hoot",
	121: "Bird flight, flapping wings",
	122: "Canidae, dogs, wolves",
	123: "Rodents, rats, mice",
	124: "Mouse",
	125: "Patter",
	126: "Insect",
	127: "Cricket",
	128: "Mosquito",
	129: "Fly, housefly",
	130: "Buzz",
	131: "Bee, wasp, etc.",
	132: "Frog",
	133: "Croak",
	134: "Snake",
	135: "Rattle",
	136: "Whale vocalization",
	137: "Music",
	138: "Musical instrument",
	139: "Plucked string instrument",
	140: "Guitar",
	141: "Electric guitar",
	142: "Bass guitar",
	143: "Acoustic guitar",
	144: "Steel guitar, slide guitar",
	145: "Tapping (guitar technique)",
	146: "Strum",
	147: "Banjo",
	148: "Sitar",
	149: "Mandolin",
	150: "Zither",
	151: "Ukulele",
	152: "Keyboard (musical)",
	153: "Piano",
	154: "Electric piano",
	155: "Organ",
	156: "Electronic organ",
	157: "Hammond organ",
	158: "Synthesizer",
	159: "Sampler",
	160: "Harpsichord",
	161: "Percussion",
	162: "Drum kit",
	163: "Drum machine",
	164: "Drum",
	165: "Snare drum",
	166: "Rimshot",
	167: "Drum roll",
	168: "Bass drum",
	169: "Timpani",
	170: "Tabla",
	171: "Cymbal",
	172: "Hi-hat",
	173: "Wood block",
	174: "Tambourine",
	175: "Rattle (instrument)",
	176: "Maraca",
	177: "Gong",
	178: "Tubular bells",
	179: "Mallet percussion",
	180: "Marimba, xylophone",
	181: "Glockenspiel",
	182: "Vibraphone",
	183: "Steelpan",
	184: "Orchestra",
	185: "Brass instrument",
	186: "French horn",
	187: "Trumpet",
	188: "Trombone",
	189: "Bowed string instrument",
	190: "String section",
	191: "Violin, fiddle",
	192: "Pizzicato",
	193: "Cello",
	194: "Double bass",
	195: "Wind instrument, woodwind instrument",
	196: "Flute",
	197: "Saxophone",
	198: "Clarinet",
	199: "Harp",
	200: "Bell",
	201: "Church bell",
	202: "Jingle bell",
	203: "Bicycle bell",
	204: "Tuning fork",
	205: "Chime",
	206: "Wind chime",
	207: "Change ringing (campanology)",
	208: "Harmonica",
	209: "Accordion",
	210: "Bagpipes",
	211: "Didgeridoo",
	212: "Shofar",
	213: "Theremin",
	214: "Singing bowl",
	215: "Scratching (performance technique)",
	216: "Pop music",
	217: "Hip hop music",
	218: "Beatboxing",
	219: "Rock music",
	220: "Heavy metal",
	221: "Punk rock",
	222: "Grunge",
	223: "Progressive rock",
	224: "Rock and roll",
	225: "Psychedelic rock",
	226: "Rhythm and blues",
	227: "Soul music",
	228: "Reggae",
	229: "Country",
	230: "Swing music",
	231: "Bluegrass",
	232: "Funk",
	233: "Folk music",
	234: "Middle Eastern music",
	235: "Jazz",
	236: "Disco",
	237: "Classical music",
	238: "Opera",
	239: "Electronic music",
	240: "House music",
	241: "Techno",
	242: "Dubstep",
	243: "Drum and bass",
	244: "Electronica",
	245: "Electronic dance music",
	246: "Ambient music",
	247: "Trance music",
	248: "Music of Latin America",
	249: "Salsa music",
	250: "Flamenco",
	251: "Blues",
	252: "Music for children",
	253: "New-age music",
	254: "Vocal music",
	255: "A capella",
	256: "Music of Africa",
	257: "Afrobeat",
	258: "Christian music",
	259: "Gospel music",
	260: "Music of Asia",
	261: "Carnatic music",
	262: "Music of Bollywood",
	263: "Ska",
	264: "Traditional music",
	265: "Independent music",
	266: "Song",
	267: "Background music",
	268: "Theme music",
	269: "Jingle (music)",
	270: "Soundtrack music",
	271: "Lullaby",
	272: "Video game music",
	273: "Christmas music",
	274: "Dance music",
	275: "Wedding music",
	276: "Happy music",
	277: "Funny music",
	278: "Sad music",
	279: "Tender music",
	280: "Exciting music",
	281: "Angry music",
	282: "Scary music",
	283: "Wind",
	284: "Rustling leaves",
	285: "Wind noise (microphone)",
	286: "Thunderstorm",
	287: "Thunder",
	288: "Water",
	289: "Rain",
	290: "Raindrop",
	291: "Rain on surface",
	292: "Stream",
	293: "Waterfall",
	294: "Ocean",
	295: "Waves, surf",
	296: "Steam",
	297: "Gurgling",
	298: "Fire",
	299: "Crackle",
	300: "Vehicle",
	301: "Boat, Water vehicle",
	302: "Sailboat, sailing ship",
	303: "Rowboat, canoe, kayak",
	304: "Motorboat, speedboat",
	305: "Ship",
	306: "Motor vehicle (road)",
	307: "Car",
	308: "Vehicle horn, car horn, honking",
	309: "Toot",
	310: "Car alarm",
	311: "Power windows, electric windows",
	312: "Skidding",
	313: "Tire squeal",
	314: "Car passing by",
	315: "Race car, auto racing",
	316: "Truck",
	317: "Air brake",
	318: "Air horn, truck horn",
	319: "Reversing beeps",
	320: "Ice cream truck, ice cream van",
	321: "Bus",
	322: "Emergency vehicle",
	323: "Police car (siren)",
	324: "Ambulance (siren)",
	325: "Fire engine, fire truck (siren)",
	326: "Motorcycle",
	327: "Traffic noise, roadway noise",
	328: "Rail transport",
	329: "Train",
	330: "Train whistle",
	331: "Train horn",
	332: "Railroad car, train wagon",
	333: "Train wheels squealing",
	334: "Subway, metro, underground",
	335: "Aircraft",
	336: "Aircraft engine",
	337: "Jet engine",
	338: "Propeller, airscrew",
	339: "Helicopter",
	340: "Fixed-wing aircraft, airplane",
	341: "Bicycle",
	342: "Skateboard",
	343: "Engine",
	344: "Light engine (high frequency)",
	345: "Dental drill, dentist's drill",
	346: "Lawn mower",
	347: "Chainsaw",
	348: "Medium engine (mid frequency)",
	349: "Heavy engine (low frequency)",
	350: "Engine knocking",
	351: "Engine starting",
	352: "Idling",
	353: "Accelerating, revving, vroom",
	354: "Door",
	355: "Doorbell",
	356: "Ding-dong",
	357: "Sliding door",
	358: "Slam",
	359: "Knock",
	360: "Tap",
	361: "Squeak",
	362: "Cupboard open or close",
	363: "Drawer open or close",
	364: "Dishes, pots, and pans",
	365: "Cutlery, silverware",
	366: "Chopping (food)",
	367: "Frying (food)",
	368: "Microwave oven",
	369: "Blender",
	370: "Water tap, faucet",
	371: "Sink (filling or washing)",
	372: "Bathtub (filling or washing)",
	373: "Hair dryer",
	374: "Toilet flush",
	375: "Toothbrush",
	376: "Electric toothbrush",
	377: "Vacuum cleaner",
	378: "Zipper (clothing)",
	379: "Keys jangling",
	380: "Coin (dropping)",
	381: "Scissors",
	382: "Electric shaver, electric razor",
	383: "Shuffling cards",
	384: "Typing",
	385: "Typewriter",
	386: "Computer keyboard",
	387: "Writing",
	388: "Alarm",
	389: "Telephone",
	390: "Telephone bell ringing",
	391: "Ringtone",
	392: "Telephone dialing, DTMF",
	393: "Dial tone",
	394: "Busy signal",
	395: "Alarm clock",
	396: "Siren",
	397: "Civil defense siren",
	398: "Buzzer",
	399: "Smoke detector, smoke alarm",
	400: "Fire alarm",
	401: "Foghorn",
	402: "Whistle",
	403: "Steam whistle",
	404: "Mechanisms",
	405: "Ratchet, pawl",
	406: "Clock",
	407: "Tick",
	408: "Tick-tock",
	409: "Gears",
	410: "Pulleys",
	411: "Sewing machine",
	412: "Mechanical fan",
	413: "Air conditioning",
	414: "Cash register",
	415: "Printer",
	416: "Camera",
	417: "Single-lens reflex camera",
	418: "Tools",
	419: "Hammer",
	420: "Jackhammer",
	421: "Sawing",
	422: "Filing (rasp)",
	423: "Sanding",
	424: "Power tool",
	425: "Drill",
	426: "Explosion",
	427: "Gunshot, gunfire",
	428: "Machine gun",
	429: "Fusillade",
	430: "Artillery fire",
	431: "Cap gun",
	432: "Fireworks",
	433: "Firecracker",
	434: "Burst, pop",
	435: "Eruption",
	436: "Boom",
	437: "Wood",
	438: "Chop",
	439: "Splinter",
	440: "Crack",
	441: "Glass",
	442: "Chink, clink",
	443: "Shatter",
	444: "Liquid",
	445: "Splash, splatter",
	446: "Slosh",
	447: "Squish",
	448: "Drip",
	449: "Pour",
	450: "Trickle, dribble",
	451: "Gush",
	452: "Fill (with liquid)",
	453: "Spray",
	454: "Pump (liquid)",
	455: "Stir",
	456: "Boiling",
	457: "Sonar",
	458: "Arrow",
	459: "Whoosh, swoosh, swish",
	460: "Thump, thud",
	461: "Thunk",
	462: "Electronic tuner",
	463: "Effects unit",
	464: "Chorus effect",
	465: "Basketball bounce",
	466: "Bang",
	467: "Slap, smack",
	468: "Whack, thwack",
	469: "Smash, crash",
	470: "Breaking",
	471: "Bouncing",
	472: "Whip",
	473: "Flap",
	474: "Scratch",
	475: "Scrape",
	476: "Rub",
	477: "Roll",
	478: "Crushing",
	479: "Crumpling, crinkling",
	480: "Tearing",
	481: "Beep, bleep",
	482: "Ping",
	483: "Ding",
	484: "Clang",
	485: "Squeal",
	486: "Creak",
	487: "Rustle",
	488: "Whir",
	489: "Clatter",
	490: "Sizzle",
	491: "Clicking",
	492: "Clickety-clack",
	493: "Rumble",
	494: "Plop",
	495: "Jingle, tinkle",
	496: "Hum",
	497: "Zing",
	498: "Boing",
	499: "Crunch",
	500: "Silence",
	501: "Sine wave",
	502: "Harmonic",
	503: "Chirp tone",
	504: "Sound effect",
	505: "Pulse",
	506: "Inside, small room",
	507: "Inside, large room or hall",
	508: "Inside, public space",
	509: "Outside, urban or manmade",
	510: "Outside, rural or natural",
	511: "Reverberation",
	512: "Echo",
	513: "Noise",
	514: "Environmental noise",
	515: "Static",
	516: "Mains hum",
	517: "Distortion",
	518: "Sidetone",
	519: "Cacophony",
	520: "White noise",
	521: "Pink noise",
	522: "Throbbing",
	523: "Vibration",
	524: "Television",
	525: "Radio",
	526: "Field recording",
	}

	def __enter__(self):
	return self

	def __exit__(self, args, *kwargs):
	self.close()

	def close(self):
	if self.vggish_sess:
	self.vggish_sess.close()

	def start(self):
		"""Listener start hook; this listener has nothing to set up here."""
		return None

	def stop(self):
		"""Listener stop hook; this listener has nothing to tear down here."""
		return None

	def process(self, audio_data):
	# convert input data to vggish embedding
	embeddings = self._generate_embeddings(audio_data)
	# pass embedding to model
	label_weights = self._make_prediction(embeddings)
	# output prediction
	self._announce_prediction(label_weights)

	def _generate_embeddings(self, audio_data):
		"""Convert raw audio samples into post-processed embeddings.

		Args:
			audio_data: Numeric samples that support division by a float
				(presumably a NumPy array of 16-bit PCM values; verify
				against the caller).

		Returns:
			Whatever ``self.pproc.postprocess`` produces for the embedding
			batch fetched from ``self.vgg_sess``.
		"""
		# Scale the samples by 1/32768 before feature extraction.
		scaled_waveform = audio_data / 32768.0
		sample_rate = self.audio_properties["sample_rate"]
		examples = vggish_input.waveform_to_examples(scaled_waveform, sample_rate)

		# A single tensor is requested, so a single result comes back.
		(raw_embeddings,) = self.vgg_sess.run(
			[self.embedding_tensor],
			feed_dict={self.features_tensor: examples},
		)
		return self.pproc.postprocess(raw_embeddings)

	def _make_prediction(self, embedding):
	data = self._resize_input(embedding, 0, 300)
	data = np.expand_dims(data, 0)
	num_frames = np.expand_dims(1, 0)
	predictions, = self.model_sess.run(
	[self.predictions_tensor],
	feed_dict={
	self.input_batch_raw_tensor: data,
	self.num_frames_tensor: num_frames
	}
	)
	return predictions[0]

	def _resize_input(self, data, axis, new_size):
	shape = list(data.shape)
	pad_shape = shape[:]
	pad_shape[axis] = np.maximum(0, new_size - shape[axis])
	shape[axis] = np.minimum(shape[axis], new_size)
	shape = np.stack(shape)
	slices = [slice(0, s) for s in shape]
	resized = np.concatenate([
	data[slices],
	np.zeros(np.stack(pad_shape))
	], axis)

	# Update shape.
	new_shape = list(data.shape)
	new_shape[axis] = new_size
	resized.reshape(new_shape)
	return resized

	def _announce_prediction(self, label_weights):
		"""Log which of the five strongest labels are dog sounds.

		Label indices 75 through 80 are treated as dog noises. Each one found
		among the five highest-weighted labels is logged with its index, name
		and weight, strongest first. When none is found, a single
		"no dog noises" line is logged instead.

		Args:
			label_weights: Per-label weights, indexable by label index.
		"""
		# modify this to hook into whatever notification mechanism is applicable for the scenario
		strongest = np.argpartition(label_weights, -5)[-5:]
		ranked = sorted(strongest, key=lambda idx: label_weights[idx], reverse=True)

		dog_hits = [idx for idx in ranked if 75 <= idx <= 80]
		if not dog_hits:
			log("no dog noises")
			return

		for idx in dog_hits:
			log(str(idx) + ": " + self.labels[idx] + " - " + str(label_weights[idx]))


	if __name__ == "__main__":
		# Usage: pass a device index to record from that device, pass "list"
		# to print the input devices, or pass nothing for the default device.
		args = sys.argv
		has_single_arg = len(args) == 2
		# isdecimal() only accepts characters int() can parse, whereas
		# isdigit() also accepts e.g. superscript digits that make int() raise.
		device_index = int(args[1]) if has_single_arg and args[1].isdecimal() else None
		device = ListenerDevice(device_index)

		if has_single_arg and args[1] == "list":
			device.list_devices()
		else:
			# uncomment this listener to save the audio to a wav file as you speak, good for testing that it's working
			#device.add_listener(ListenerWavSave(device.audio_properties, "recording.wav"))
			# this is the listener which does the dog mood prediction
			device.add_listener(ListenerDogMoodDetector(device.audio_properties))
			device.listen(device_index)