hiepxanh · January 27, 2025 03:41 · hiepxanh · Jan 27, 2025
diff --git a/kokoro-deepinfra.js b/kokoro-deepinfra.js
 import { getRequestHeaders } from '../../../script.js';
 import { callGenericPopup, POPUP_RESULT, POPUP_TYPE } from '../../popup.js';
 import { getPreviewString, saveTtsProviderSettings } from './index.js';

 export { KokoroTtsProvider };

 /**
  * Text-to-speech provider that posts text to a configurable HTTP endpoint and
  * expects a JSON reply whose `audio` field holds base64-encoded audio.
  *
  * Settings are edited through the `openai_compatible_*` inputs rendered by
  * `settingsHtml` and persisted with `saveTtsProviderSettings()`.
  */
 class KokoroTtsProvider {
    /** Active settings; assigned by `loadSettings()`. */
    settings;
    /** Voice objects cached by `getVoice()`; emptied whenever the voice list may have changed. */
    voices = [];
    separator = ' . ';

    /** Audio element used by `previewTtsVoice()`. */
    audioElement = document.createElement('audio');

    /** Fallback values; its keys are also the only keys `loadSettings()` accepts. */
    defaultSettings = {
        voiceMap: {},
        model: 'kokoro', // Default to your model
        speed: 1,
        available_voices: ['af_sky', 'af_bella', 'af', 'af_nicole', 'af_sarah', 'af_sky+af_bella', 'af_sky+af_nicole', 'af_sky+af_nicole+af_bella', 'bf_emma', 'bf_isabella', 'af_sky+af_nicole+af_bella+bf_isabella', 'bf_isabella+af_sky+af_nicole+af_bella+bf_isabella', 'bf_isabella+bf_isabella+bf_isabella+af_sky+af_nicole+af_bella+bf_isabella'], // Voices supported by your model
        provider_endpoint: 'http://localhost:8880/v1/audio/speech', // Your Python API endpoint
    };

    /**
     * Decodes a base64 payload into raw bytes.
     *
     * Accepts a bare base64 string or a `data:` URI (only the segment after the
     * first comma is used), the URL-safe alphabet (`-` and `_`), and missing `=`
     * padding.
     *
     * @param {string} encoded Base64 text taken from the endpoint's response.
     * @returns {Uint8Array} The decoded bytes.
     * @throws {DOMException} When `atob` rejects the cleaned-up string.
     */
    static decodeBase64Audio(encoded) {
        let base64 = encoded.includes(',') ? encoded.split(',')[1] : encoded;
        base64 = base64.replaceAll('-', '+').replaceAll('_', '/');
        base64 = base64.padEnd(Math.ceil(base64.length / 4) * 4, '=');

        // atob yields one character per byte; copy the char codes across.
        const binary = atob(base64);
        const bytes = new Uint8Array(binary.length);
        for (let i = 0; i < binary.length; i++) {
            bytes[i] = binary.charCodeAt(i);
        }
        return bytes;
    }

    /**
     * Markup for the provider's settings panel, pre-filled with the default values.
     *
     * @returns {string} HTML for the endpoint, model, voice list and speed controls.
     */
    get settingsHtml() {
        return `
        <label for="openai_compatible_tts_endpoint">Provider Endpoint:</label>
        <div class="flex-container alignItemsCenter">
            <div class="flex1">
                <input id="openai_compatible_tts_endpoint" type="text" class="text_pole" maxlength="250" value="${this.defaultSettings.provider_endpoint}"/>
            </div>
            <div id="openai_compatible_tts_key" class="menu_button menu_button_icon">
                <i class="fa-solid fa-key"></i>
                <span>API Key</span>
            </div>
        </div>
        <label for="openai_compatible_model">Model:</label>
        <input id="openai_compatible_model" type="text" class="text_pole" maxlength="250" value="${this.defaultSettings.model}"/>
        <label for="openai_compatible_tts_voices">Available Voices (comma separated):</label>
        <input id="openai_compatible_tts_voices" type="text" class="text_pole" maxlength="250" value="${this.defaultSettings.available_voices.join()}"/>
        <label for="openai_compatible_tts_speed">Speed: <span id="openai_compatible_tts_speed_output"></span></label>
        <input type="range" id="openai_compatible_tts_speed" value="1" min="0.25" max="4" step="0.05">`;
    }

    /**
     * Applies stored settings on top of the defaults and syncs the settings inputs.
     *
     * @param {object} settings Stored settings; only keys present in `defaultSettings` are allowed.
     * @returns {Promise<void>}
     * @throws {string} When `settings` contains a key that `defaultSettings` does not define.
     */
    async loadSettings(settings) {
        const incoming = settings ?? {};
        if (Object.keys(incoming).length === 0) {
            console.info('Using default TTS Provider settings');
        }

        // Work on a copy: assigning defaultSettings itself would let every later
        // change overwrite the defaults.
        this.settings = {
            ...this.defaultSettings,
            voiceMap: { ...this.defaultSettings.voiceMap },
            available_voices: [...this.defaultSettings.available_voices],
        };

        // Only accept keys defined in defaultSettings
        for (const [key, value] of Object.entries(incoming)) {
            if (!Object.hasOwn(this.defaultSettings, key)) {
                // Kept as a plain string so the thrown value is unchanged for callers.
                throw `Invalid setting passed to TTS Provider: ${key}`;
            }
            this.settings[key] = value;
        }

        // The voice list may have changed, so drop any cached voice objects.
        this.voices = [];

        const onInput = () => this.onSettingsChange();
        const bindInput = (selector, value) => {
            $(selector).val(value);
            // Namespaced so a repeated loadSettings() replaces the handler instead of stacking another one.
            $(selector).off('input.kokoro_tts').on('input.kokoro_tts', onInput);
        };

        bindInput('#openai_compatible_tts_endpoint', this.settings.provider_endpoint);
        bindInput('#openai_compatible_model', this.settings.model);
        bindInput('#openai_compatible_tts_voices', this.settings.available_voices.join());
        bindInput('#openai_compatible_tts_speed', this.settings.speed);

        $('#openai_compatible_tts_speed_output').text(this.settings.speed);

        await this.checkReady();

        console.debug('OpenAI Compatible TTS: Settings loaded');
    }

    /**
     * Reads the current values of the settings inputs into `this.settings`,
     * refreshes the speed label and persists the result.
     */
    onSettingsChange() {
        this.settings.provider_endpoint = String($('#openai_compatible_tts_endpoint').val());
        this.settings.model = String($('#openai_compatible_model').val());
        // Trim each name and drop blanks so "a, b," does not produce " b" or "".
        this.settings.available_voices = String($('#openai_compatible_tts_voices').val())
            .split(',')
            .map(name => name.trim())
            .filter(name => name.length > 0);
        this.settings.speed = Number($('#openai_compatible_tts_speed').val());

        // Cached voice objects were built from the previous list.
        this.voices = [];

        $('#openai_compatible_tts_speed_output').text(this.settings.speed);
        saveTtsProviderSettings();
    }

    /**
     * Builds the voice list once; rejects if the settings cannot produce one.
     *
     * @returns {Promise<void>}
     */
    async checkReady() {
        await this.fetchTtsVoiceObjects();
    }

    /** Nothing to refresh: the voice list comes straight from the settings. */
    async onRefreshClick() {
        return;
    }

    /**
     * Looks up a voice object by name.
     *
     * @param {string} voiceName Name as listed in the `available_voices` setting.
     * @returns {Promise<{name: string, voice_id: string, lang: string}>} The matching voice object.
     * @throws {string} When no voice has that name.
     */
    async getVoice(voiceName) {
        if (this.voices.length === 0) {
            this.voices = await this.fetchTtsVoiceObjects();
        }

        const match = this.voices.find(voice => voice.name === voiceName);
        if (!match) {
            // Kept as a plain string so the thrown value is unchanged for callers.
            throw `TTS Voice name ${voiceName} not found`;
        }
        return match;
    }

    /**
     * Generates speech for `text` with the given voice.
     *
     * @param {string} text Text to synthesize.
     * @param {string} voiceId Voice identifier sent to the endpoint.
     * @returns {Promise<Response>} Response whose body is the decoded audio.
     */
    async generateTts(text, voiceId) {
        return this.fetchTtsGeneration(text, voiceId);
    }

    /**
     * Turns the `available_voices` setting into voice objects.
     *
     * @returns {Promise<Array<{name: string, voice_id: string, lang: string}>>} One object per configured voice.
     */
    async fetchTtsVoiceObjects() {
        return this.settings.available_voices.map(voice => ({ name: voice, voice_id: voice, lang: 'en-US' }));
    }

    /**
     * Synthesizes the preview sentence with the given voice and plays it.
     *
     * @param {string} voiceId Voice identifier to preview.
     * @returns {Promise<void>}
     * @throws {Error} When generation fails.
     */
    async previewTtsVoice(voiceId) {
        this.audioElement.pause();
        this.audioElement.currentTime = 0;

        const text = getPreviewString('en-US');
        const response = await this.fetchTtsGeneration(text, voiceId);
        if (!response.ok) {
            throw new Error(`HTTP ${response.status}`);
        }

        const audio = await response.blob();
        const url = URL.createObjectURL(audio);
        const previousUrl = this.audioElement.src;
        const release = () => URL.revokeObjectURL(url);

        // Register the cleanup before playback starts so it cannot be missed.
        this.audioElement.onended = release;
        this.audioElement.src = url;

        // A preview that was cut short never fired `ended`; free its URL now.
        if (typeof previousUrl === 'string' && previousUrl.startsWith('blob:')) {
            URL.revokeObjectURL(previousUrl);
        }

        try {
            await this.audioElement.play();
        } catch (error) {
            // play() rejects when playback is blocked or interrupted; free the URL
            // and report it instead of leaving an unhandled rejection.
            release();
            console.error('TTS preview playback failed:', error);
        }
    }

    /**
     * Requests speech from the configured endpoint and wraps the decoded audio
     * in a `Response` so callers can read it like a regular fetch result.
     *
     * @param {string} inputText Text to synthesize.
     * @param {string} voiceId Voice identifier sent to the endpoint.
     * @returns {Promise<Response>} A 200 response carrying the audio blob.
     * @throws {Error} On a non-OK HTTP status, a missing `audio` field, or undecodable base64.
     */
    async fetchTtsGeneration(inputText, voiceId) {
        console.info(`Generating new TTS for voice_id ${voiceId}`);
        const response = await fetch(this.settings.provider_endpoint, {
            method: 'POST',
            headers: {
                'Content-Type': 'application/json',
            },
            body: JSON.stringify({
                model: this.settings.model,
                voice: voiceId,
                text: inputText,
                response_format: 'wav',
                speed: this.settings.speed,
            }),
        });

        if (!response.ok) {
            const responseText = await response.text();
            console.error('Response Text:', responseText);
            toastr.error(response.statusText, 'TTS Generation Failed');
            throw new Error(`HTTP ${response.status}: ${responseText}`);
        }

        const result = await response.json();
        const encodedAudio = result?.audio;
        if (typeof encodedAudio !== 'string' || encodedAudio.length === 0) {
            throw new Error('Audio field is missing or null in the response');
        }

        // Log metadata only; dumping the whole base64 payload floods the console.
        console.debug('Result:', { response_format: result.response_format, audioLength: encodedAudio.length });

        let bytes;
        try {
            bytes = KokoroTtsProvider.decodeBase64Audio(encodedAudio);
        } catch (error) {
            console.error('Base64 decoding error:', error);
            console.error('Base64 string preview:', `${encodedAudio.substring(0, 100)}...`);
            throw new Error(`Failed to decode base64 audio data: ${error.message}`, { cause: error });
        }

        // Anything other than mp3 is treated as wav.
        const mimeType = result.response_format === 'mp3' ? 'audio/mpeg' : 'audio/wav';
        const audioBlob = new Blob([bytes], { type: mimeType });

        return new Response(audioBlob, {
            status: 200,
            headers: {
                'Content-Type': mimeType,
            },
        });
    }
 }
	import { getRequestHeaders } from '../../../script.js';
	import { callGenericPopup, POPUP_RESULT, POPUP_TYPE } from '../../popup.js';
	import { getPreviewString, saveTtsProviderSettings } from './index.js';

	export { KokoroTtsProvider };

	/**
	 * Text-to-speech provider that posts text to a configurable HTTP endpoint and
	 * expects a JSON reply whose `audio` field holds base64-encoded audio.
	 *
	 * Settings are edited through the `openai_compatible_*` inputs rendered by
	 * `settingsHtml` and persisted with `saveTtsProviderSettings()`.
	 */
	class KokoroTtsProvider {
	    /** Active settings; assigned by `loadSettings()`. */
	    settings;
	    /** Voice objects cached by `getVoice()`; emptied whenever the voice list may have changed. */
	    voices = [];
	    separator = ' . ';

	    /** Audio element used by `previewTtsVoice()`. */
	    audioElement = document.createElement('audio');

	    /** Fallback values; its keys are also the only keys `loadSettings()` accepts. */
	    defaultSettings = {
	        voiceMap: {},
	        model: 'kokoro', // Default to your model
	        speed: 1,
	        available_voices: ['af_sky', 'af_bella', 'af', 'af_nicole', 'af_sarah', 'af_sky+af_bella', 'af_sky+af_nicole', 'af_sky+af_nicole+af_bella', 'bf_emma', 'bf_isabella', 'af_sky+af_nicole+af_bella+bf_isabella', 'bf_isabella+af_sky+af_nicole+af_bella+bf_isabella', 'bf_isabella+bf_isabella+bf_isabella+af_sky+af_nicole+af_bella+bf_isabella'], // Voices supported by your model
	        provider_endpoint: 'http://localhost:8880/v1/audio/speech', // Your Python API endpoint
	    };

	    /**
	     * Decodes a base64 payload into raw bytes.
	     *
	     * Accepts a bare base64 string or a `data:` URI (only the segment after the
	     * first comma is used), the URL-safe alphabet (`-` and `_`), and missing `=`
	     * padding.
	     *
	     * @param {string} encoded Base64 text taken from the endpoint's response.
	     * @returns {Uint8Array} The decoded bytes.
	     * @throws {DOMException} When `atob` rejects the cleaned-up string.
	     */
	    static decodeBase64Audio(encoded) {
	        let base64 = encoded.includes(',') ? encoded.split(',')[1] : encoded;
	        base64 = base64.replaceAll('-', '+').replaceAll('_', '/');
	        base64 = base64.padEnd(Math.ceil(base64.length / 4) * 4, '=');

	        // atob yields one character per byte; copy the char codes across.
	        const binary = atob(base64);
	        const bytes = new Uint8Array(binary.length);
	        for (let i = 0; i < binary.length; i++) {
	            bytes[i] = binary.charCodeAt(i);
	        }
	        return bytes;
	    }

	    /**
	     * Markup for the provider's settings panel, pre-filled with the default values.
	     *
	     * @returns {string} HTML for the endpoint, model, voice list and speed controls.
	     */
	    get settingsHtml() {
	        return `
	<label for="openai_compatible_tts_endpoint">Provider Endpoint:</label>
	<div class="flex-container alignItemsCenter">
	<div class="flex1">
	<input id="openai_compatible_tts_endpoint" type="text" class="text_pole" maxlength="250" value="${this.defaultSettings.provider_endpoint}"/>
	</div>
	<div id="openai_compatible_tts_key" class="menu_button menu_button_icon">
	<i class="fa-solid fa-key"></i>
	<span>API Key</span>
	</div>
	</div>
	<label for="openai_compatible_model">Model:</label>
	<input id="openai_compatible_model" type="text" class="text_pole" maxlength="250" value="${this.defaultSettings.model}"/>
	<label for="openai_compatible_tts_voices">Available Voices (comma separated):</label>
	<input id="openai_compatible_tts_voices" type="text" class="text_pole" maxlength="250" value="${this.defaultSettings.available_voices.join()}"/>
	<label for="openai_compatible_tts_speed">Speed: <span id="openai_compatible_tts_speed_output"></span></label>
	<input type="range" id="openai_compatible_tts_speed" value="1" min="0.25" max="4" step="0.05">`;
	    }

	    /**
	     * Applies stored settings on top of the defaults and syncs the settings inputs.
	     *
	     * @param {object} settings Stored settings; only keys present in `defaultSettings` are allowed.
	     * @returns {Promise<void>}
	     * @throws {string} When `settings` contains a key that `defaultSettings` does not define.
	     */
	    async loadSettings(settings) {
	        const incoming = settings ?? {};
	        if (Object.keys(incoming).length === 0) {
	            console.info('Using default TTS Provider settings');
	        }

	        // Work on a copy: assigning defaultSettings itself would let every later
	        // change overwrite the defaults.
	        this.settings = {
	            ...this.defaultSettings,
	            voiceMap: { ...this.defaultSettings.voiceMap },
	            available_voices: [...this.defaultSettings.available_voices],
	        };

	        // Only accept keys defined in defaultSettings
	        for (const [key, value] of Object.entries(incoming)) {
	            if (!Object.hasOwn(this.defaultSettings, key)) {
	                // Kept as a plain string so the thrown value is unchanged for callers.
	                throw `Invalid setting passed to TTS Provider: ${key}`;
	            }
	            this.settings[key] = value;
	        }

	        // The voice list may have changed, so drop any cached voice objects.
	        this.voices = [];

	        const onInput = () => this.onSettingsChange();
	        const bindInput = (selector, value) => {
	            $(selector).val(value);
	            // Namespaced so a repeated loadSettings() replaces the handler instead of stacking another one.
	            $(selector).off('input.kokoro_tts').on('input.kokoro_tts', onInput);
	        };

	        bindInput('#openai_compatible_tts_endpoint', this.settings.provider_endpoint);
	        bindInput('#openai_compatible_model', this.settings.model);
	        bindInput('#openai_compatible_tts_voices', this.settings.available_voices.join());
	        bindInput('#openai_compatible_tts_speed', this.settings.speed);

	        $('#openai_compatible_tts_speed_output').text(this.settings.speed);

	        await this.checkReady();

	        console.debug('OpenAI Compatible TTS: Settings loaded');
	    }

	    /**
	     * Reads the current values of the settings inputs into `this.settings`,
	     * refreshes the speed label and persists the result.
	     */
	    onSettingsChange() {
	        this.settings.provider_endpoint = String($('#openai_compatible_tts_endpoint').val());
	        this.settings.model = String($('#openai_compatible_model').val());
	        // Trim each name and drop blanks so "a, b," does not produce " b" or "".
	        this.settings.available_voices = String($('#openai_compatible_tts_voices').val())
	            .split(',')
	            .map(name => name.trim())
	            .filter(name => name.length > 0);
	        this.settings.speed = Number($('#openai_compatible_tts_speed').val());

	        // Cached voice objects were built from the previous list.
	        this.voices = [];

	        $('#openai_compatible_tts_speed_output').text(this.settings.speed);
	        saveTtsProviderSettings();
	    }

	    /**
	     * Builds the voice list once; rejects if the settings cannot produce one.
	     *
	     * @returns {Promise<void>}
	     */
	    async checkReady() {
	        await this.fetchTtsVoiceObjects();
	    }

	    /** Nothing to refresh: the voice list comes straight from the settings. */
	    async onRefreshClick() {
	        return;
	    }

	    /**
	     * Looks up a voice object by name.
	     *
	     * @param {string} voiceName Name as listed in the `available_voices` setting.
	     * @returns {Promise<{name: string, voice_id: string, lang: string}>} The matching voice object.
	     * @throws {string} When no voice has that name.
	     */
	    async getVoice(voiceName) {
	        if (this.voices.length === 0) {
	            this.voices = await this.fetchTtsVoiceObjects();
	        }

	        const match = this.voices.find(voice => voice.name === voiceName);
	        if (!match) {
	            // Kept as a plain string so the thrown value is unchanged for callers.
	            throw `TTS Voice name ${voiceName} not found`;
	        }
	        return match;
	    }

	    /**
	     * Generates speech for `text` with the given voice.
	     *
	     * @param {string} text Text to synthesize.
	     * @param {string} voiceId Voice identifier sent to the endpoint.
	     * @returns {Promise<Response>} Response whose body is the decoded audio.
	     */
	    async generateTts(text, voiceId) {
	        return this.fetchTtsGeneration(text, voiceId);
	    }

	    /**
	     * Turns the `available_voices` setting into voice objects.
	     *
	     * @returns {Promise<Array<{name: string, voice_id: string, lang: string}>>} One object per configured voice.
	     */
	    async fetchTtsVoiceObjects() {
	        return this.settings.available_voices.map(voice => ({ name: voice, voice_id: voice, lang: 'en-US' }));
	    }

	    /**
	     * Synthesizes the preview sentence with the given voice and plays it.
	     *
	     * @param {string} voiceId Voice identifier to preview.
	     * @returns {Promise<void>}
	     * @throws {Error} When generation fails.
	     */
	    async previewTtsVoice(voiceId) {
	        this.audioElement.pause();
	        this.audioElement.currentTime = 0;

	        const text = getPreviewString('en-US');
	        const response = await this.fetchTtsGeneration(text, voiceId);
	        if (!response.ok) {
	            throw new Error(`HTTP ${response.status}`);
	        }

	        const audio = await response.blob();
	        const url = URL.createObjectURL(audio);
	        const previousUrl = this.audioElement.src;
	        const release = () => URL.revokeObjectURL(url);

	        // Register the cleanup before playback starts so it cannot be missed.
	        this.audioElement.onended = release;
	        this.audioElement.src = url;

	        // A preview that was cut short never fired `ended`; free its URL now.
	        if (typeof previousUrl === 'string' && previousUrl.startsWith('blob:')) {
	            URL.revokeObjectURL(previousUrl);
	        }

	        try {
	            await this.audioElement.play();
	        } catch (error) {
	            // play() rejects when playback is blocked or interrupted; free the URL
	            // and report it instead of leaving an unhandled rejection.
	            release();
	            console.error('TTS preview playback failed:', error);
	        }
	    }

	    /**
	     * Requests speech from the configured endpoint and wraps the decoded audio
	     * in a `Response` so callers can read it like a regular fetch result.
	     *
	     * @param {string} inputText Text to synthesize.
	     * @param {string} voiceId Voice identifier sent to the endpoint.
	     * @returns {Promise<Response>} A 200 response carrying the audio blob.
	     * @throws {Error} On a non-OK HTTP status, a missing `audio` field, or undecodable base64.
	     */
	    async fetchTtsGeneration(inputText, voiceId) {
	        console.info(`Generating new TTS for voice_id ${voiceId}`);
	        const response = await fetch(this.settings.provider_endpoint, {
	            method: 'POST',
	            headers: {
	                'Content-Type': 'application/json',
	            },
	            body: JSON.stringify({
	                model: this.settings.model,
	                voice: voiceId,
	                text: inputText,
	                response_format: 'wav',
	                speed: this.settings.speed,
	            }),
	        });

	        if (!response.ok) {
	            const responseText = await response.text();
	            console.error('Response Text:', responseText);
	            toastr.error(response.statusText, 'TTS Generation Failed');
	            throw new Error(`HTTP ${response.status}: ${responseText}`);
	        }

	        const result = await response.json();
	        const encodedAudio = result?.audio;
	        if (typeof encodedAudio !== 'string' || encodedAudio.length === 0) {
	            throw new Error('Audio field is missing or null in the response');
	        }

	        // Log metadata only; dumping the whole base64 payload floods the console.
	        console.debug('Result:', { response_format: result.response_format, audioLength: encodedAudio.length });

	        let bytes;
	        try {
	            bytes = KokoroTtsProvider.decodeBase64Audio(encodedAudio);
	        } catch (error) {
	            console.error('Base64 decoding error:', error);
	            console.error('Base64 string preview:', `${encodedAudio.substring(0, 100)}...`);
	            throw new Error(`Failed to decode base64 audio data: ${error.message}`, { cause: error });
	        }

	        // Anything other than mp3 is treated as wav.
	        const mimeType = result.response_format === 'mp3' ? 'audio/mpeg' : 'audio/wav';
	        const audioBlob = new Blob([bytes], { type: mimeType });

	        return new Response(audioBlob, {
	            status: 200,
	            headers: {
	                'Content-Type': mimeType,
	            },
	        });
	    }
	}