-
-
Notifications
You must be signed in to change notification settings - Fork 1.4k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Support Web Speech API #661
base: v2-dev
Are you sure you want to change the base?
Changes from 1 commit
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,6 +1,7 @@ | ||
import { create } from 'zustand'; | ||
import { persist } from 'zustand/middleware'; | ||
import { useShallow } from 'zustand/react/shallow'; | ||
import { ASREngineList, TTSEngineList } from '~/common/components/useVoiceCapabilities'; | ||
|
||
import type { DLLMId } from '~/common/stores/llms/llms.types'; | ||
|
||
|
@@ -51,6 +52,12 @@ interface AppChatStore { | |
micTimeoutMs: number; | ||
setMicTimeoutMs: (micTimeoutMs: number) => void; | ||
|
||
TTSEngine: string; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. for now this could be: TTSEngine: 'elevenlabs' | 'webspeech', to force typescript to do its job. |
||
setTTSEngine: (TTSEngine: string) => void; | ||
|
||
ASREngine: string; | ||
setASREngine: (ASREngine: string) => void; | ||
|
||
showPersonaIcons: boolean; | ||
setShowPersonaIcons: (showPersonaIcons: boolean) => void; | ||
|
||
|
@@ -114,6 +121,12 @@ const useAppChatStore = create<AppChatStore>()(persist( | |
micTimeoutMs: 2000, | ||
setMicTimeoutMs: (micTimeoutMs: number) => _set({ micTimeoutMs }), | ||
|
||
TTSEngine: TTSEngineList[0], | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. If TTSEngine: 'elevenlabs' | 'webspeech', then this becomes one of the two values (probably 'WebSpeech' by default) -- then the conversion to a nice string can be done in the settings UI, and in the code we only match against those IDs. As an alternative this could be left |
||
setTTSEngine: (TTSEngine: string) => _set({ TTSEngine }), | ||
|
||
ASREngine: ASREngineList[0], | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. same, we could keep an |
||
setASREngine: (ASREngine: string) => _set({ ASREngine }), | ||
|
||
showPersonaIcons: true, | ||
setShowPersonaIcons: (showPersonaIcons: boolean) => _set({ showPersonaIcons }), | ||
|
||
|
@@ -198,6 +211,13 @@ export const useChatMicTimeoutMsValue = (): number => | |
export const useChatMicTimeoutMs = (): [number, (micTimeoutMs: number) => void] => | ||
useAppChatStore(useShallow(state => [state.micTimeoutMs, state.setMicTimeoutMs])); | ||
|
||
/**
 * React hook returning the selected TTS engine and its setter as a `[value, setter]` tuple.
 *
 * The selector is wrapped in `useShallow`, so the tuple is compared shallowly
 * between store updates instead of by reference.
 */
export const useTTSEngine = (): [string, (TTSEngine: string) => void] =>
  useAppChatStore(useShallow(state => [state.TTSEngine, state.setTTSEngine]));
// Reads the selected TTS engine directly from the store state via getState() (a plain read, not a hook).
export const getTTSEngine = () => useAppChatStore.getState().TTSEngine; | ||
|
||
/**
 * React hook returning the selected ASR engine and its setter as a `[value, setter]` tuple.
 *
 * The selector is wrapped in `useShallow`, so the tuple is compared shallowly
 * between store updates instead of by reference.
 */
export const useASREngine = (): [string, (ASREngine: string) => void] =>
  useAppChatStore(useShallow(state => [state.ASREngine, state.setASREngine]));
|
||
export const useChatDrawerFilters = () => { | ||
const values = useAppChatStore(useShallow(state => ({ | ||
filterHasDocFragments: state.filterHasDocFragments, | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,74 @@ | ||
import { getTTSEngine } from 'src/apps/chat/store-app-chat'; | ||
import { CapabilitySpeechSynthesis } from '~/common/components/useCapabilities'; | ||
|
||
import { useCapability as useElevenlabsCapability } from '~/modules/elevenlabs/elevenlabs.client' | ||
import { speakText as elevenlabsSpeakText } from '~/modules/elevenlabs/elevenlabs.client' | ||
import { EXPERIMENTAL_speakTextStream as EXPERIMENTAL_elevenlabsSpeakTextStream } from '~/modules/elevenlabs/elevenlabs.client' | ||
|
||
import { useCapability as useBrowserSpeechSynthesisCapability } from '~/modules/browser/speech-synthesis/browser.speechSynthesis.client' | ||
import { speakText as browserSpeechSynthesisSpeakText } from '~/modules/browser/speech-synthesis/browser.speechSynthesis.client' | ||
import { EXPERIMENTAL_speakTextStream as EXPERIMENTAL_browserSpeechSynthesisSpeakTextStream } from '~/modules/browser/speech-synthesis/browser.speechSynthesis.client' | ||
|
||
import { useElevenLabsVoices } from '~/modules/elevenlabs/useElevenLabsVoiceDropdown'; | ||
import { useBrowserSpeechVoices } from '~/modules/browser/speech-synthesis/useBrowserSpeechVoiceDropdown'; | ||
|
||
export const TTSEngineList: string[] = [ | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. See other note on this. I wonder if this would be better served as a map: export type TTSEngineKey = 'elevenlabs' | 'webspeech';
const TTSEngineList: { [key in TTSEngineKey]: { label: string, priority: number } } = {
'elevenlabs': {
label: 'ElevenLabs',
priority: 2,
},
'webspeech': {
label: 'Web Speech API',
priority: 1,
},
}; I've added an attribute called 'priority' to show how we can extend it in the future, for instance with default configurations. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. After trying it, I think using a map for extensibility is a good idea, but I suggest using a list of maps instead: export type TTSEngineKey = 'Elevenlabs' | 'Web Speech API';
export type ASREngineKey = 'Web Speech API';
export const TTSEngineList: { key: TTSEngineKey; label: string }[] = [
{
key: 'Elevenlabs',
label: 'ElevenLabs',
},
{
key: 'Web Speech API',
label: 'Web Speech API',
},
]; because there are only 2 cases using the variable now, one is |
||
'Elevenlabs', | ||
'Web Speech API' | ||
] | ||
|
||
// Names of the selectable ASR engines; the chat store initializes `ASREngine` to the first entry.
export const ASREngineList: string[] = [ | ||
'Web Speech API' | ||
] | ||
|
||
/**
 * Picks the voices hook that corresponds to the TTS engine currently stored
 * in the chat store.
 *
 * The hook function itself is returned (not its result); the caller invokes it.
 *
 * @throws Error when the stored engine name matches none of the known engines.
 */
export function getConditionalVoices() {
  switch (getTTSEngine()) {
    case 'Elevenlabs':
      return useElevenLabsVoices;
    case 'Web Speech API':
      return useBrowserSpeechVoices;
    default:
      throw new Error('TTSEngine is not found');
  }
}
|
||
/**
 * Reports whether the voices hook of the currently selected TTS engine
 * exposes any voices.
 *
 * The selected hook is invoked exactly once per call; the previous debug
 * `console.log` invoked it a second time on every call and has been removed.
 *
 * NOTE(review): the selected function is named like a React hook (`use…`), so
 * this presumably has to run during a component render — confirm at call sites.
 *
 * @returns the `hasVoices` flag reported by the selected voices hook.
 * @throws Error when the stored engine name matches none of the known engines.
 */
export function hasVoices(): boolean {
  const useSelectedVoices = getConditionalVoices();
  return useSelectedVoices().hasVoices;
}
|
||
/**
 * Picks the capability hook that corresponds to the TTS engine currently
 * stored in the chat store.
 *
 * The hook function itself is returned (not its result); the caller invokes it.
 *
 * @throws Error when the stored engine name matches none of the known engines.
 */
export function getConditionalCapability(): () => CapabilitySpeechSynthesis {
  switch (getTTSEngine()) {
    case 'Elevenlabs':
      return useElevenlabsCapability;
    case 'Web Speech API':
      return useBrowserSpeechSynthesisCapability;
    default:
      throw new Error('TTSEngine is not found');
  }
}
|
||
export function useCapability(): CapabilitySpeechSynthesis { | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Crash issue traced to this hook (the one that gave the black screen in the screenshot). It seems that when switching providers, there's a React hook-ordering issue. It happens only when switching TTS providers, I believe. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. It's possible that to fix this properly, we may have to overhaul the ttsengine reactivity (hooks) |
||
return getConditionalCapability()(); | ||
} | ||
|
||
|
||
export async function speakText(text: string, voiceId?: string) { | ||
const TTSEngine = getTTSEngine(); | ||
if (TTSEngine === 'Elevenlabs') { | ||
return await elevenlabsSpeakText(text, voiceId); | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This is a way of doing it. I wonder if we should have a common interface implemented by all the TTS providers (e.g. OpenAI has 2 as well, one via their TTS and one in the new [audio] models; there's also Play.ht and more). A way to reduce the switch-cases (or ifs) would be to have a common interface, such as ISpeechSynthesis, that will be returned by getTTSEngine (basically instead of the string it would return an object that implements the interface). Just a thought. |
||
}else if (TTSEngine === 'Web Speech API') { | ||
return await browserSpeechSynthesisSpeakText(text, voiceId); | ||
} | ||
throw new Error('TTSEngine is not found'); | ||
} | ||
|
||
// let liveAudioPlayer: LiveAudioPlayer | undefined = undefined; | ||
|
||
/**
 * Forwards a streaming speak request to the implementation of the TTS engine
 * currently stored in the chat store.
 *
 * @param text - the text to speak.
 * @param voiceId - optional voice identifier, passed through unchanged.
 * @returns whatever the selected engine's streaming implementation resolves to.
 * @throws Error when the stored engine name matches none of the known engines.
 */
export async function EXPERIMENTAL_speakTextStream(text: string, voiceId?: string) {
  switch (getTTSEngine()) {
    case 'Elevenlabs':
      return await EXPERIMENTAL_elevenlabsSpeakTextStream(text, voiceId);
    case 'Web Speech API':
      return await EXPERIMENTAL_browserSpeechSynthesisSpeakTextStream(text, voiceId);
    default:
      throw new Error('TTSEngine is not found');
  }
}
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I love this.