feat: add tts stt

commit 2f410fc09f
parent 718782f5b1
@@ -20,6 +20,7 @@ export const ROLES = ["system", "user", "assistant"] as const;
 export type MessageRole = (typeof ROLES)[number];
 
 export const Models = ["gpt-3.5-turbo", "gpt-4"] as const;
+export const TTSModels = ["tts-1", "tts-1-hd"] as const;
 export type ChatModel = ModelType;
 
 export interface MultimodalContent {
@@ -48,6 +49,25 @@ export interface LLMConfig {
   style?: DalleRequestPayload["style"];
 }
 
+export interface SpeechOptions {
+  model: string;
+  input: string;
+  voice: string;
+  response_format?: string;
+  speed?: number;
+  onController?: (controller: AbortController) => void;
+}
+
+export interface TranscriptionOptions {
+  model?: "whisper-1";
+  file: Blob;
+  language?: string;
+  prompt?: string;
+  response_format?: "json" | "text" | "srt" | "verbose_json" | "vtt";
+  temperature?: number;
+  onController?: (controller: AbortController) => void;
+}
+
 export interface ChatOptions {
   messages: RequestMessage[];
   config: LLMConfig;
@@ -80,6 +100,8 @@ export interface LLMModelProvider {
 
 export abstract class LLMApi {
   abstract chat(options: ChatOptions): Promise<void>;
+  abstract speech(options: SpeechOptions): Promise<ArrayBuffer>;
+  abstract transcription(options: TranscriptionOptions): Promise<string>;
   abstract usage(): Promise<LLMUsage>;
   abstract models(): Promise<LLMModel[]>;
 }
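The two methods above give every `LLMApi` implementation a uniform audio surface next to `chat`, `usage`, and `models`. A minimal sketch of how a caller might drive them — the round-trip helper itself is hypothetical, only the interfaces come from this hunk:

```ts
// Sketch only: assumes an LLMApi implementation such as the ChatGPTApi below.
async function audioRoundTrip(llm: LLMApi, recording: Blob) {
  // Speech-to-text: upload a recorded clip, get plain text back.
  const text = await llm.transcription({ file: recording });

  // Text-to-speech: synthesize the text into raw audio bytes.
  const audio: ArrayBuffer = await llm.speech({
    model: "tts-1",
    input: text,
    voice: "alloy",
  });
  return audio;
}
```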
@@ -26,6 +26,8 @@ import {
   LLMModel,
   LLMUsage,
   MultimodalContent,
+  SpeechOptions,
+  TranscriptionOptions,
 } from "../api";
 import Locale from "../../locales";
 import {
@@ -77,7 +79,7 @@ export interface DalleRequestPayload {
 export class ChatGPTApi implements LLMApi {
   private disableListModels = true;
 
-  path(path: string): string {
+  path(path: string, model?: string): string {
     const accessStore = useAccessStore.getState();
 
     let baseUrl = "";
@@ -140,6 +142,85 @@ export class ChatGPTApi implements LLMApi {
     return res.choices?.at(0)?.message?.content ?? res;
   }
 
+  async speech(options: SpeechOptions): Promise<ArrayBuffer> {
+    const requestPayload = {
+      model: options.model,
+      input: options.input,
+      voice: options.voice,
+      response_format: options.response_format,
+      speed: options.speed,
+    };
+
+    console.log("[Request] openai speech payload: ", requestPayload);
+
+    const controller = new AbortController();
+    options.onController?.(controller);
+
+    try {
+      const speechPath = this.path(OpenaiPath.SpeechPath, options.model);
+      const speechPayload = {
+        method: "POST",
+        body: JSON.stringify(requestPayload),
+        signal: controller.signal,
+        headers: getHeaders(),
+      };
+
+      // make a fetch request
+      const requestTimeoutId = setTimeout(
+        () => controller.abort(),
+        REQUEST_TIMEOUT_MS,
+      );
+
+      const res = await fetch(speechPath, speechPayload);
+      clearTimeout(requestTimeoutId);
+      return await res.arrayBuffer();
+    } catch (e) {
+      console.log("[Request] failed to make a speech request", e);
+      throw e;
+    }
+  }
+
+  async transcription(options: TranscriptionOptions): Promise<string> {
+    const formData = new FormData();
+    formData.append("file", options.file, "audio.wav");
+    formData.append("model", options.model ?? "whisper-1");
+    if (options.language) formData.append("language", options.language);
+    if (options.prompt) formData.append("prompt", options.prompt);
+    if (options.response_format)
+      formData.append("response_format", options.response_format);
+    if (options.temperature)
+      formData.append("temperature", options.temperature.toString());
+
+    console.log("[Request] openai audio transcriptions payload: ", options);
+
+    const controller = new AbortController();
+    options.onController?.(controller);
+
+    try {
+      const path = this.path(OpenaiPath.TranscriptionPath, options.model);
+      const headers = getHeaders(true);
+      const payload = {
+        method: "POST",
+        body: formData,
+        signal: controller.signal,
+        headers: headers,
+      };
+
+      // make a fetch request
+      const requestTimeoutId = setTimeout(
+        () => controller.abort(),
+        REQUEST_TIMEOUT_MS,
+      );
+      const res = await fetch(path, payload);
+      clearTimeout(requestTimeoutId);
+      const json = await res.json();
+      return json.text;
+    } catch (e) {
+      console.log("[Request] failed to make a audio transcriptions request", e);
+      throw e;
+    }
+  }
+
   async chat(options: ChatOptions) {
     const modelConfig = {
       ...useAppConfig.getState().modelConfig,
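Both methods reuse the class's timeout-abort pattern: an `AbortController` is exposed through `options.onController` so the UI can cancel, and a `REQUEST_TIMEOUT_MS` timer fires the same controller. `transcription` sends a `FormData` body with `getHeaders(true)`, presumably so no JSON `Content-Type` is set and the browser can supply the multipart boundary itself. A minimal sketch of producing the `Blob` this endpoint expects — the recording helper is hypothetical; the commit's actual capture logic lives in `../utils/speech`, which is not shown in this diff:

```ts
// Sketch only: record a few seconds of microphone audio into a Blob
// usable as TranscriptionOptions.file. Supported mime types vary by browser.
async function recordClip(ms: number): Promise<Blob> {
  const stream = await navigator.mediaDevices.getUserMedia({ audio: true });
  const recorder = new MediaRecorder(stream);
  const chunks: BlobPart[] = [];
  recorder.ondataavailable = (e) => chunks.push(e.data);
  const stopped = new Promise((r) => (recorder.onstop = r));
  recorder.start();
  await new Promise((r) => setTimeout(r, ms));
  recorder.stop();
  await stopped;
  stream.getTracks().forEach((t) => t.stop());
  return new Blob(chunks, { type: recorder.mimeType });
}
```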
@@ -10,11 +10,14 @@ import React, {
 } from "react";
 
 import SendWhiteIcon from "../icons/send-white.svg";
+import VoiceWhiteIcon from "../icons/voice-white.svg";
 import BrainIcon from "../icons/brain.svg";
 import RenameIcon from "../icons/rename.svg";
 import ExportIcon from "../icons/share.svg";
 import ReturnIcon from "../icons/return.svg";
 import CopyIcon from "../icons/copy.svg";
+import SpeakIcon from "../icons/speak.svg";
+import SpeakStopIcon from "../icons/speak-stop.svg";
 import LoadingIcon from "../icons/three-dots.svg";
 import LoadingButtonIcon from "../icons/loading.svg";
 import PromptIcon from "../icons/prompt.svg";
@@ -64,6 +67,7 @@ import {
   getMessageImages,
   isVisionModel,
   isDalle3,
+  isFirefox,
 } from "../utils";
 
 import { uploadImage as uploadImageRemote } from "@/app/utils/chat";
@@ -73,7 +77,7 @@ import dynamic from "next/dynamic";
 import { ChatControllerPool } from "../client/controller";
 import { DalleSize, DalleQuality, DalleStyle } from "../typing";
 import { Prompt, usePromptStore } from "../store/prompt";
-import Locale from "../locales";
+import Locale, { getLang, getSTTLang } from "../locales";
 
 import { IconButton } from "./button";
 import styles from "./chat.module.scss";
@@ -90,6 +94,10 @@ import {
 import { useNavigate } from "react-router-dom";
 import {
   CHAT_PAGE_SIZE,
+  DEFAULT_STT_ENGINE,
+  DEFAULT_TTS_ENGINE,
+  FIREFOX_DEFAULT_STT_ENGINE,
+  ModelProvider,
   LAST_INPUT_KEY,
   Path,
   REQUEST_TIMEOUT_MS,
@@ -106,6 +114,16 @@ import { ExportMessageModal } from "./exporter";
 import { getClientConfig } from "../config/client";
 import { useAllModels } from "../utils/hooks";
 import { MultimodalContent } from "../client/api";
+import { ClientApi } from "../client/api";
+import { createTTSPlayer } from "../utils/audio";
+import {
+  OpenAITranscriptionApi,
+  SpeechApi,
+  WebTranscriptionApi,
+} from "../utils/speech";
+import { MsEdgeTTS, OUTPUT_FORMAT } from "../utils/ms_edge_tts";
+
+const ttsPlayer = createTTSPlayer();
 
 const Markdown = dynamic(async () => (await import("./markdown")).Markdown, {
   loading: () => <LoadingIcon />,
@@ -922,6 +940,33 @@ function _Chat() {
     }
   };
 
+  const [isListening, setIsListening] = useState(false);
+  const [isTranscription, setIsTranscription] = useState(false);
+  const [speechApi, setSpeechApi] = useState<any>(null);
+
+  const startListening = async () => {
+    if (speechApi) {
+      await speechApi.start();
+      setIsListening(true);
+    }
+  };
+
+  const stopListening = async () => {
+    if (speechApi) {
+      if (config.sttConfig.engine !== DEFAULT_STT_ENGINE)
+        setIsTranscription(true);
+      await speechApi.stop();
+      setIsListening(false);
+    }
+  };
+
+  const onRecognitionEnd = (finalTranscript: string) => {
+    console.log(finalTranscript);
+    if (finalTranscript) setUserInput(finalTranscript);
+    if (config.sttConfig.engine !== DEFAULT_STT_ENGINE)
+      setIsTranscription(false);
+  };
+
   const doSubmit = (userInput: string) => {
     if (userInput.trim() === "") return;
     const matchCommand = chatCommands.match(userInput);
@@ -992,6 +1037,16 @@ function _Chat() {
       }
     });
     // eslint-disable-next-line react-hooks/exhaustive-deps
+    if (isFirefox()) config.sttConfig.engine = FIREFOX_DEFAULT_STT_ENGINE;
+    setSpeechApi(
+      config.sttConfig.engine === DEFAULT_STT_ENGINE
+        ? new WebTranscriptionApi((transcription) =>
+            onRecognitionEnd(transcription),
+          )
+        : new OpenAITranscriptionApi((transcription) =>
+            onRecognitionEnd(transcription),
+          ),
+    );
   }, []);
 
   // check if should send message
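`WebTranscriptionApi`, `OpenAITranscriptionApi`, and `SpeechApi` come from `../utils/speech`, which is not part of this diff. The Firefox override above exists because Firefox does not implement the `SpeechRecognition` Web API, so the Whisper engine is forced there. A rough sketch of what the Web-engine variant plausibly looks like — names and details are assumed, not taken from the commit:

```ts
// Sketch only: an approximation of a transcription API built on the
// browser's SpeechRecognition interface; the real class may differ.
class WebTranscriptionApiSketch {
  private recognition: any;
  private transcript = "";

  constructor(private onEnd: (finalTranscript: string) => void) {
    const SpeechRecognition =
      (window as any).SpeechRecognition ||
      (window as any).webkitSpeechRecognition;
    this.recognition = new SpeechRecognition();
    this.recognition.lang = getSTTLang(); // imported from "../locales" above
    this.recognition.continuous = true;
    this.recognition.interimResults = true;
    this.recognition.onresult = (e: any) => {
      this.transcript = Array.from(e.results)
        .map((r: any) => r[0].transcript)
        .join("");
    };
    this.recognition.onend = () => this.onEnd(this.transcript);
  }

  async start() {
    this.transcript = "";
    this.recognition.start();
  }

  async stop() {
    this.recognition.stop();
  }
}
```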
@@ -1102,10 +1157,55 @@ function _Chat() {
     });
   };
 
+  const accessStore = useAccessStore();
+  const [speechStatus, setSpeechStatus] = useState(false);
+  const [speechLoading, setSpeechLoading] = useState(false);
+  async function openaiSpeech(text: string) {
+    if (speechStatus) {
+      ttsPlayer.stop();
+      setSpeechStatus(false);
+    } else {
+      var api: ClientApi;
+      api = new ClientApi(ModelProvider.GPT);
+      const config = useAppConfig.getState();
+      setSpeechLoading(true);
+      ttsPlayer.init();
+      let audioBuffer: ArrayBuffer;
+      const { markdownToTxt } = require("markdown-to-txt");
+      const textContent = markdownToTxt(text);
+      if (config.ttsConfig.engine !== DEFAULT_TTS_ENGINE) {
+        const edgeVoiceName = accessStore.edgeVoiceName();
+        const tts = new MsEdgeTTS();
+        await tts.setMetadata(
+          edgeVoiceName,
+          OUTPUT_FORMAT.AUDIO_24KHZ_96KBITRATE_MONO_MP3,
+        );
+        audioBuffer = await tts.toArrayBuffer(textContent);
+      } else {
+        audioBuffer = await api.llm.speech({
+          model: config.ttsConfig.model,
+          input: textContent,
+          voice: config.ttsConfig.voice,
+          speed: config.ttsConfig.speed,
+        });
+      }
+      setSpeechStatus(true);
+      ttsPlayer
+        .play(audioBuffer, () => {
+          setSpeechStatus(false);
+        })
+        .catch((e) => {
+          console.error("[OpenAI Speech]", e);
+          showToast(prettyObject(e));
+          setSpeechStatus(false);
+        })
+        .finally(() => setSpeechLoading(false));
+    }
+  }
+
   const context: RenderMessage[] = useMemo(() => {
     return session.mask.hideContext ? [] : session.mask.context.slice();
   }, [session.mask.context, session.mask.hideContext]);
-  const accessStore = useAccessStore();
 
   if (
     context.length === 0 &&
@@ -1567,6 +1667,26 @@ function _Chat() {
                     )
                   }
                 />
+                {config.ttsConfig.enable && (
+                  <ChatAction
+                    text={
+                      speechStatus
+                        ? Locale.Chat.Actions.StopSpeech
+                        : Locale.Chat.Actions.Speech
+                    }
+                    loding={speechLoading}
+                    icon={
+                      speechStatus ? (
+                        <SpeakStopIcon />
+                      ) : (
+                        <SpeakIcon />
+                      )
+                    }
+                    onClick={() =>
+                      openaiSpeech(getMessageTextContent(message))
+                    }
+                  />
+                )}
               </>
             )}
           </div>
@@ -1714,13 +1834,35 @@ function _Chat() {
           })}
         </div>
       )}
-      <IconButton
+      {config.sttConfig.enable ? (
+        <IconButton
+          icon={<VoiceWhiteIcon />}
+          text={
+            isListening ? Locale.Chat.StopSpeak : Locale.Chat.StartSpeak
+          }
+          className={styles["chat-input-send"]}
+          type="primary"
+          onClick={async () =>
+            isListening ? await stopListening() : await startListening()
+          }
+          loding={isTranscription}
+        />
+      ) : (
+        <IconButton
+          icon={<SendWhiteIcon />}
+          text={Locale.Chat.Send}
+          className={styles["chat-input-send"]}
+          type="primary"
+          onClick={() => doSubmit(userInput)}
+        />
+      )}
+      {/* <IconButton
         icon={<SendWhiteIcon />}
         text={Locale.Chat.Send}
         className={styles["chat-input-send"]}
         type="primary"
         onClick={() => doSubmit(userInput)}
-      />
+      /> */}
     </label>
   </div>
@@ -80,6 +80,8 @@ import { useSyncStore } from "../store/sync";
 import { nanoid } from "nanoid";
 import { useMaskStore } from "../store/mask";
 import { ProviderType } from "../utils/cloud";
+import { TTSConfigList } from "./tts-config";
+import { STTConfigList } from "./stt-config";
 
 function EditPromptModal(props: { id: string; onClose: () => void }) {
   const promptStore = usePromptStore();
@@ -1646,6 +1648,28 @@ export function Settings() {
           <UserPromptModal onClose={() => setShowPromptModal(false)} />
         )}
 
+        <List>
+          <TTSConfigList
+            ttsConfig={config.ttsConfig}
+            updateConfig={(updater) => {
+              const ttsConfig = { ...config.ttsConfig };
+              updater(ttsConfig);
+              config.update((config) => (config.ttsConfig = ttsConfig));
+            }}
+          />
+        </List>
+
+        <List>
+          <STTConfigList
+            sttConfig={config.sttConfig}
+            updateConfig={(updater) => {
+              const sttConfig = { ...config.sttConfig };
+              updater(sttConfig);
+              config.update((config) => (config.sttConfig = sttConfig));
+            }}
+          />
+        </List>
+
         <DangerItems />
       </div>
     </ErrorBoundary>
@@ -0,0 +1,51 @@
+import { STTConfig, STTConfigValidator } from "../store";
+
+import Locale from "../locales";
+import { ListItem, Select } from "./ui-lib";
+import { DEFAULT_STT_ENGINES } from "../constant";
+import { isFirefox } from "../utils";
+
+export function STTConfigList(props: {
+  sttConfig: STTConfig;
+  updateConfig: (updater: (config: STTConfig) => void) => void;
+}) {
+  return (
+    <>
+      <ListItem
+        title={Locale.Settings.STT.Enable.Title}
+        subTitle={Locale.Settings.STT.Enable.SubTitle}
+      >
+        <input
+          type="checkbox"
+          checked={props.sttConfig.enable}
+          onChange={(e) =>
+            props.updateConfig(
+              (config) => (config.enable = e.currentTarget.checked),
+            )
+          }
+        ></input>
+      </ListItem>
+      {!isFirefox() && (
+        <ListItem title={Locale.Settings.STT.Engine.Title}>
+          <Select
+            value={props.sttConfig.engine}
+            onChange={(e) => {
+              props.updateConfig(
+                (config) =>
+                  (config.engine = STTConfigValidator.engine(
+                    e.currentTarget.value,
+                  )),
+              );
+            }}
+          >
+            {DEFAULT_STT_ENGINES.map((v, i) => (
+              <option value={v} key={i}>
+                {v}
+              </option>
+            ))}
+          </Select>
+        </ListItem>
+      )}
+    </>
+  );
+}
@@ -0,0 +1,119 @@
+@import "../styles/animation.scss";
+.plugin-page {
+  height: 100%;
+  display: flex;
+  flex-direction: column;
+
+  .plugin-page-body {
+    padding: 20px;
+    overflow-y: auto;
+
+    .plugin-filter {
+      width: 100%;
+      max-width: 100%;
+      margin-bottom: 20px;
+      animation: slide-in ease 0.3s;
+      height: 40px;
+
+      display: flex;
+
+      .search-bar {
+        flex-grow: 1;
+        max-width: 100%;
+        min-width: 0;
+        outline: none;
+      }
+
+      .search-bar:focus {
+        border: 1px solid var(--primary);
+      }
+
+      .plugin-filter-lang {
+        height: 100%;
+        margin-left: 10px;
+      }
+
+      .plugin-create {
+        height: 100%;
+        margin-left: 10px;
+        box-sizing: border-box;
+        min-width: 80px;
+      }
+    }
+
+    .plugin-item {
+      display: flex;
+      justify-content: space-between;
+      padding: 20px;
+      border: var(--border-in-light);
+      animation: slide-in ease 0.3s;
+
+      &:not(:last-child) {
+        border-bottom: 0;
+      }
+
+      &:first-child {
+        border-top-left-radius: 10px;
+        border-top-right-radius: 10px;
+      }
+
+      &:last-child {
+        border-bottom-left-radius: 10px;
+        border-bottom-right-radius: 10px;
+      }
+
+      .plugin-header {
+        display: flex;
+        align-items: center;
+
+        .plugin-icon {
+          display: flex;
+          align-items: center;
+          justify-content: center;
+          margin-right: 10px;
+        }
+
+        .plugin-title {
+          .plugin-name {
+            font-size: 14px;
+            font-weight: bold;
+          }
+          .plugin-info {
+            font-size: 12px;
+          }
+          .plugin-runtime-warning {
+            font-size: 12px;
+            color: #f86c6c;
+          }
+        }
+      }
+
+      .plugin-actions {
+        display: flex;
+        flex-wrap: nowrap;
+        transition: all ease 0.3s;
+        justify-content: center;
+        align-items: center;
+      }
+
+      @media screen and (max-width: 600px) {
+        display: flex;
+        flex-direction: column;
+        padding-bottom: 10px;
+        border-radius: 10px;
+        margin-bottom: 20px;
+        box-shadow: var(--card-shadow);
+
+        &:not(:last-child) {
+          border-bottom: var(--border-in-light);
+        }
+
+        .plugin-actions {
+          width: 100%;
+          justify-content: space-between;
+          padding-top: 10px;
+        }
+      }
+    }
+  }
+}
@@ -0,0 +1,132 @@
+import { PluginConfig, TTSConfig, TTSConfigValidator } from "../store";
+
+import Locale from "../locales";
+import { ListItem, Select } from "./ui-lib";
+import {
+  DEFAULT_TTS_ENGINE,
+  DEFAULT_TTS_ENGINES,
+  DEFAULT_TTS_MODELS,
+  DEFAULT_TTS_VOICES,
+} from "../constant";
+import { InputRange } from "./input-range";
+
+export function TTSConfigList(props: {
+  ttsConfig: TTSConfig;
+  updateConfig: (updater: (config: TTSConfig) => void) => void;
+}) {
+  return (
+    <>
+      <ListItem
+        title={Locale.Settings.TTS.Enable.Title}
+        subTitle={Locale.Settings.TTS.Enable.SubTitle}
+      >
+        <input
+          type="checkbox"
+          checked={props.ttsConfig.enable}
+          onChange={(e) =>
+            props.updateConfig(
+              (config) => (config.enable = e.currentTarget.checked),
+            )
+          }
+        ></input>
+      </ListItem>
+      {/* <ListItem
+        title={Locale.Settings.TTS.Autoplay.Title}
+        subTitle={Locale.Settings.TTS.Autoplay.SubTitle}
+      >
+        <input
+          type="checkbox"
+          checked={props.ttsConfig.autoplay}
+          onChange={(e) =>
+            props.updateConfig(
+              (config) => (config.autoplay = e.currentTarget.checked),
+            )
+          }
+        ></input>
+      </ListItem> */}
+      <ListItem title={Locale.Settings.TTS.Engine}>
+        <Select
+          value={props.ttsConfig.engine}
+          onChange={(e) => {
+            props.updateConfig(
+              (config) =>
+                (config.engine = TTSConfigValidator.engine(
+                  e.currentTarget.value,
+                )),
+            );
+          }}
+        >
+          {DEFAULT_TTS_ENGINES.map((v, i) => (
+            <option value={v} key={i}>
+              {v}
+            </option>
+          ))}
+        </Select>
+      </ListItem>
+      {props.ttsConfig.engine === DEFAULT_TTS_ENGINE && (
+        <>
+          <ListItem title={Locale.Settings.TTS.Model}>
+            <Select
+              value={props.ttsConfig.model}
+              onChange={(e) => {
+                props.updateConfig(
+                  (config) =>
+                    (config.model = TTSConfigValidator.model(
+                      e.currentTarget.value,
+                    )),
+                );
+              }}
+            >
+              {DEFAULT_TTS_MODELS.map((v, i) => (
+                <option value={v} key={i}>
+                  {v}
+                </option>
+              ))}
+            </Select>
+          </ListItem>
+          <ListItem
+            title={Locale.Settings.TTS.Voice.Title}
+            subTitle={Locale.Settings.TTS.Voice.SubTitle}
+          >
+            <Select
+              value={props.ttsConfig.voice}
+              onChange={(e) => {
+                props.updateConfig(
+                  (config) =>
+                    (config.voice = TTSConfigValidator.voice(
+                      e.currentTarget.value,
+                    )),
+                );
+              }}
+            >
+              {DEFAULT_TTS_VOICES.map((v, i) => (
+                <option value={v} key={i}>
+                  {v}
+                </option>
+              ))}
+            </Select>
+          </ListItem>
+          <ListItem
+            title={Locale.Settings.TTS.Speed.Title}
+            subTitle={Locale.Settings.TTS.Speed.SubTitle}
+          >
+            <InputRange
+              value={props.ttsConfig.speed?.toFixed(1)}
+              min="0.3"
+              max="4.0"
+              step="0.1"
+              onChange={(e) => {
+                props.updateConfig(
+                  (config) =>
+                    (config.speed = TTSConfigValidator.speed(
+                      e.currentTarget.valueAsNumber,
+                    )),
+                );
+              }}
+            ></InputRange>
+          </ListItem>
+        </>
+      )}
+    </>
+  );
+}
@@ -0,0 +1,119 @@
+@import "../styles/animation.scss";
+.plugin-page {
+  height: 100%;
+  display: flex;
+  flex-direction: column;
+
+  .plugin-page-body {
+    padding: 20px;
+    overflow-y: auto;
+
+    .plugin-filter {
+      width: 100%;
+      max-width: 100%;
+      margin-bottom: 20px;
+      animation: slide-in ease 0.3s;
+      height: 40px;
+
+      display: flex;
+
+      .search-bar {
+        flex-grow: 1;
+        max-width: 100%;
+        min-width: 0;
+        outline: none;
+      }
+
+      .search-bar:focus {
+        border: 1px solid var(--primary);
+      }
+
+      .plugin-filter-lang {
+        height: 100%;
+        margin-left: 10px;
+      }
+
+      .plugin-create {
+        height: 100%;
+        margin-left: 10px;
+        box-sizing: border-box;
+        min-width: 80px;
+      }
+    }
+
+    .plugin-item {
+      display: flex;
+      justify-content: space-between;
+      padding: 20px;
+      border: var(--border-in-light);
+      animation: slide-in ease 0.3s;
+
+      &:not(:last-child) {
+        border-bottom: 0;
+      }
+
+      &:first-child {
+        border-top-left-radius: 10px;
+        border-top-right-radius: 10px;
+      }
+
+      &:last-child {
+        border-bottom-left-radius: 10px;
+        border-bottom-right-radius: 10px;
+      }
+
+      .plugin-header {
+        display: flex;
+        align-items: center;
+
+        .plugin-icon {
+          display: flex;
+          align-items: center;
+          justify-content: center;
+          margin-right: 10px;
+        }
+
+        .plugin-title {
+          .plugin-name {
+            font-size: 14px;
+            font-weight: bold;
+          }
+          .plugin-info {
+            font-size: 12px;
+          }
+          .plugin-runtime-warning {
+            font-size: 12px;
+            color: #f86c6c;
+          }
+        }
+      }
+
+      .plugin-actions {
+        display: flex;
+        flex-wrap: nowrap;
+        transition: all ease 0.3s;
+        justify-content: center;
+        align-items: center;
+      }
+
+      @media screen and (max-width: 600px) {
+        display: flex;
+        flex-direction: column;
+        padding-bottom: 10px;
+        border-radius: 10px;
+        margin-bottom: 20px;
+        box-shadow: var(--card-shadow);
+
+        &:not(:last-child) {
+          border-bottom: var(--border-in-light);
+        }
+
+        .plugin-actions {
+          width: 100%;
+          justify-content: space-between;
+          padding-top: 10px;
+        }
+      }
+    }
+  }
+}
@@ -153,6 +153,8 @@ export const Anthropic = {
 
 export const OpenaiPath = {
   ChatPath: "v1/chat/completions",
+  SpeechPath: "v1/audio/speech",
+  TranscriptionPath: "v1/audio/transcriptions",
   ImagePath: "v1/images/generations",
   UsagePath: "dashboard/billing/usage",
   SubsPath: "dashboard/billing/subscription",
@@ -256,6 +258,24 @@ export const KnowledgeCutOffDate: Record<string, string> = {
   "gemini-pro-vision": "2023-12",
 };
 
+export const DEFAULT_TTS_ENGINE = "OpenAI-TTS";
+export const DEFAULT_TTS_ENGINES = ["OpenAI-TTS", "Edge-TTS"];
+export const DEFAULT_TTS_MODEL = "tts-1";
+export const DEFAULT_TTS_VOICE = "alloy";
+export const DEFAULT_TTS_MODELS = ["tts-1", "tts-1-hd"];
+export const DEFAULT_TTS_VOICES = [
+  "alloy",
+  "echo",
+  "fable",
+  "onyx",
+  "nova",
+  "shimmer",
+];
+
+export const DEFAULT_STT_ENGINE = "WebAPI";
+export const DEFAULT_STT_ENGINES = ["WebAPI", "OpenAI Whisper"];
+export const FIREFOX_DEFAULT_STT_ENGINE = "OpenAI Whisper";
+
 const openaiModels = [
   "gpt-3.5-turbo",
   "gpt-3.5-turbo-1106",
@@ -0,0 +1 @@
+<svg xmlns="http://www.w3.org/2000/svg" fill="none" width="16" height="16" viewBox="0 0 24 24" stroke-width="1.5" stroke="currentColor" class="w-4 h-4"><path stroke-linecap="round" stroke-linejoin="round" d="M17.25 9.75 19.5 12m0 0 2.25 2.25M19.5 12l2.25-2.25M19.5 12l-2.25 2.25m-10.5-6 4.72-4.72a.75.75 0 0 1 1.28.53v15.88a.75.75 0 0 1-1.28.53l-4.72-4.72H4.51c-.88 0-1.704-.507-1.938-1.354A9.009 9.009 0 0 1 2.25 12c0-.83.112-1.633.322-2.396C2.806 8.756 3.63 8.25 4.51 8.25H6.75Z"></path></svg>
@@ -0,0 +1 @@
+<svg xmlns="http://www.w3.org/2000/svg" fill="none" width="16" height="16" viewBox="0 0 24 24" stroke-width="1.5" stroke="currentColor" class="w-4 h-4"><path stroke-linecap="round" stroke-linejoin="round" d="M19.114 5.636a9 9 0 010 12.728M16.463 8.288a5.25 5.25 0 010 7.424M6.75 8.25l4.72-4.72a.75.75 0 011.28.53v15.88a.75.75 0 01-1.28.53l-4.72-4.72H4.51c-.88 0-1.704-.507-1.938-1.354A9.01 9.01 0 012.25 12c0-.83.112-1.633.322-2.396C2.806 8.756 3.63 8.25 4.51 8.25H6.75z"></path></svg>
@@ -0,0 +1,16 @@
+<svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" width="20" height="20" fill="none" viewBox="0 0 20 20">
+  <defs>
+    <rect id="path_0" width="20" height="20" x="0" y="0" />
+  </defs>
+  <g opacity="1" transform="translate(0 0) rotate(0 8 8)">
+    <mask id="bg-mask-0" fill="#fff">
+      <use xlink:href="#path_0" />
+    </mask>
+    <g mask="url(#bg-mask-0)">
+      <path d="M7 4a3 3 0 016 0v6a3 3 0 11-6 0V4z">
+      </path>
+      <path d="M5.5 9.643a.75.75 0 00-1.5 0V10c0 3.06 2.29 5.585 5.25 5.954V17.5h-1.5a.75.75 0 000 1.5h4.5a.75.75 0 000-1.5h-1.5v-1.546A6.001 6.001 0 0016 10v-.357a.75.75 0 00-1.5 0V10a4.5 4.5 0 01-9 0v-.357z">
+      </path>
+    </g>
+  </g>
+</svg>
@@ -43,6 +43,8 @@ const cn = {
       Delete: "删除",
      Edit: "编辑",
       FullScreen: "全屏",
+      Speech: "朗读",
+      StopSpeech: "停止",
     },
     Commands: {
       new: "新建聊天",
@@ -76,6 +78,8 @@ const cn = {
       return inputHints + ",/ 触发补全,: 触发命令";
     },
     Send: "发送",
+    StartSpeak: "说话",
+    StopSpeak: "停止",
     Config: {
       Reset: "清除记忆",
       SaveAs: "存为面具",
@@ -481,6 +485,36 @@ const cn = {
       Title: "频率惩罚度 (frequency_penalty)",
       SubTitle: "值越大,越有可能降低重复字词",
     },
+    TTS: {
+      Enable: {
+        Title: "启用文本转语音",
+        SubTitle: "启用文本生成语音服务",
+      },
+      Autoplay: {
+        Title: "启用自动朗读",
+        SubTitle: "自动生成语音并播放,需先开启文本转语音开关",
+      },
+      Model: "模型",
+      Engine: "转换引擎",
+      Voice: {
+        Title: "声音",
+        SubTitle: "生成语音时使用的声音",
+      },
+      Speed: {
+        Title: "速度",
+        SubTitle: "生成语音的速度",
+      },
+    },
+    STT: {
+      Enable: {
+        Title: "启用语音转文本",
+        SubTitle: "启用语音转文本",
+      },
+      Engine: {
+        Title: "转换引擎",
+        SubTitle: "音频转换引擎",
+      },
+    },
   },
   Store: {
     DefaultTopic: "新的聊天",
@@ -45,6 +45,8 @@ const en: LocaleType = {
       Delete: "Delete",
       Edit: "Edit",
       FullScreen: "FullScreen",
+      Speech: "Play",
+      StopSpeech: "Stop",
     },
     Commands: {
       new: "Start a new chat",
@@ -137,3 +137,34 @@ export function getISOLang() {
   const lang = getLang();
   return isoLangString[lang] ?? lang;
 }
+
+const DEFAULT_STT_LANG = "zh-CN";
+export const STT_LANG_MAP: Record<Lang, string> = {
+  cn: "zh-CN",
+  en: "en-US",
+  pt: "pt-BR",
+  tw: "zh-TW",
+  jp: "ja-JP",
+  ko: "ko-KR",
+  id: "id-ID",
+  fr: "fr-FR",
+  es: "es-ES",
+  it: "it-IT",
+  tr: "tr-TR",
+  de: "de-DE",
+  vi: "vi-VN",
+  ru: "ru-RU",
+  cs: "cs-CZ",
+  no: "no-NO",
+  ar: "ar-SA",
+  bn: "bn-BD",
+  sk: "sk-SK",
+};
+
+export function getSTTLang(): string {
+  try {
+    return STT_LANG_MAP[getLang()];
+  } catch {
+    return DEFAULT_STT_LANG;
+  }
+}
@@ -120,6 +120,9 @@ const DEFAULT_ACCESS_STATE = {
   disableFastLink: false,
   customModels: "",
   defaultModel: "",
+
+  // tts config
+  edgeTTSVoiceName: "zh-CN-YunxiNeural",
 };
 
 export const useAccessStore = createPersistStore(
@@ -132,6 +135,12 @@ export const useAccessStore = createPersistStore(
       return get().needCode;
     },
 
+    edgeVoiceName() {
+      this.fetch();
+
+      return get().edgeTTSVoiceName;
+    },
+
     isValidOpenAI() {
       return ensure(get(), ["openaiApiKey"]);
     },
@@ -5,12 +5,25 @@ import {
   DEFAULT_INPUT_TEMPLATE,
   DEFAULT_MODELS,
   DEFAULT_SIDEBAR_WIDTH,
+  DEFAULT_STT_ENGINE,
+  DEFAULT_STT_ENGINES,
+  DEFAULT_TTS_ENGINE,
+  DEFAULT_TTS_ENGINES,
+  DEFAULT_TTS_MODEL,
+  DEFAULT_TTS_MODELS,
+  DEFAULT_TTS_VOICE,
+  DEFAULT_TTS_VOICES,
   StoreKey,
   ServiceProvider,
 } from "../constant";
 import { createPersistStore } from "../utils/store";
 
 export type ModelType = (typeof DEFAULT_MODELS)[number]["name"];
+export type TTSModelType = (typeof DEFAULT_TTS_MODELS)[number];
+export type TTSVoiceType = (typeof DEFAULT_TTS_VOICES)[number];
+export type TTSEngineType = (typeof DEFAULT_TTS_ENGINES)[number];
+
+export type STTEngineType = (typeof DEFAULT_STT_ENGINES)[number];
 
 export enum SubmitKey {
   Enter = "Enter",
@@ -66,11 +79,26 @@ export const DEFAULT_CONFIG = {
     quality: "standard" as DalleQuality,
     style: "vivid" as DalleStyle,
   },
+
+  ttsConfig: {
+    enable: false,
+    autoplay: false,
+    engine: DEFAULT_TTS_ENGINE,
+    model: DEFAULT_TTS_MODEL,
+    voice: DEFAULT_TTS_VOICE,
+    speed: 1.0,
+  },
+  sttConfig: {
+    enable: false,
+    engine: DEFAULT_STT_ENGINE,
+  },
 };
 
 export type ChatConfig = typeof DEFAULT_CONFIG;
 
 export type ModelConfig = ChatConfig["modelConfig"];
+export type TTSConfig = ChatConfig["ttsConfig"];
+export type STTConfig = ChatConfig["sttConfig"];
 
 export function limitNumber(
   x: number,
@@ -85,6 +113,27 @@ export function limitNumber(
   return Math.min(max, Math.max(min, x));
 }
 
+export const TTSConfigValidator = {
+  engine(x: string) {
+    return x as TTSEngineType;
+  },
+  model(x: string) {
+    return x as TTSModelType;
+  },
+  voice(x: string) {
+    return x as TTSVoiceType;
+  },
+  speed(x: number) {
+    return limitNumber(x, 0.25, 4.0, 1.0);
+  },
+};
+
+export const STTConfigValidator = {
+  engine(x: string) {
+    return x as STTEngineType;
+  },
+};
+
 export const ModalConfigValidator = {
   model(x: string) {
     return x as ModelType;
@@ -0,0 +1,45 @@
+type TTSPlayer = {
+  init: () => void;
+  play: (audioBuffer: ArrayBuffer, onended: () => void | null) => Promise<void>;
+  stop: () => void;
+};
+
+export function createTTSPlayer(): TTSPlayer {
+  let audioContext: AudioContext | null = null;
+  let audioBufferSourceNode: AudioBufferSourceNode | null = null;
+
+  const init = () => {
+    audioContext = new (window.AudioContext || window.webkitAudioContext)();
+    audioContext.suspend();
+  };
+
+  const play = async (audioBuffer: ArrayBuffer, onended: () => void | null) => {
+    if (audioBufferSourceNode) {
+      audioBufferSourceNode.stop();
+      audioBufferSourceNode.disconnect();
+    }
+
+    const buffer = await audioContext!.decodeAudioData(audioBuffer);
+    audioBufferSourceNode = audioContext!.createBufferSource();
+    audioBufferSourceNode.buffer = buffer;
+    audioBufferSourceNode.connect(audioContext!.destination);
+    audioContext!.resume().then(() => {
+      audioBufferSourceNode!.start();
+    });
+    audioBufferSourceNode.onended = onended;
+  };
+
+  const stop = () => {
+    if (audioBufferSourceNode) {
+      audioBufferSourceNode.stop();
+      audioBufferSourceNode.disconnect();
+      audioBufferSourceNode = null;
+    }
+    if (audioContext) {
+      audioContext.close();
+      audioContext = null;
+    }
+  };
+
+  return { init, play, stop };
+}
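The player decodes a complete `ArrayBuffer` through the Web Audio API instead of an `<audio>` element, which is why `openaiSpeech` in chat.tsx can hand it either an OpenAI response or an Edge-TTS buffer unchanged. Creating the `AudioContext` in `init()` (called from the click handler) keeps it inside a user gesture, which satisfies browser autoplay policies; `suspend()` holds it silent until `resume()` runs in `play`. A short usage sketch:

```ts
// Sketch only: drive the player with an already-fetched audio buffer.
const player = createTTSPlayer();

async function speak(buf: ArrayBuffer) {
  player.init(); // must happen in response to a user gesture
  await player.play(buf, () => console.log("playback finished"));
}
```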
@ -0,0 +1,391 @@
|
||||||
|
// import axios from "axios";
|
||||||
|
import { Buffer } from "buffer";
|
||||||
|
import { randomBytes } from "crypto";
|
||||||
|
import { Readable } from "stream";
|
||||||
|
|
||||||
|
// Modified according to https://github.com/Migushthe2nd/MsEdgeTTS
|
||||||
|
|
||||||
|
/**
|
||||||
|
* https://learn.microsoft.com/en-us/azure/ai-services/speech-service/speech-synthesis-markup-voice#:~:text=Optional-,volume,-Indicates%20the%20volume
|
||||||
|
*/
|
||||||
|
export enum VOLUME {
|
||||||
|
SILENT = "silent",
|
||||||
|
X_SOFT = "x-soft",
|
||||||
|
SOFT = "soft",
|
||||||
|
MEDIUM = "medium",
|
||||||
|
LOUD = "loud",
|
||||||
|
X_LOUD = "x-LOUD",
|
||||||
|
DEFAULT = "default",
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* https://learn.microsoft.com/en-us/azure/ai-services/speech-service/speech-synthesis-markup-voice#:~:text=Optional-,rate,-Indicates%20the%20speaking
|
||||||
|
*/
|
||||||
|
export enum RATE {
|
||||||
|
X_SLOW = "x-slow",
|
||||||
|
SLOW = "slow",
|
||||||
|
MEDIUM = "medium",
|
||||||
|
FAST = "fast",
|
||||||
|
X_FAST = "x-fast",
|
||||||
|
DEFAULT = "default",
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* https://learn.microsoft.com/en-us/azure/ai-services/speech-service/speech-synthesis-markup-voice#:~:text=Optional-,pitch,-Indicates%20the%20baseline
|
||||||
|
*/
|
||||||
|
export enum PITCH {
|
||||||
|
X_LOW = "x-low",
|
||||||
|
LOW = "low",
|
||||||
|
MEDIUM = "medium",
|
||||||
|
HIGH = "high",
|
||||||
|
X_HIGH = "x-high",
|
||||||
|
DEFAULT = "default",
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Only a few of the [possible formats](https://docs.microsoft.com/en-us/azure/cognitive-services/speech-service/rest-text-to-speech#audio-outputs) are accepted.
|
||||||
|
*/
|
||||||
|
export enum OUTPUT_FORMAT {
|
||||||
|
// Streaming =============================
|
||||||
|
// AMR_WB_16000HZ = "amr-wb-16000hz",
|
||||||
|
// AUDIO_16KHZ_16BIT_32KBPS_MONO_OPUS = "audio-16khz-16bit-32kbps-mono-opus",
|
||||||
|
// AUDIO_16KHZ_32KBITRATE_MONO_MP3 = "audio-16khz-32kbitrate-mono-mp3",
|
||||||
|
// AUDIO_16KHZ_64KBITRATE_MONO_MP3 = "audio-16khz-64kbitrate-mono-mp3",
|
||||||
|
// AUDIO_16KHZ_128KBITRATE_MONO_MP3 = "audio-16khz-128kbitrate-mono-mp3",
|
||||||
|
// AUDIO_24KHZ_16BIT_24KBPS_MONO_OPUS = "audio-24khz-16bit-24kbps-mono-opus",
|
||||||
|
// AUDIO_24KHZ_16BIT_48KBPS_MONO_OPUS = "audio-24khz-16bit-48kbps-mono-opus",
|
||||||
|
AUDIO_24KHZ_48KBITRATE_MONO_MP3 = "audio-24khz-48kbitrate-mono-mp3",
|
||||||
|
AUDIO_24KHZ_96KBITRATE_MONO_MP3 = "audio-24khz-96kbitrate-mono-mp3",
|
||||||
|
// AUDIO_24KHZ_160KBITRATE_MONO_MP3 = "audio-24khz-160kbitrate-mono-mp3",
|
||||||
|
// AUDIO_48KHZ_96KBITRATE_MONO_MP3 = "audio-48khz-96kbitrate-mono-mp3",
|
||||||
|
// AUDIO_48KHZ_192KBITRATE_MONO_MP3 = "audio-48khz-192kbitrate-mono-mp3",
|
||||||
|
// OGG_16KHZ_16BIT_MONO_OPUS = "ogg-16khz-16bit-mono-opus",
|
||||||
|
// OGG_24KHZ_16BIT_MONO_OPUS = "ogg-24khz-16bit-mono-opus",
|
||||||
|
// OGG_48KHZ_16BIT_MONO_OPUS = "ogg-48khz-16bit-mono-opus",
|
||||||
|
// RAW_8KHZ_8BIT_MONO_ALAW = "raw-8khz-8bit-mono-alaw",
|
||||||
|
// RAW_8KHZ_8BIT_MONO_MULAW = "raw-8khz-8bit-mono-mulaw",
|
||||||
|
// RAW_8KHZ_16BIT_MONO_PCM = "raw-8khz-16bit-mono-pcm",
|
||||||
|
// RAW_16KHZ_16BIT_MONO_PCM = "raw-16khz-16bit-mono-pcm",
|
||||||
|
// RAW_16KHZ_16BIT_MONO_TRUESILK = "raw-16khz-16bit-mono-truesilk",
|
||||||
|
// RAW_22050HZ_16BIT_MONO_PCM = "raw-22050hz-16bit-mono-pcm",
|
||||||
|
// RAW_24KHZ_16BIT_MONO_PCM = "raw-24khz-16bit-mono-pcm",
|
||||||
|
// RAW_24KHZ_16BIT_MONO_TRUESILK = "raw-24khz-16bit-mono-truesilk",
|
||||||
|
// RAW_44100HZ_16BIT_MONO_PCM = "raw-44100hz-16bit-mono-pcm",
|
||||||
|
// RAW_48KHZ_16BIT_MONO_PCM = "raw-48khz-16bit-mono-pcm",
|
||||||
|
// WEBM_16KHZ_16BIT_MONO_OPUS = "webm-16khz-16bit-mono-opus",
|
||||||
|
// WEBM_24KHZ_16BIT_24KBPS_MONO_OPUS = "webm-24khz-16bit-24kbps-mono-opus",
|
||||||
|
WEBM_24KHZ_16BIT_MONO_OPUS = "webm-24khz-16bit-mono-opus",
|
||||||
|
// Non-streaming =============================
|
||||||
|
// RIFF_8KHZ_8BIT_MONO_ALAW = "riff-8khz-8bit-mono-alaw",
|
||||||
|
// RIFF_8KHZ_8BIT_MONO_MULAW = "riff-8khz-8bit-mono-mulaw",
|
||||||
|
// RIFF_8KHZ_16BIT_MONO_PCM = "riff-8khz-16bit-mono-pcm",
|
||||||
|
// RIFF_22050HZ_16BIT_MONO_PCM = "riff-22050hz-16bit-mono-pcm",
|
||||||
|
// RIFF_24KHZ_16BIT_MONO_PCM = "riff-24khz-16bit-mono-pcm",
|
||||||
|
// RIFF_44100HZ_16BIT_MONO_PCM = "riff-44100hz-16bit-mono-pcm",
|
||||||
|
// RIFF_48KHZ_16BIT_MONO_PCM = "riff-48khz-16bit-mono-pcm",
|
||||||
|
}
|
||||||
|
|
||||||
|
export type Voice = {
|
||||||
|
Name: string;
|
||||||
|
ShortName: string;
|
||||||
|
Gender: string;
|
||||||
|
Locale: string;
|
||||||
|
SuggestedCodec: string;
|
||||||
|
FriendlyName: string;
|
||||||
|
Status: string;
|
||||||
|
};
|
||||||
|
|
||||||
|
export class ProsodyOptions {
|
||||||
|
/**
|
||||||
|
* The pitch to use.
|
||||||
|
* Can be any {@link PITCH}, or a relative frequency in Hz (+50Hz), a relative semitone (+2st), or a relative percentage (+50%).
|
||||||
|
* [SSML documentation](https://learn.microsoft.com/en-us/azure/ai-services/speech-service/speech-synthesis-markup-voice#:~:text=Optional-,pitch,-Indicates%20the%20baseline)
|
||||||
|
*/
|
||||||
|
pitch?: PITCH | string = "+0Hz";
|
||||||
|
/**
|
||||||
|
* The rate to use.
|
||||||
|
* Can be any {@link RATE}, or a relative number (0.5), or string with a relative percentage (+50%).
|
||||||
|
* [SSML documentation](https://learn.microsoft.com/en-us/azure/ai-services/speech-service/speech-synthesis-markup-voice#:~:text=Optional-,rate,-Indicates%20the%20speaking)
|
||||||
|
*/
|
||||||
|
rate?: RATE | string | number = 1.0;
|
||||||
|
/**
|
||||||
|
* The volume to use.
|
||||||
|
* Can be any {@link VOLUME}, or an absolute number (0, 100), a string with a relative number (+50), or a relative percentage (+50%).
|
||||||
|
* [SSML documentation](https://learn.microsoft.com/en-us/azure/ai-services/speech-service/speech-synthesis-markup-voice#:~:text=Optional-,volume,-Indicates%20the%20volume)
|
||||||
|
*/
|
||||||
|
volume?: VOLUME | string | number = 100.0;
|
||||||
|
}
|
||||||
|
|
||||||
|
export class MsEdgeTTS {
|
||||||
|
static OUTPUT_FORMAT = OUTPUT_FORMAT;
|
||||||
|
private static TRUSTED_CLIENT_TOKEN = "6A5AA1D4EAFF4E9FB37E23D68491D6F4";
|
||||||
|
private static VOICES_URL = `https://speech.platform.bing.com/consumer/speech/synthesize/readaloud/voices/list?trustedclienttoken=${MsEdgeTTS.TRUSTED_CLIENT_TOKEN}`;
|
||||||
|
private static SYNTH_URL = `wss://speech.platform.bing.com/consumer/speech/synthesize/readaloud/edge/v1?TrustedClientToken=${MsEdgeTTS.TRUSTED_CLIENT_TOKEN}`;
|
||||||
|
private static BINARY_DELIM = "Path:audio\r\n";
|
||||||
|
private static VOICE_LANG_REGEX = /\w{2}-\w{2}/;
|
||||||
|
private readonly _enableLogger;
|
||||||
|
private _ws: WebSocket | undefined;
|
||||||
|
private _voice: any;
|
||||||
|
private _voiceLocale: any;
|
||||||
|
private _outputFormat: any;
|
||||||
|
private _streams: { [key: string]: Readable } = {};
|
||||||
|
private _startTime = 0;
|
||||||
|
|
||||||
|
private _log(...o: any[]) {
|
||||||
|
if (this._enableLogger) {
|
||||||
|
console.log(...o);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Create a new `MsEdgeTTS` instance.
|
||||||
|
*
|
||||||
|
* @param agent (optional, **NOT SUPPORTED IN BROWSER**) Use a custom http.Agent implementation like [https-proxy-agent](https://github.com/TooTallNate/proxy-agents) or [socks-proxy-agent](https://github.com/TooTallNate/proxy-agents/tree/main/packages/socks-proxy-agent).
|
||||||
|
* @param enableLogger=false whether to enable the built-in logger. This logs connections inits, disconnects, and incoming data to the console
|
||||||
|
*/
|
||||||
|
public constructor(enableLogger: boolean = false) {
|
||||||
|
this._enableLogger = enableLogger;
|
||||||
|
}
|
||||||
|
|
||||||
|
private async _send(message: any) {
|
||||||
|
for (let i = 1; i <= 3 && this._ws!.readyState !== this._ws!.OPEN; i++) {
|
||||||
|
if (i == 1) {
|
||||||
|
this._startTime = Date.now();
|
||||||
|
}
|
||||||
|
this._log("connecting: ", i);
|
||||||
|
await this._initClient();
|
||||||
|
}
|
||||||
|
this._ws!.send(message);
|
||||||
|
}
|
||||||
|
|
||||||
|
private _initClient() {
|
||||||
|
this._ws = new WebSocket(MsEdgeTTS.SYNTH_URL);
|
||||||
|
|
||||||
|
this._ws.binaryType = "arraybuffer";
|
||||||
|
return new Promise((resolve, reject) => {
|
||||||
|
this._ws!.onopen = () => {
|
||||||
|
this._log(
|
||||||
|
"Connected in",
|
||||||
|
(Date.now() - this._startTime) / 1000,
|
||||||
|
"seconds",
|
||||||
|
);
|
||||||
|
this._send(
|
||||||
|
`Content-Type:application/json; charset=utf-8\r\nPath:speech.config\r\n\r\n
|
||||||
|
{
|
||||||
|
"context": {
|
||||||
|
"synthesis": {
|
||||||
|
"audio": {
|
||||||
|
"metadataoptions": {
|
||||||
|
"sentenceBoundaryEnabled": "false",
|
||||||
|
"wordBoundaryEnabled": "false"
|
||||||
|
},
|
||||||
|
"outputFormat": "${this._outputFormat}"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
`,
|
||||||
|
).then(resolve);
|
||||||
|
};
|
||||||
|
this._ws!.onmessage = (m: any) => {
|
||||||
|
const buffer = Buffer.from(m.data as ArrayBuffer);
|
||||||
|
const message = buffer.toString();
|
||||||
|
const requestId = /X-RequestId:(.*?)\r\n/gm.exec(message)![1];
|
||||||
|
if (message.includes("Path:turn.start")) {
|
||||||
|
// start of turn, ignore
|
||||||
|
} else if (message.includes("Path:turn.end")) {
|
||||||
|
// end of turn, close stream
|
||||||
|
this._streams[requestId].push(null);
|
||||||
|
} else if (message.includes("Path:response")) {
|
||||||
|
// context response, ignore
|
||||||
|
} else if (
|
||||||
|
message.includes("Path:audio") &&
|
||||||
|
m.data instanceof ArrayBuffer
|
||||||
|
) {
|
||||||
|
this._pushAudioData(buffer, requestId);
|
||||||
|
} else {
|
||||||
|
this._log("UNKNOWN MESSAGE", message);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
this._ws!.onclose = () => {
|
||||||
|
this._log(
|
||||||
|
"disconnected after:",
|
||||||
|
(Date.now() - this._startTime) / 1000,
|
||||||
|
"seconds",
|
||||||
|
);
|
||||||
|
for (const requestId in this._streams) {
|
||||||
|
this._streams[requestId].push(null);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
this._ws!.onerror = function (error: any) {
|
||||||
|
reject("Connect Error: " + error);
|
||||||
|
};
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
private _pushAudioData(audioBuffer: Buffer, requestId: string) {
|
||||||
|
const audioStartIndex =
|
||||||
|
audioBuffer.indexOf(MsEdgeTTS.BINARY_DELIM) +
|
||||||
|
MsEdgeTTS.BINARY_DELIM.length;
|
||||||
|
const audioData = audioBuffer.subarray(audioStartIndex);
|
||||||
|
this._streams[requestId].push(audioData);
|
||||||
|
this._log("received audio chunk, size: ", audioData?.length);
|
||||||
|
}
|
||||||
|
|
||||||
|
private _SSMLTemplate(input: string, options: ProsodyOptions = {}): string {
|
||||||
|
// in case future updates to the edge API block these elements, we'll be concatenating strings.
|
||||||
|
options = { ...new ProsodyOptions(), ...options };
|
||||||
|
return `<speak version="1.0" xmlns="http://www.w3.org/2001/10/synthesis" xmlns:mstts="https://www.w3.org/2001/mstts" xml:lang="${this._voiceLocale}">
|
||||||
|
<voice name="${this._voice}">
|
||||||
|
<prosody pitch="${options.pitch}" rate="${options.rate}" volume="${options.volume}">
|
||||||
|
${input}
|
||||||
|
</prosody>
|
||||||
|
</voice>
|
||||||
|
</speak>`;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Fetch the list of voices available in Microsoft Edge.
|
||||||
|
* These, however, are not all. The complete list of voices supported by this module [can be found here](https://docs.microsoft.com/en-us/azure/cognitive-services/speech-service/language-support) (neural, standard, and preview).
|
||||||
|
*/
|
||||||
|
// getVoices(): Promise<Voice[]> {
|
||||||
|
// return new Promise((resolve, reject) => {
|
||||||
|
// axios
|
||||||
|
// .get(MsEdgeTTS.VOICES_URL)
|
||||||
|
// .then((res) => resolve(res.data))
|
||||||
|
// .catch(reject);
|
||||||
|
// });
|
||||||
|
// }
|
||||||
|
getVoices(): Promise<Voice[]> {
|
||||||
|
return fetch(MsEdgeTTS.VOICES_URL)
|
||||||
|
.then((response) => {
|
||||||
|
if (!response.ok) {
|
||||||
|
throw new Error("Network response was not ok");
|
||||||
|
}
|
||||||
|
return response.json();
|
||||||
|
})
|
||||||
|
.then((data) => data as Voice[])
|
||||||
|
.catch((error) => {
|
||||||
|
throw error;
|
||||||
|
});
|
||||||
|
}
  /**
   * Sets the required information for the speech to be synthesised and initialises a new WebSocket connection.
   * Must be called at least once before text can be synthesised.
   * Saved in this instance. Can be called at any time to update the metadata.
   *
   * @param voiceName a string with any `ShortName`. A list of all available neural voices can be found [here](https://docs.microsoft.com/en-us/azure/cognitive-services/speech-service/language-support#neural-voices). However, it is not limited to neural voices: standard voices can also be used. A list of standard voices can be found [here](https://docs.microsoft.com/en-us/azure/cognitive-services/speech-service/language-support#standard-voices)
   * @param outputFormat any {@link OUTPUT_FORMAT}
   * @param voiceLocale (optional) any voice locale that is supported by the voice. See the list of all voices for compatibility. If not provided, the locale will be inferred from the `voiceName`
   */
  async setMetadata(
    voiceName: string,
    outputFormat: OUTPUT_FORMAT,
    voiceLocale?: string,
  ) {
    const oldVoice = this._voice;
    const oldVoiceLocale = this._voiceLocale;
    const oldOutputFormat = this._outputFormat;

    this._voice = voiceName;
    this._voiceLocale = voiceLocale;
    if (!this._voiceLocale) {
      const voiceLangMatch = MsEdgeTTS.VOICE_LANG_REGEX.exec(this._voice);
      if (!voiceLangMatch)
        throw new Error("Could not infer voiceLocale from voiceName!");
      this._voiceLocale = voiceLangMatch[0];
    }
    this._outputFormat = outputFormat;

    const changed =
      oldVoice !== this._voice ||
      oldVoiceLocale !== this._voiceLocale ||
      oldOutputFormat !== this._outputFormat;

    // create a new client if anything changed or the socket is not open
    // (guard against `_ws` being undefined before the first _initClient call)
    if (changed || !this._ws || this._ws.readyState !== this._ws.OPEN) {
      this._startTime = Date.now();
      await this._initClient();
    }
  }
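A hedged call sketch; the voice name and the OUTPUT_FORMAT member are example values, not ones fixed by this commit:

// Sketch: configure voice and output format once before synthesising.
// "en-US-JennyNeural" and AUDIO_24KHZ_96KBITRATE_MONO_MP3 are assumed examples.
await tts.setMetadata(
  "en-US-JennyNeural",
  OUTPUT_FORMAT.AUDIO_24KHZ_96KBITRATE_MONO_MP3,
);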
  private _metadataCheck() {
    if (!this._ws)
      throw new Error(
        "Speech synthesis not configured yet. Run setMetadata before calling toStream or toFile.",
      );
  }

  /**
   * Close the WebSocket connection.
   */
  close() {
    // optional chaining avoids a crash if close() is called before setMetadata()
    this._ws?.close();
  }
  /**
   * Writes raw audio synthesised from text in real-time to a {@link Readable}. Uses a basic {@link _SSMLTemplate SSML template}.
   *
   * @param input the text to synthesise. Can include SSML elements.
   * @param options (optional) {@link ProsodyOptions}
   * @returns {Readable} - a `stream.Readable` with the audio data
   */
  toStream(input: string, options?: ProsodyOptions): Readable {
    const { stream } = this._rawSSMLRequest(this._SSMLTemplate(input, options));
    return stream;
  }
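A minimal sketch of consuming the stream, assuming setMetadata has already succeeded:

// Sketch: play or persist audio chunks as they arrive.
const audio = tts.toStream("Hello, world!");
audio.on("data", (chunk: Buffer) => {
  console.log("received", chunk.length, "bytes");
});
audio.on("end", () => console.log("synthesis finished"));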
  toArrayBuffer(input: string, options?: ProsodyOptions): Promise<ArrayBuffer> {
    return new Promise((resolve, reject) => {
      let data: Uint8Array[] = [];
      const readable = this.toStream(input, options);
      readable.on("data", (chunk) => {
        data.push(chunk);
      });

      readable.on("end", () => {
        // Buffer.concat may hand back a pooled Buffer whose underlying
        // ArrayBuffer is shared and offset, so slice out the exact bytes.
        const buf = Buffer.concat(data);
        resolve(buf.buffer.slice(buf.byteOffset, buf.byteOffset + buf.byteLength));
      });

      readable.on("error", (err) => {
        reject(err);
      });
    });
  }
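And the buffered counterpart, for callers that want the whole utterance at once; the rate value is an illustrative ProsodyOptions assumption:

// Sketch: collect the full utterance into one ArrayBuffer.
const buffer = await tts.toArrayBuffer("Hello, world!", { rate: 1.2 });
console.log("total bytes:", buffer.byteLength);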
  /**
   * Writes raw audio synthesised from a request in real-time to a {@link Readable}. Has no SSML template. Basic SSML should be provided in the request.
   *
   * @param requestSSML the SSML to send. SSML elements are required for the request to work.
   * @returns {Readable} - a `stream.Readable` with the audio data
   */
  rawToStream(requestSSML: string): Readable {
    const { stream } = this._rawSSMLRequest(requestSSML);
    return stream;
  }
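A sketch of the raw variant with hand-written SSML; the markup mirrors what _SSMLTemplate produces, and the voice name is an assumed example:

// Sketch: bypass the template and send custom SSML directly.
const raw = tts.rawToStream(
  `<speak version="1.0" xmlns="http://www.w3.org/2001/10/synthesis" xml:lang="en-US">
    <voice name="en-US-JennyNeural">Custom SSML goes here</voice>
  </speak>`,
);
raw.on("data", (chunk: Buffer) => console.log("received", chunk.length, "bytes"));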
  private _rawSSMLRequest(requestSSML: string): {
    stream: Readable;
    requestId: string;
  } {
    this._metadataCheck();

    const requestId = randomBytes(16).toString("hex");
    const request =
      `X-RequestId:${requestId}\r\nContent-Type:application/ssml+xml\r\nPath:ssml\r\n\r\n
      ` + requestSSML.trim();
    // https://docs.microsoft.com/en-us/azure/cognitive-services/speech-service/speech-synthesis-markup
    const self = this;
    const stream = new Readable({
      read() {},
      destroy(error: Error | null, callback: (error: Error | null) => void) {
        delete self._streams[requestId];
        callback(error);
      },
    });
    this._streams[requestId] = stream;
    this._send(request).then();
    return { stream, requestId };
  }
}
@ -0,0 +1,126 @@
import { ChatGPTApi } from "../client/platforms/openai";
import { getSTTLang } from "../locales";
import { isFirefox } from "../utils";

export type TranscriptionCallback = (transcription: string) => void;

export abstract class SpeechApi {
  protected onTranscription: TranscriptionCallback = () => {};

  abstract isListening(): boolean;
  abstract start(): Promise<void>;
  abstract stop(): Promise<void>;

  onTranscriptionReceived(callback: TranscriptionCallback) {
    this.onTranscription = callback;
  }
}
export class OpenAITranscriptionApi extends SpeechApi {
  private listeningStatus = false;
  private mediaRecorder: MediaRecorder | null = null;
  private stream: MediaStream | null = null;
  private audioChunks: Blob[] = [];

  isListening = () => this.listeningStatus;

  constructor(transcriptionCallback?: TranscriptionCallback) {
    super();
    if (transcriptionCallback) {
      this.onTranscriptionReceived(transcriptionCallback);
    }
  }

  async start(): Promise<void> {
    // @ts-ignore
    navigator.getUserMedia =
      // @ts-ignore
      navigator.getUserMedia ||
      // @ts-ignore
      navigator.webkitGetUserMedia ||
      // @ts-ignore
      navigator.mozGetUserMedia ||
      // @ts-ignore
      navigator.msGetUserMedia;
    if (navigator.mediaDevices) {
      const stream = await navigator.mediaDevices.getUserMedia({ audio: true });
      this.mediaRecorder = new MediaRecorder(stream);
      this.mediaRecorder.ondataavailable = (e) => {
        if (e.data && e.data.size > 0) {
          this.audioChunks.push(e.data);
        }
      };

      this.stream = stream;
    } else {
      console.warn("MediaDevices is only available in secure (HTTPS) contexts");
      return;
    }

    this.audioChunks = [];

    // this.recorder.addEventListener("dataavailable", (event) => {
    //   this.audioChunks.push(event.data);
    // });

    this.mediaRecorder.start(1000);
    this.listeningStatus = true;
  }

  async stop(): Promise<void> {
    if (!this.mediaRecorder || !this.listeningStatus) {
      return;
    }

    return new Promise((resolve) => {
      this.mediaRecorder!.addEventListener("stop", async () => {
        const audioBlob = new Blob(this.audioChunks, { type: "audio/wav" });
        const llm = new ChatGPTApi();
        const transcription = await llm.transcription({ file: audioBlob });
        this.onTranscription(transcription);
        this.listeningStatus = false;
        resolve();
      });

      this.mediaRecorder!.stop();
    });
  }
}
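A usage sketch for the recorder-based path (hypothetical wiring, e.g. behind a push-to-talk button):

// Sketch: record microphone audio, then transcribe it via the OpenAI client.
const stt = new OpenAITranscriptionApi((text) => console.log("heard:", text));
await stt.start();
// ...later, when the user stops talking:
await stt.stop(); // resolves after the transcription callback has fired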
export class WebTranscriptionApi extends SpeechApi {
  private listeningStatus = false;
  private recognitionInstance: any | null = null;

  isListening = () => this.listeningStatus;

  constructor(transcriptionCallback?: TranscriptionCallback) {
    super();
    // SpeechRecognition is not available in Firefox; leave the instance null there.
    if (isFirefox()) return;
    const SpeechRecognition =
      (window as any).SpeechRecognition ||
      (window as any).webkitSpeechRecognition;
    this.recognitionInstance = new SpeechRecognition();
    this.recognitionInstance.continuous = true;
    this.recognitionInstance.interimResults = true;
    this.recognitionInstance.lang = getSTTLang();
    if (transcriptionCallback) {
      this.onTranscriptionReceived(transcriptionCallback);
    }
    this.recognitionInstance.onresult = (event: any) => {
      const result = event.results[event.results.length - 1];
      if (result.isFinal) {
        this.onTranscription(result[0].transcript);
      }
    };
  }

  async start(): Promise<void> {
    this.listeningStatus = true;
    // optional chaining: the instance is null on Firefox (see constructor)
    await this.recognitionInstance?.start();
  }

  async stop(): Promise<void> {
    this.listeningStatus = false;
    await this.recognitionInstance?.stop();
  }
}
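A sketch of choosing between the two implementations, following the same isFirefox check the constructor uses:

// Sketch: prefer the browser's built-in SpeechRecognition, falling back to
// the MediaRecorder + OpenAI transcription path on Firefox, which lacks it.
const speech: SpeechApi = isFirefox()
  ? new OpenAITranscriptionApi()
  : new WebTranscriptionApi();
speech.onTranscriptionReceived((text) => console.log(text));
await speech.start();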
@ -30,6 +30,7 @@
    "html-to-image": "^1.11.11",
    "lodash-es": "^4.17.21",
    "mermaid": "^10.6.1",
    "markdown-to-txt": "^2.0.1",
    "nanoid": "^5.0.3",
    "next": "^14.1.1",
    "node-fetch": "^3.3.1",
24 yarn.lock
@ -4378,11 +4378,21 @@ lodash.debounce@^4.0.8:
  resolved "https://registry.yarnpkg.com/lodash.debounce/-/lodash.debounce-4.0.8.tgz#82d79bff30a67c4005ffd5e2515300ad9ca4d7af"
  integrity sha512-FT1yDzDYEoYWhnSGnpE/4Kj1fLZkDFyqRb7fNt6FdYOSxlUWAtp42Eh6Wb0rGIv/m9Bgo7x4GhQbm5Ys4SG5ow==

lodash.escape@^4.0.1:
  version "4.0.1"
  resolved "https://registry.yarnpkg.com/lodash.escape/-/lodash.escape-4.0.1.tgz#c9044690c21e04294beaa517712fded1fa88de98"
  integrity sha512-nXEOnb/jK9g0DYMr1/Xvq6l5xMD7GDG55+GSYIYmS0G4tBk/hURD4JR9WCavs04t33WmJx9kCyp9vJ+mr4BOUw==

lodash.merge@^4.6.2:
  version "4.6.2"
  resolved "https://registry.yarnpkg.com/lodash.merge/-/lodash.merge-4.6.2.tgz#558aa53b43b661e1925a0afdfa36a9a1085fe57a"
  integrity sha512-0KpjqXRVvrYyCsX1swR/XTK0va6VQkQM6MNo7PqW77ByjAhoARA8EfrP1N4+KlKj8YS0ZUCtRT/YUuhyYDujIQ==

lodash.unescape@^4.0.1:
  version "4.0.1"
  resolved "https://registry.yarnpkg.com/lodash.unescape/-/lodash.unescape-4.0.1.tgz#bf2249886ce514cda112fae9218cdc065211fc9c"
  integrity sha512-DhhGRshNS1aX6s5YdBE3njCCouPgnG29ebyHvImlZzXZf2SHgt+J08DHgytTPnpywNbO1Y8mNUFyQuIDBq2JZg==

lodash@^4.17.21:
  version "4.17.21"
  resolved "https://registry.npmmirror.com/lodash/-/lodash-4.17.21.tgz#679591c564c3bffaae8454cf0b3df370c3d6911c"
@ -4438,6 +4448,20 @@ markdown-table@^3.0.0:
  resolved "https://registry.yarnpkg.com/markdown-table/-/markdown-table-3.0.3.tgz#e6331d30e493127e031dd385488b5bd326e4a6bd"
  integrity sha512-Z1NL3Tb1M9wH4XESsCDEksWoKTdlUafKc4pt0GRwjUyXaCFZ+dc3g2erqB6zm3szA2IUSi7VnPI+o/9jnxh9hw==

markdown-to-txt@^2.0.1:
  version "2.0.1"
  resolved "https://registry.yarnpkg.com/markdown-to-txt/-/markdown-to-txt-2.0.1.tgz#bfd6233a2635443cc24900a158b60c6af36ce9c5"
  integrity sha512-Hsj7KTN8k1gutlLum3vosHwVZGnv8/cbYKWVkUyo/D1rzOYddbDesILebRfOsaVfjIBJank/AVOySBlHAYqfZw==
  dependencies:
    lodash.escape "^4.0.1"
    lodash.unescape "^4.0.1"
    marked "^4.0.14"

marked@^4.0.14:
  version "4.3.0"
  resolved "https://registry.yarnpkg.com/marked/-/marked-4.3.0.tgz#796362821b019f734054582038b116481b456cf3"
  integrity sha512-PRsaiG84bK+AMvxziE/lCFss8juXjNaWzVbN5tXAm4XjeaS9NAHhop+PjQxz2A9h8Q4M/xGmzP8vqNwy6JeK0A==

mdast-util-definitions@^5.0.0:
  version "5.1.2"
  resolved "https://registry.yarnpkg.com/mdast-util-definitions/-/mdast-util-definitions-5.1.2.tgz#9910abb60ac5d7115d6819b57ae0bcef07a3f7a7"