feat: add tts stt
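Adds speech-to-text input and text-to-speech playback to app/components/chat.tsx of ChatGPT-Next-Web (https://github.com/Yidadaa/ChatGPT-Next-Web.git). When config.sttConfig.enable is set, the send button becomes a microphone toggle: a speech API object records while listening and writes the final transcript into the chat input. Transcription comes from the browser's Web Speech API (WebTranscriptionApi) when the default engine is selected, and from OpenAI (OpenAITranscriptionApi) otherwise; Firefox is forced to FIREFOX_DEFAULT_STT_ENGINE because it does not implement the Web Speech recognition API. When config.ttsConfig.enable is set, each chat message gets a speak/stop action: openaiSpeech converts the message Markdown to plain text, synthesizes audio with either the OpenAI speech endpoint or Microsoft Edge TTS (MsEdgeTTS), and plays it through a module-level ttsPlayer.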
--- a/app/components/chat.tsx
+++ b/app/components/chat.tsx
@@ -10,11 +10,14 @@ import React, {
 } from "react";
 
 import SendWhiteIcon from "../icons/send-white.svg";
+import VoiceWhiteIcon from "../icons/voice-white.svg";
 import BrainIcon from "../icons/brain.svg";
 import RenameIcon from "../icons/rename.svg";
 import ExportIcon from "../icons/share.svg";
 import ReturnIcon from "../icons/return.svg";
 import CopyIcon from "../icons/copy.svg";
+import SpeakIcon from "../icons/speak.svg";
+import SpeakStopIcon from "../icons/speak-stop.svg";
 import LoadingIcon from "../icons/three-dots.svg";
 import LoadingButtonIcon from "../icons/loading.svg";
 import PromptIcon from "../icons/prompt.svg";
@@ -64,6 +67,7 @@ import {
   getMessageImages,
   isVisionModel,
   isDalle3,
+  isFirefox,
 } from "../utils";
 
 import { uploadImage as uploadImageRemote } from "@/app/utils/chat";
@@ -73,7 +77,7 @@ import dynamic from "next/dynamic";
 import { ChatControllerPool } from "../client/controller";
 import { DalleSize, DalleQuality, DalleStyle } from "../typing";
 import { Prompt, usePromptStore } from "../store/prompt";
-import Locale from "../locales";
+import Locale, { getLang, getSTTLang } from "../locales";
 
 import { IconButton } from "./button";
 import styles from "./chat.module.scss";
@@ -90,6 +94,10 @@ import {
 import { useNavigate } from "react-router-dom";
 import {
   CHAT_PAGE_SIZE,
+  DEFAULT_STT_ENGINE,
+  DEFAULT_TTS_ENGINE,
+  FIREFOX_DEFAULT_STT_ENGINE,
+  ModelProvider,
   LAST_INPUT_KEY,
   Path,
   REQUEST_TIMEOUT_MS,
@@ -106,6 +114,16 @@ import { ExportMessageModal } from "./exporter";
 import { getClientConfig } from "../config/client";
 import { useAllModels } from "../utils/hooks";
 import { MultimodalContent } from "../client/api";
+import { ClientApi } from "../client/api";
+import { createTTSPlayer } from "../utils/audio";
+import {
+  OpenAITranscriptionApi,
+  SpeechApi,
+  WebTranscriptionApi,
+} from "../utils/speech";
+import { MsEdgeTTS, OUTPUT_FORMAT } from "../utils/ms_edge_tts";
+
+const ttsPlayer = createTTSPlayer();
 
 const Markdown = dynamic(async () => (await import("./markdown")).Markdown, {
   loading: () => <LoadingIcon />,
@@ -922,6 +940,33 @@ function _Chat() {
     }
   };
+
+  const [isListening, setIsListening] = useState(false);
+  const [isTranscription, setIsTranscription] = useState(false);
+  const [speechApi, setSpeechApi] = useState<any>(null);
+
+  const startListening = async () => {
+    if (speechApi) {
+      await speechApi.start();
+      setIsListening(true);
+    }
+  };
+
+  const stopListening = async () => {
+    if (speechApi) {
+      if (config.sttConfig.engine !== DEFAULT_STT_ENGINE)
+        setIsTranscription(true);
+      await speechApi.stop();
+      setIsListening(false);
+    }
+  };
+
+  const onRecognitionEnd = (finalTranscript: string) => {
+    console.log(finalTranscript);
+    if (finalTranscript) setUserInput(finalTranscript);
+    if (config.sttConfig.engine !== DEFAULT_STT_ENGINE)
+      setIsTranscription(false);
+  };
 
   const doSubmit = (userInput: string) => {
     if (userInput.trim() === "") return;
     const matchCommand = chatCommands.match(userInput);
@@ -992,6 +1037,16 @@ function _Chat() {
       }
     });
     // eslint-disable-next-line react-hooks/exhaustive-deps
+    if (isFirefox()) config.sttConfig.engine = FIREFOX_DEFAULT_STT_ENGINE;
+    setSpeechApi(
+      config.sttConfig.engine === DEFAULT_STT_ENGINE
+        ? new WebTranscriptionApi((transcription) =>
+            onRecognitionEnd(transcription),
+          )
+        : new OpenAITranscriptionApi((transcription) =>
+            onRecognitionEnd(transcription),
+          ),
+    );
   }, []);
 
   // check if should send message
@@ -1102,10 +1157,55 @@ function _Chat() {
     });
   };
 
+  const accessStore = useAccessStore();
+  const [speechStatus, setSpeechStatus] = useState(false);
+  const [speechLoading, setSpeechLoading] = useState(false);
+  async function openaiSpeech(text: string) {
+    if (speechStatus) {
+      ttsPlayer.stop();
+      setSpeechStatus(false);
+    } else {
+      var api: ClientApi;
+      api = new ClientApi(ModelProvider.GPT);
+      const config = useAppConfig.getState();
+      setSpeechLoading(true);
+      ttsPlayer.init();
+      let audioBuffer: ArrayBuffer;
+      const { markdownToTxt } = require("markdown-to-txt");
+      const textContent = markdownToTxt(text);
+      if (config.ttsConfig.engine !== DEFAULT_TTS_ENGINE) {
+        const edgeVoiceName = accessStore.edgeVoiceName();
+        const tts = new MsEdgeTTS();
+        await tts.setMetadata(
+          edgeVoiceName,
+          OUTPUT_FORMAT.AUDIO_24KHZ_96KBITRATE_MONO_MP3,
+        );
+        audioBuffer = await tts.toArrayBuffer(textContent);
+      } else {
+        audioBuffer = await api.llm.speech({
+          model: config.ttsConfig.model,
+          input: textContent,
+          voice: config.ttsConfig.voice,
+          speed: config.ttsConfig.speed,
+        });
+      }
+      setSpeechStatus(true);
+      ttsPlayer
+        .play(audioBuffer, () => {
+          setSpeechStatus(false);
+        })
+        .catch((e) => {
+          console.error("[OpenAI Speech]", e);
+          showToast(prettyObject(e));
+          setSpeechStatus(false);
+        })
+        .finally(() => setSpeechLoading(false));
+    }
+  }
+
   const context: RenderMessage[] = useMemo(() => {
     return session.mask.hideContext ? [] : session.mask.context.slice();
   }, [session.mask.context, session.mask.hideContext]);
-  const accessStore = useAccessStore();
 
   if (
     context.length === 0 &&
@@ -1567,6 +1667,26 @@ function _Chat() {
                         )
                       }
                     />
+                    {config.ttsConfig.enable && (
+                      <ChatAction
+                        text={
+                          speechStatus
+                            ? Locale.Chat.Actions.StopSpeech
+                            : Locale.Chat.Actions.Speech
+                        }
+                        loding={speechLoading}
+                        icon={
+                          speechStatus ? (
+                            <SpeakStopIcon />
+                          ) : (
+                            <SpeakIcon />
+                          )
+                        }
+                        onClick={() =>
+                          openaiSpeech(getMessageTextContent(message))
+                        }
+                      />
+                    )}
                   </>
                 )}
               </div>
@@ -1714,13 +1834,35 @@ function _Chat() {
           })}
         </div>
       )}
-      <IconButton
-        icon={<SendWhiteIcon />}
-        text={Locale.Chat.Send}
-        className={styles["chat-input-send"]}
-        type="primary"
-        onClick={() => doSubmit(userInput)}
-      />
+      {config.sttConfig.enable ? (
+        <IconButton
+          icon={<VoiceWhiteIcon />}
+          text={
+            isListening ? Locale.Chat.StopSpeak : Locale.Chat.StartSpeak
+          }
+          className={styles["chat-input-send"]}
+          type="primary"
+          onClick={async () =>
+            isListening ? await stopListening() : await startListening()
+          }
+          loding={isTranscription}
+        />
+      ) : (
+        <IconButton
+          icon={<SendWhiteIcon />}
+          text={Locale.Chat.Send}
+          className={styles["chat-input-send"]}
+          type="primary"
+          onClick={() => doSubmit(userInput)}
+        />
+      )}
+      {/* <IconButton
+        icon={<SendWhiteIcon />}
+        text={Locale.Chat.Send}
+        className={styles["chat-input-send"]}
+        type="primary"
+        onClick={() => doSubmit(userInput)}
+      />
+      /> */}
     </label>
   </div>
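The diff calls into createTTSPlayer from ../utils/audio but does not include it; only its contract is visible above: init() before synthesis, a promise-returning play(audioBuffer, onEnded), and stop(). A minimal sketch of a player satisfying that contract with the browser Web Audio API could look as follows. This is an illustration of the call sites above, not the module shipped with the commit.

// Illustrative only: the real "../utils/audio" is not part of this diff.
export function createTTSPlayer() {
  let ctx: AudioContext | null = null;
  let source: AudioBufferSourceNode | null = null;

  return {
    // openaiSpeech calls init() from a click handler, which also keeps
    // browser autoplay policies satisfied when creating the AudioContext.
    init() {
      ctx = ctx ?? new AudioContext();
    },
    // Decode the synthesized audio (MP3 from Edge TTS or the OpenAI
    // speech endpoint) and play it; onEnded fires when playback finishes,
    // which is where openaiSpeech resets speechStatus.
    async play(buffer: ArrayBuffer, onEnded: () => void) {
      if (!ctx) throw new Error("call init() first");
      const decoded = await ctx.decodeAudioData(buffer);
      source = ctx.createBufferSource();
      source.buffer = decoded;
      source.connect(ctx.destination);
      source.onended = onEnded;
      source.start();
    },
    // Stop path of the speak/stop toggle; stopping also fires onended,
    // so speechStatus is reset on both paths.
    stop() {
      source?.stop();
      source = null;
    },
  };
}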
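Likewise, SpeechApi, WebTranscriptionApi, and OpenAITranscriptionApi come from ../utils/speech, which is also not included here; the chat component only uses start(), stop(), and a constructor callback that receives the final transcript. A hypothetical sketch of the browser-side variant, assuming it wraps SpeechRecognition (webkit-prefixed in Chromium and absent in Firefox, hence the FIREFOX_DEFAULT_STT_ENGINE fallback):

// Illustrative only: the real "../utils/speech" is not part of this diff.
export class WebTranscriptionApi {
  private recognition: any; // SpeechRecognition; typed loosely for portability
  private finalTranscript = "";

  constructor(onTranscription: (transcript: string) => void) {
    const Ctor =
      (window as any).SpeechRecognition ||
      (window as any).webkitSpeechRecognition; // undefined on Firefox
    this.recognition = new Ctor();
    this.recognition.continuous = true;
    this.recognition.interimResults = true;
    // The commit imports getSTTLang from "../locales", presumably mapping
    // the UI language to a BCP-47 tag; hardcoded here for the sketch.
    this.recognition.lang = "en-US";
    this.recognition.onresult = (event: any) => {
      // Accumulate only finalized result segments.
      for (let i = event.resultIndex; i < event.results.length; i++) {
        if (event.results[i].isFinal) {
          this.finalTranscript += event.results[i][0].transcript;
        }
      }
    };
    // Deliver the transcript once, when recognition ends.
    this.recognition.onend = () => onTranscription(this.finalTranscript);
  }

  async start() {
    this.finalTranscript = "";
    this.recognition.start();
  }

  async stop() {
    this.recognition.stop();
  }
}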