feat: tts
This commit is contained in:
parent 212605a7e3
commit 3ae8ec1af6
@@ -64,16 +64,6 @@ export interface SpeechOptions {
  onController?: (controller: AbortController) => void;
}

export interface TranscriptionOptions {
  model?: "whisper-1";
  file: Blob;
  language?: string;
  prompt?: string;
  response_format?: "json" | "text" | "srt" | "verbose_json" | "vtt";
  temperature?: number;
  onController?: (controller: AbortController) => void;
}

export interface ChatOptions {
  messages: RequestMessage[];
  config: LLMConfig;
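
Note: TranscriptionOptions mirrors the parameters of OpenAI's audio-transcriptions endpoint. A minimal caller-side sketch, assuming some LLMApi implementation `api` and a previously recorded `audioBlob` (neither name is part of this diff):

// Hedged usage sketch, not part of the commit.
let controller: AbortController | undefined;
const text = await api.transcription({
  file: audioBlob,         // required: the recorded audio
  model: "whisper-1",      // the only value this interface allows
  language: "en",          // optional language hint
  response_format: "json", // optional; OpenAI's endpoint defaults to JSON
  onController: (c) => (controller = c), // lets the caller abort mid-flight
});
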
@@ -109,7 +99,6 @@ export interface LLMModelProvider {
export abstract class LLMApi {
  abstract chat(options: ChatOptions): Promise<void>;
  abstract speech(options: SpeechOptions): Promise<ArrayBuffer>;
  abstract transcription(options: TranscriptionOptions): Promise<string>;
  abstract usage(): Promise<LLMUsage>;
  abstract models(): Promise<LLMModel[]>;
}

@@ -13,7 +13,6 @@ import {
  LLMApi,
  LLMModel,
  SpeechOptions,
  TranscriptionOptions,
  MultimodalContent,
} from "../api";
import Locale from "../../locales";

@@ -88,9 +87,6 @@ export class QwenApi implements LLMApi {
  speech(options: SpeechOptions): Promise<ArrayBuffer> {
    throw new Error("Method not implemented.");
  }
  transcription(options: TranscriptionOptions): Promise<string> {
    throw new Error("Method not implemented.");
  }

  async chat(options: ChatOptions) {
    const messages = options.messages.map((v) => ({

@@ -5,7 +5,6 @@ import {
  LLMApi,
  MultimodalContent,
  SpeechOptions,
  TranscriptionOptions,
} from "../api";
import {
  useAccessStore,

@@ -90,9 +89,6 @@ export class ClaudeApi implements LLMApi {
  speech(options: SpeechOptions): Promise<ArrayBuffer> {
    throw new Error("Method not implemented.");
  }
  transcription(options: TranscriptionOptions): Promise<string> {
    throw new Error("Method not implemented.");
  }

  extractMessage(res: any) {
    console.log("[Response] claude response: ", res);

@@ -15,7 +15,6 @@ import {
  LLMModel,
  MultimodalContent,
  SpeechOptions,
  TranscriptionOptions,
} from "../api";
import Locale from "../../locales";
import {

@@ -80,9 +79,6 @@ export class ErnieApi implements LLMApi {
  speech(options: SpeechOptions): Promise<ArrayBuffer> {
    throw new Error("Method not implemented.");
  }
  transcription(options: TranscriptionOptions): Promise<string> {
    throw new Error("Method not implemented.");
  }

  async chat(options: ChatOptions) {
    const messages = options.messages.map((v) => ({

@@ -14,7 +14,6 @@ import {
  LLMModel,
  MultimodalContent,
  SpeechOptions,
  TranscriptionOptions,
} from "../api";
import Locale from "../../locales";
import {

@@ -82,9 +81,6 @@ export class DoubaoApi implements LLMApi {
  speech(options: SpeechOptions): Promise<ArrayBuffer> {
    throw new Error("Method not implemented.");
  }
  transcription(options: TranscriptionOptions): Promise<string> {
    throw new Error("Method not implemented.");
  }

  async chat(options: ChatOptions) {
    const messages = options.messages.map((v) => ({

@@ -6,7 +6,6 @@ import {
  LLMModel,
  LLMUsage,
  SpeechOptions,
  TranscriptionOptions,
} from "../api";
import { useAccessStore, useAppConfig, useChatStore } from "@/app/store";
import { getClientConfig } from "@/app/config/client";

@@ -67,9 +66,7 @@ export class GeminiProApi implements LLMApi {
  speech(options: SpeechOptions): Promise<ArrayBuffer> {
    throw new Error("Method not implemented.");
  }
  transcription(options: TranscriptionOptions): Promise<string> {
    throw new Error("Method not implemented.");
  }

  async chat(options: ChatOptions): Promise<void> {
    const apiClient = this;
    let multimodal = false;

@@ -13,7 +13,6 @@ import {
  LLMApi,
  LLMModel,
  SpeechOptions,
  TranscriptionOptions,
} from "../api";
import Locale from "../../locales";
import {

@@ -63,9 +62,6 @@ export class SparkApi implements LLMApi {
  speech(options: SpeechOptions): Promise<ArrayBuffer> {
    throw new Error("Method not implemented.");
  }
  transcription(options: TranscriptionOptions): Promise<string> {
    throw new Error("Method not implemented.");
  }

  async chat(options: ChatOptions) {
    const messages: ChatOptions["messages"] = [];

@@ -27,7 +27,6 @@ import {
  LLMUsage,
  MultimodalContent,
  SpeechOptions,
  TranscriptionOptions,
} from "../api";
import Locale from "../../locales";
import {

@@ -77,9 +76,6 @@ export class MoonshotApi implements LLMApi {
  speech(options: SpeechOptions): Promise<ArrayBuffer> {
    throw new Error("Method not implemented.");
  }
  transcription(options: TranscriptionOptions): Promise<string> {
    throw new Error("Method not implemented.");
  }

  async chat(options: ChatOptions) {
    const messages: ChatOptions["messages"] = [];

@@ -34,7 +34,6 @@ import {
  LLMUsage,
  MultimodalContent,
  SpeechOptions,
  TranscriptionOptions,
} from "../api";
import Locale from "../../locales";
import {

@@ -187,47 +186,6 @@ export class ChatGPTApi implements LLMApi {
    }
  }

  async transcription(options: TranscriptionOptions): Promise<string> {
    const formData = new FormData();
    formData.append("file", options.file, "audio.wav");
    formData.append("model", options.model ?? "whisper-1");
    if (options.language) formData.append("language", options.language);
    if (options.prompt) formData.append("prompt", options.prompt);
    if (options.response_format)
      formData.append("response_format", options.response_format);
    if (options.temperature)
      formData.append("temperature", options.temperature.toString());

    console.log("[Request] openai audio transcriptions payload: ", options);

    const controller = new AbortController();
    options.onController?.(controller);

    try {
      const path = this.path(OpenaiPath.TranscriptionPath, options.model);
      const headers = getHeaders(true);
      const payload = {
        method: "POST",
        body: formData,
        signal: controller.signal,
        headers: headers,
      };

      // make a fetch request
      const requestTimeoutId = setTimeout(
        () => controller.abort(),
        REQUEST_TIMEOUT_MS,
      );
      const res = await fetch(path, payload);
      clearTimeout(requestTimeoutId);
      const json = await res.json();
      return json.text;
    } catch (e) {
      console.log("[Request] failed to make an audio transcriptions request", e);
      throw e;
    }
  }

  async chat(options: ChatOptions) {
    const modelConfig = {
      ...useAppConfig.getState().modelConfig,

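Note: this is the only provider with a real transcription implementation; every other platform in this diff throws "Method not implemented.". A condensed sketch of how it gets exercised, matching the OpenAITranscriptionApi.stop() flow in utils/speech.ts further down (`recordedChunks` is an assumed name for MediaRecorder output):

// Hedged sketch, condensed from utils/speech.ts below.
const audioBlob = new Blob(recordedChunks, { type: "audio/wav" });
const llm = new ChatGPTApi();
const text = await llm.transcription({ file: audioBlob }); // model defaults to "whisper-1"
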
@@ -9,7 +9,6 @@ import {
  LLMModel,
  MultimodalContent,
  SpeechOptions,
  TranscriptionOptions,
} from "../api";
import Locale from "../../locales";
import {

@@ -94,9 +93,6 @@ export class HunyuanApi implements LLMApi {
  speech(options: SpeechOptions): Promise<ArrayBuffer> {
    throw new Error("Method not implemented.");
  }
  transcription(options: TranscriptionOptions): Promise<string> {
    throw new Error("Method not implemented.");
  }

  async chat(options: ChatOptions) {
    const visionModel = isVisionModel(options.config.model);

@@ -10,7 +10,6 @@ import React, {
} from "react";

import SendWhiteIcon from "../icons/send-white.svg";
import VoiceWhiteIcon from "../icons/voice-white.svg";
import BrainIcon from "../icons/brain.svg";
import RenameIcon from "../icons/rename.svg";
import ExportIcon from "../icons/share.svg";

@@ -83,7 +82,7 @@ import dynamic from "next/dynamic";
import { ChatControllerPool } from "../client/controller";
import { DalleSize, DalleQuality, DalleStyle } from "../typing";
import { Prompt, usePromptStore } from "../store/prompt";
import Locale, { getLang, getSTTLang } from "../locales";
import Locale from "../locales";

import { IconButton } from "./button";
import styles from "./chat.module.scss";

@@ -100,9 +99,7 @@ import {
import { useNavigate } from "react-router-dom";
import {
  CHAT_PAGE_SIZE,
  DEFAULT_STT_ENGINE,
  DEFAULT_TTS_ENGINE,
  FIREFOX_DEFAULT_STT_ENGINE,
  ModelProvider,
  LAST_INPUT_KEY,
  Path,

@@ -123,11 +120,6 @@ import { MultimodalContent } from "../client/api";
const localStorage = safeLocalStorage();
import { ClientApi } from "../client/api";
import { createTTSPlayer } from "../utils/audio";
import {
  OpenAITranscriptionApi,
  SpeechApi,
  WebTranscriptionApi,
} from "../utils/speech";
import { MsEdgeTTS, OUTPUT_FORMAT } from "../utils/ms_edge_tts";

const ttsPlayer = createTTSPlayer();

@@ -556,44 +548,6 @@ export function ChatActions(props: {
    }
  }, [chatStore, currentModel, models]);

  const [isListening, setIsListening] = useState(false);
  const [isTranscription, setIsTranscription] = useState(false);
  const [speechApi, setSpeechApi] = useState<any>(null);

  useEffect(() => {
    if (isFirefox()) config.sttConfig.engine = FIREFOX_DEFAULT_STT_ENGINE;
    setSpeechApi(
      config.sttConfig.engine === DEFAULT_STT_ENGINE
        ? new WebTranscriptionApi((transcription) =>
            onRecognitionEnd(transcription),
          )
        : new OpenAITranscriptionApi((transcription) =>
            onRecognitionEnd(transcription),
          ),
    );
  }, []);

  const startListening = async () => {
    if (speechApi) {
      await speechApi.start();
      setIsListening(true);
    }
  };
  const stopListening = async () => {
    if (speechApi) {
      if (config.sttConfig.engine !== DEFAULT_STT_ENGINE)
        setIsTranscription(true);
      await speechApi.stop();
      setIsListening(false);
    }
  };
  const onRecognitionEnd = (finalTranscript: string) => {
    console.log(finalTranscript);
    if (finalTranscript) props.setUserInput(finalTranscript);
    if (config.sttConfig.engine !== DEFAULT_STT_ENGINE)
      setIsTranscription(false);
  };

  return (
    <div className={styles["chat-input-actions"]}>
      {couldStop && (

|
@ -828,16 +782,6 @@ export function ChatActions(props: {
|
|||
icon={<ShortcutkeyIcon />}
|
||||
/>
|
||||
)}
|
||||
|
||||
{config.sttConfig.enable && (
|
||||
<ChatAction
|
||||
onClick={async () =>
|
||||
isListening ? await stopListening() : await startListening()
|
||||
}
|
||||
text={isListening ? Locale.Chat.StopSpeak : Locale.Chat.StartSpeak}
|
||||
icon={<VoiceWhiteIcon />}
|
||||
/>
|
||||
)}
|
||||
</div>
|
||||
);
|
||||
}
|
||||
|
|
|
@@ -81,7 +81,6 @@ import { nanoid } from "nanoid";
import { useMaskStore } from "../store/mask";
import { ProviderType } from "../utils/cloud";
import { TTSConfigList } from "./tts-config";
import { STTConfigList } from "./stt-config";

function EditPromptModal(props: { id: string; onClose: () => void }) {
  const promptStore = usePromptStore();

@@ -1659,17 +1658,6 @@ export function Settings() {
        />
      </List>

      <List>
        <STTConfigList
          sttConfig={config.sttConfig}
          updateConfig={(updater) => {
            const sttConfig = { ...config.sttConfig };
            updater(sttConfig);
            config.update((config) => (config.sttConfig = sttConfig));
          }}
        />
      </List>

      <DangerItems />
    </div>
  </ErrorBoundary>

@@ -1,51 +0,0 @@
import { STTConfig, STTConfigValidator } from "../store";

import Locale from "../locales";
import { ListItem, Select } from "./ui-lib";
import { DEFAULT_STT_ENGINES } from "../constant";
import { isFirefox } from "../utils";

export function STTConfigList(props: {
  sttConfig: STTConfig;
  updateConfig: (updater: (config: STTConfig) => void) => void;
}) {
  return (
    <>
      <ListItem
        title={Locale.Settings.STT.Enable.Title}
        subTitle={Locale.Settings.STT.Enable.SubTitle}
      >
        <input
          type="checkbox"
          checked={props.sttConfig.enable}
          onChange={(e) =>
            props.updateConfig(
              (config) => (config.enable = e.currentTarget.checked),
            )
          }
        ></input>
      </ListItem>
      {!isFirefox() && (
        <ListItem title={Locale.Settings.STT.Engine.Title}>
          <Select
            value={props.sttConfig.engine}
            onChange={(e) => {
              props.updateConfig(
                (config) =>
                  (config.engine = STTConfigValidator.engine(
                    e.currentTarget.value,
                  )),
              );
            }}
          >
            {DEFAULT_STT_ENGINES.map((v, i) => (
              <option value={v} key={i}>
                {v}
              </option>
            ))}
          </Select>
        </ListItem>
      )}
    </>
  );
}

@@ -1,119 +0,0 @@
@import "../styles/animation.scss";

.plugin-page {
  height: 100%;
  display: flex;
  flex-direction: column;

  .plugin-page-body {
    padding: 20px;
    overflow-y: auto;

    .plugin-filter {
      width: 100%;
      max-width: 100%;
      margin-bottom: 20px;
      animation: slide-in ease 0.3s;
      height: 40px;

      display: flex;

      .search-bar {
        flex-grow: 1;
        max-width: 100%;
        min-width: 0;
        outline: none;
      }

      .search-bar:focus {
        border: 1px solid var(--primary);
      }

      .plugin-filter-lang {
        height: 100%;
        margin-left: 10px;
      }

      .plugin-create {
        height: 100%;
        margin-left: 10px;
        box-sizing: border-box;
        min-width: 80px;
      }
    }

    .plugin-item {
      display: flex;
      justify-content: space-between;
      padding: 20px;
      border: var(--border-in-light);
      animation: slide-in ease 0.3s;

      &:not(:last-child) {
        border-bottom: 0;
      }

      &:first-child {
        border-top-left-radius: 10px;
        border-top-right-radius: 10px;
      }

      &:last-child {
        border-bottom-left-radius: 10px;
        border-bottom-right-radius: 10px;
      }

      .plugin-header {
        display: flex;
        align-items: center;

        .plugin-icon {
          display: flex;
          align-items: center;
          justify-content: center;
          margin-right: 10px;
        }

        .plugin-title {
          .plugin-name {
            font-size: 14px;
            font-weight: bold;
          }
          .plugin-info {
            font-size: 12px;
          }
          .plugin-runtime-warning {
            font-size: 12px;
            color: #f86c6c;
          }
        }
      }

      .plugin-actions {
        display: flex;
        flex-wrap: nowrap;
        transition: all ease 0.3s;
        justify-content: center;
        align-items: center;
      }

      @media screen and (max-width: 600px) {
        display: flex;
        flex-direction: column;
        padding-bottom: 10px;
        border-radius: 10px;
        margin-bottom: 20px;
        box-shadow: var(--card-shadow);

        &:not(:last-child) {
          border-bottom: var(--border-in-light);
        }

        .plugin-actions {
          width: 100%;
          justify-content: space-between;
          padding-top: 10px;
        }
      }
    }
  }
}

@@ -153,7 +153,6 @@ export const Anthropic = {
export const OpenaiPath = {
  ChatPath: "v1/chat/completions",
  SpeechPath: "v1/audio/speech",
  TranscriptionPath: "v1/audio/transcriptions",
  ImagePath: "v1/images/generations",
  UsagePath: "dashboard/billing/usage",
  SubsPath: "dashboard/billing/subscription",

@@ -274,10 +273,6 @@ export const DEFAULT_TTS_VOICES = [
  "shimmer",
];

export const DEFAULT_STT_ENGINE = "WebAPI";
export const DEFAULT_STT_ENGINES = ["WebAPI", "OpenAI Whisper"];
export const FIREFOX_DEFAULT_STT_ENGINE = "OpenAI Whisper";

const openaiModels = [
  "gpt-3.5-turbo",
  "gpt-3.5-turbo-1106",
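
Note: the separate Firefox default exists because Firefox does not implement the Web Speech API's SpeechRecognition (or webkitSpeechRecognition), which the "WebAPI" engine relies on. A condensed view of how chat.tsx and stt-config.tsx combine these constants:

// Hedged sketch, condensed from elsewhere in this diff: Firefox is forced
// onto the Whisper path, everyone else keeps the configured engine.
const engine = isFirefox()
  ? FIREFOX_DEFAULT_STT_ENGINE // "OpenAI Whisper"
  : config.sttConfig.engine;   // defaults to DEFAULT_STT_ENGINE, "WebAPI"
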
@@ -520,16 +520,6 @@ const cn = {
        SubTitle: "生成语音的速度",
      },
    },
    STT: {
      Enable: {
        Title: "启用语音转文本",
        SubTitle: "启用语音转文本",
      },
      Engine: {
        Title: "转换引擎",
        SubTitle: "音频转换引擎",
      },
    },
  },
  Store: {
    DefaultTopic: "新的聊天",

@@ -527,16 +527,6 @@ const en: LocaleType = {
      },
      Engine: "TTS Engine",
    },
    STT: {
      Enable: {
        Title: "Enable STT",
        SubTitle: "Enable Speech-to-Text",
      },
      Engine: {
        Title: "STT Engine",
        SubTitle: "Speech-to-Text Engine",
      },
    },
  },
  Store: {
    DefaultTopic: "New Conversation",

@@ -5,8 +5,6 @@ import {
  DEFAULT_INPUT_TEMPLATE,
  DEFAULT_MODELS,
  DEFAULT_SIDEBAR_WIDTH,
  DEFAULT_STT_ENGINE,
  DEFAULT_STT_ENGINES,
  DEFAULT_TTS_ENGINE,
  DEFAULT_TTS_ENGINES,
  DEFAULT_TTS_MODEL,

@@ -23,8 +21,6 @@ export type TTSModelType = (typeof DEFAULT_TTS_MODELS)[number];
export type TTSVoiceType = (typeof DEFAULT_TTS_VOICES)[number];
export type TTSEngineType = (typeof DEFAULT_TTS_ENGINES)[number];

export type STTEngineType = (typeof DEFAULT_STT_ENGINES)[number];

export enum SubmitKey {
  Enter = "Enter",
  CtrlEnter = "Ctrl + Enter",

@@ -90,17 +86,12 @@ export const DEFAULT_CONFIG = {
    voice: DEFAULT_TTS_VOICE,
    speed: 1.0,
  },
  sttConfig: {
    enable: false,
    engine: DEFAULT_STT_ENGINE,
  },
};

export type ChatConfig = typeof DEFAULT_CONFIG;

export type ModelConfig = ChatConfig["modelConfig"];
export type TTSConfig = ChatConfig["ttsConfig"];
export type STTConfig = ChatConfig["sttConfig"];

export function limitNumber(
  x: number,

@@ -130,12 +121,6 @@ export const TTSConfigValidator = {
  },
};

export const STTConfigValidator = {
  engine(x: string) {
    return x as STTEngineType;
  },
};

export const ModalConfigValidator = {
  model(x: string) {
    return x as ModelType;
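
Note: STTConfigValidator follows the same pattern as TTSConfigValidator, a thin cast that funnels arbitrary strings (for example, from a <Select>) into the STTEngineType union. A condensed sketch of an update, combining the settings.tsx and stt-config.tsx hunks above (assuming `config` is the useAppConfig store instance):

// Hedged sketch, not part of the commit.
config.update((cfg) => {
  cfg.sttConfig.enable = true;
  cfg.sttConfig.engine = STTConfigValidator.engine("OpenAI Whisper");
});
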
@@ -1,126 +0,0 @@
import { ChatGPTApi } from "../client/platforms/openai";
import { getSTTLang } from "../locales";
import { isFirefox } from "../utils";

export type TranscriptionCallback = (transcription: string) => void;

export abstract class SpeechApi {
  protected onTranscription: TranscriptionCallback = () => {};

  abstract isListening(): boolean;
  abstract start(): Promise<void>;
  abstract stop(): Promise<void>;

  onTranscriptionReceived(callback: TranscriptionCallback) {
    this.onTranscription = callback;
  }
}

export class OpenAITranscriptionApi extends SpeechApi {
  private listeningStatus = false;
  private mediaRecorder: MediaRecorder | null = null;
  private stream: MediaStream | null = null;
  private audioChunks: Blob[] = [];

  isListening = () => this.listeningStatus;

  constructor(transcriptionCallback?: TranscriptionCallback) {
    super();
    if (transcriptionCallback) {
      this.onTranscriptionReceived(transcriptionCallback);
    }
  }

  async start(): Promise<void> {
    // @ts-ignore
    navigator.getUserMedia =
      // @ts-ignore
      navigator.getUserMedia ||
      // @ts-ignore
      navigator.webkitGetUserMedia ||
      // @ts-ignore
      navigator.mozGetUserMedia ||
      // @ts-ignore
      navigator.msGetUserMedia;
    if (navigator.mediaDevices) {
      const stream = await navigator.mediaDevices.getUserMedia({ audio: true });
      this.mediaRecorder = new MediaRecorder(stream);
      this.mediaRecorder.ondataavailable = (e) => {
        if (e.data && e.data.size > 0) {
          this.audioChunks.push(e.data);
        }
      };

      this.stream = stream;
    } else {
      console.warn("Media Devices will work only with SSL");
      return;
    }

    this.audioChunks = [];

    // this.recorder.addEventListener("dataavailable", (event) => {
    //   this.audioChunks.push(event.data);
    // });

    this.mediaRecorder.start(1000);
    this.listeningStatus = true;
  }

  async stop(): Promise<void> {
    if (!this.mediaRecorder || !this.listeningStatus) {
      return;
    }

    return new Promise((resolve) => {
      this.mediaRecorder!.addEventListener("stop", async () => {
        const audioBlob = new Blob(this.audioChunks, { type: "audio/wav" });
        const llm = new ChatGPTApi();
        const transcription = await llm.transcription({ file: audioBlob });
        this.onTranscription(transcription);
        this.listeningStatus = false;
        resolve();
      });

      this.mediaRecorder!.stop();
    });
  }
}

export class WebTranscriptionApi extends SpeechApi {
  private listeningStatus = false;
  private recognitionInstance: any | null = null;

  isListening = () => this.listeningStatus;

  constructor(transcriptionCallback?: TranscriptionCallback) {
    super();
    if (isFirefox()) return;
    const SpeechRecognition =
      (window as any).SpeechRecognition ||
      (window as any).webkitSpeechRecognition;
    this.recognitionInstance = new SpeechRecognition();
    this.recognitionInstance.continuous = true;
    this.recognitionInstance.interimResults = true;
    this.recognitionInstance.lang = getSTTLang();
    if (transcriptionCallback) {
      this.onTranscriptionReceived(transcriptionCallback);
    }
    this.recognitionInstance.onresult = (event: any) => {
      const result = event.results[event.results.length - 1];
      if (result.isFinal) {
        this.onTranscription(result[0].transcript);
      }
    };
  }

  async start(): Promise<void> {
    this.listeningStatus = true;
    await this.recognitionInstance.start();
  }

  async stop(): Promise<void> {
    this.listeningStatus = false;
    await this.recognitionInstance.stop();
  }
}
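
Note: the ChatActions hunk above wires these two classes up roughly as follows (condensed; `setUserInput` stands in for the chat input setter from the component):

// Hedged sketch, condensed from the chat.tsx hunk in this diff.
const stt =
  config.sttConfig.engine === DEFAULT_STT_ENGINE // "WebAPI", non-Firefox only
    ? new WebTranscriptionApi((t) => setUserInput(t))
    : new OpenAITranscriptionApi((t) => setUserInput(t));
await stt.start(); // begin recognition / recording
// ...user speaks...
await stt.stop();  // the final transcript arrives through the callback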