feat: add tts stt

2025-11-08 11:46:12 +08:00 · 2024-08-27 16:21:02 +08:00
parent 718782f5b1
commit 2f410fc09f
22 changed files with 1446 additions and 6 deletions
--- a/app/client/api.ts
+++ b/app/client/api.ts
@@ -20,6 +20,7 @@ export const ROLES = ["system", "user", "assistant"] as const;
 export type MessageRole = (typeof ROLES)[number];

 export const Models = ["gpt-3.5-turbo", "gpt-4"] as const;
+/**
+ * Speech model identifiers. `as const` keeps the array readonly with literal
+ * element types. NOTE(review): presumably the values offered for
+ * `SpeechOptions.model` — confirm against callers.
+ */
+export const TTSModels = ["tts-1", "tts-1-hd"] as const;
 export type ChatModel = ModelType;

 export interface MultimodalContent {
@@ -48,6 +49,25 @@ export interface LLMConfig {
  style?: DalleRequestPayload["style"];
 }

+/**
+ * Arguments accepted by `LLMApi.speech`.
+ *
+ * The OpenAI client forwards `model`, `input`, `voice`, `response_format` and
+ * `speed` verbatim in the JSON request body; `onController` is a client-side
+ * hook and is never sent to the server.
+ */
+export interface SpeechOptions {
+  // --- required request fields ---
+
+  /** Name of the speech model to use. */
+  model: string;
+  /** Text to be turned into audio. */
+  input: string;
+  /** Name of the voice to synthesize with. */
+  voice: string;
+
+  // --- optional request fields (left out of the body when undefined) ---
+
+  /** Requested audio container/encoding, passed through unchanged. */
+  response_format?: string;
+  /** Playback speed multiplier, passed through unchanged. */
+  speed?: number;
+
+  // --- client-side hooks ---
+
+  /**
+   * Invoked with the request's AbortController before the request is sent,
+   * so the caller can cancel it.
+   */
+  onController?: (controller: AbortController) => void;
+}
+
+/**
+ * Arguments accepted by `LLMApi.transcription`.
+ *
+ * The OpenAI client sends every field except `onController` as multipart
+ * form data; optional fields are only appended when provided.
+ */
+export interface TranscriptionOptions {
+  /** Audio to transcribe. */
+  file: Blob;
+  /** Transcription model; the OpenAI client falls back to "whisper-1". */
+  model?: "whisper-1";
+  /** Language hint for the audio, passed through unchanged. */
+  language?: string;
+  /** Optional prompt text, passed through unchanged. */
+  prompt?: string;
+  /** Output format requested from the server. */
+  response_format?:
+    | "json"
+    | "text"
+    | "srt"
+    | "verbose_json"
+    | "vtt";
+  /** Sampling temperature, sent as its string form. */
+  temperature?: number;
+  /**
+   * Invoked with the request's AbortController before the request is sent,
+   * so the caller can cancel it.
+   */
+  onController?: (controller: AbortController) => void;
+}
+
 export interface ChatOptions {
  messages: RequestMessage[];
  config: LLMConfig;
@@ -80,6 +100,8 @@ export interface LLMModelProvider {

+/**
+ * Abstract surface that provider clients (e.g. `ChatGPTApi`) implement.
+ * Every operation is asynchronous and reports failure by rejecting.
+ */
 export abstract class LLMApi {
+  /** Runs a chat request described by `options`; resolves with no value. */
  abstract chat(options: ChatOptions): Promise<void>;
+  /** Synthesizes audio for `options.input`; resolves with the raw audio bytes. */
+  abstract speech(options: SpeechOptions): Promise<ArrayBuffer>;
+  /** Transcribes `options.file`; resolves with the transcript as a string. */
+  abstract transcription(options: TranscriptionOptions): Promise<string>;
+  /** Resolves with usage information for the configured account. */
  abstract usage(): Promise<LLMUsage>;
+  /** Resolves with the list of models the provider exposes. */
  abstract models(): Promise<LLMModel[]>;
 }
--- a/app/client/platforms/openai.ts
+++ b/app/client/platforms/openai.ts
@@ -26,6 +26,8 @@ import {
  LLMModel,
  LLMUsage,
  MultimodalContent,
+  SpeechOptions,
+  TranscriptionOptions,
 } from "../api";
 import Locale from "../../locales";
 import {
@@ -77,7 +79,7 @@ export interface DalleRequestPayload {
 export class ChatGPTApi implements LLMApi {
  private disableListModels = true;

-  path(path: string): string {
+  path(path: string, model?: string): string {
    const accessStore = useAccessStore.getState();

    let baseUrl = "";
@@ -140,6 +142,85 @@ export class ChatGPTApi implements LLMApi {
    return res.choices?.at(0)?.message?.content ?? res;
  }

+  /**
+   * Requests synthesized audio for `options.input` from the speech endpoint.
+   *
+   * @param options - Model, input text and voice, plus optional
+   *   `response_format` and `speed`. `onController` is invoked with the
+   *   request's AbortController before the request is sent so the caller can
+   *   cancel it.
+   * @returns The response body as raw bytes.
+   * @throws Error when the server answers with a non-2xx status. Network
+   *   failures and aborts (manual, or after REQUEST_TIMEOUT_MS without a
+   *   response) are logged and rethrown unchanged.
+   */
+  async speech(options: SpeechOptions): Promise<ArrayBuffer> {
+    // Undefined optional fields are dropped by JSON.stringify below.
+    const requestPayload = {
+      model: options.model,
+      input: options.input,
+      voice: options.voice,
+      response_format: options.response_format,
+      speed: options.speed,
+    };
+
+    console.log("[Request] openai speech payload: ", requestPayload);
+
+    const controller = new AbortController();
+    options.onController?.(controller);
+
+    try {
+      const speechPath = this.path(OpenaiPath.SpeechPath, options.model);
+      const speechPayload = {
+        method: "POST",
+        body: JSON.stringify(requestPayload),
+        signal: controller.signal,
+        headers: getHeaders(),
+      };
+
+      // Abort if no response arrives within REQUEST_TIMEOUT_MS.
+      const requestTimeoutId = setTimeout(
+        () => controller.abort(),
+        REQUEST_TIMEOUT_MS,
+      );
+
+      // Clear the timer whether fetch resolves or rejects, so a failed
+      // request does not leave a pending abort timer behind.
+      const res = await fetch(speechPath, speechPayload).finally(() =>
+        clearTimeout(requestTimeoutId),
+      );
+
+      // An error response carries an error document, not audio; surface it
+      // instead of handing those bytes to the caller as if they were sound.
+      if (!res.ok) {
+        throw new Error(
+          `speech request failed: ${res.status} ${res.statusText}`,
+        );
+      }
+
+      return await res.arrayBuffer();
+    } catch (e) {
+      console.log("[Request] failed to make a speech request", e);
+      throw e;
+    }
+  }
+
+  /**
+   * Transcribes `options.file` by POSTing it as multipart form data to the
+   * transcription endpoint.
+   *
+   * @param options - Audio blob plus optional model (defaults to
+   *   "whisper-1"), language, prompt, response format and temperature.
+   *   `onController` is invoked with the request's AbortController before
+   *   the request is sent so the caller can cancel it.
+   * @returns The `text` field of the JSON response, or the raw response body
+   *   when `response_format` is "text", "srt" or "vtt" (those formats are
+   *   not JSON).
+   * @throws Error when the server answers with a non-2xx status. Network
+   *   failures and aborts (manual, or after REQUEST_TIMEOUT_MS without a
+   *   response) are logged and rethrown unchanged.
+   */
+  async transcription(options: TranscriptionOptions): Promise<string> {
+    const formData = new FormData();
+    formData.append("file", options.file, "audio.wav");
+    formData.append("model", options.model ?? "whisper-1");
+    if (options.language) formData.append("language", options.language);
+    if (options.prompt) formData.append("prompt", options.prompt);
+    if (options.response_format)
+      formData.append("response_format", options.response_format);
+    // Compare against undefined so an explicit temperature of 0 is still sent.
+    if (options.temperature !== undefined)
+      formData.append("temperature", options.temperature.toString());
+
+    console.log("[Request] openai audio transcriptions payload: ", options);
+
+    const controller = new AbortController();
+    options.onController?.(controller);
+
+    try {
+      const transcriptionPath = this.path(
+        OpenaiPath.TranscriptionPath,
+        options.model,
+      );
+      // NOTE(review): the `true` flag presumably keeps getHeaders from setting
+      // a JSON Content-Type so the multipart boundary can be filled in by the
+      // runtime — confirm against getHeaders.
+      const headers = getHeaders(true);
+      const payload = {
+        method: "POST",
+        body: formData,
+        signal: controller.signal,
+        headers: headers,
+      };
+
+      // Abort if no response arrives within REQUEST_TIMEOUT_MS.
+      const requestTimeoutId = setTimeout(
+        () => controller.abort(),
+        REQUEST_TIMEOUT_MS,
+      );
+
+      // Clear the timer whether fetch resolves or rejects, so a failed
+      // request does not leave a pending abort timer behind.
+      const res = await fetch(transcriptionPath, payload).finally(() =>
+        clearTimeout(requestTimeoutId),
+      );
+
+      // Without this check an error response would resolve to `undefined`
+      // (its JSON has no `text` field) instead of rejecting.
+      if (!res.ok) {
+        throw new Error(
+          `transcription request failed: ${res.status} ${res.statusText}`,
+        );
+      }
+
+      // "text", "srt" and "vtt" responses are plain text; parsing them as
+      // JSON would throw, so return the body as-is.
+      const format = options.response_format;
+      if (format === "text" || format === "srt" || format === "vtt") {
+        return await res.text();
+      }
+
+      const json: { text: string } = await res.json();
+      return json.text;
+    } catch (e) {
+      console.log("[Request] failed to make a audio transcriptions request", e);
+      throw e;
+    }
+  }
+
  async chat(options: ChatOptions) {
    const modelConfig = {
      ...useAppConfig.getState().modelConfig,