diff --git a/app/client/api.ts b/app/client/api.ts
index c39504139..a44741d93 100644
--- a/app/client/api.ts
+++ b/app/client/api.ts
@@ -53,6 +53,16 @@ export interface SpeechOptions {
   onController?: (controller: AbortController) => void;
 }
 
+export interface TranscriptionOptions {
+  model?: "whisper-1";
+  file: Blob;
+  language?: string;
+  prompt?: string;
+  response_format?: "json" | "text" | "srt" | "verbose_json" | "vtt";
+  temperature?: number;
+  onController?: (controller: AbortController) => void;
+}
+
 export interface ChatOptions {
   messages: RequestMessage[];
   config: LLMConfig;
@@ -94,6 +104,7 @@ export interface LLMModelProvider {
 export abstract class LLMApi {
   abstract chat(options: ChatOptions): Promise<void>;
   abstract speech(options: SpeechOptions): Promise<ArrayBuffer>;
+  abstract transcription(options: TranscriptionOptions): Promise<string>;
   abstract toolAgentChat(options: AgentChatOptions): Promise<void>;
   abstract usage(): Promise<LLMUsage>;
   abstract models(): Promise<LLMModel[]>;
diff --git a/app/client/platforms/google.ts b/app/client/platforms/google.ts
index 65e609a6e..d18b8966b 100644
--- a/app/client/platforms/google.ts
+++ b/app/client/platforms/google.ts
@@ -7,6 +7,7 @@ import {
   LLMModel,
   LLMUsage,
   SpeechOptions,
+  TranscriptionOptions,
 } from "../api";
 import { useAccessStore, useAppConfig, useChatStore } from "@/app/store";
 import { getClientConfig } from "@/app/config/client";
@@ -18,6 +19,9 @@ import {
 } from "@/app/utils";
 
 export class GeminiProApi implements LLMApi {
+  transcription(options: TranscriptionOptions): Promise<string> {
+    throw new Error("Method not implemented.");
+  }
   speech(options: SpeechOptions): Promise<ArrayBuffer> {
     throw new Error("Method not implemented.");
   }
diff --git a/app/client/platforms/openai.ts b/app/client/platforms/openai.ts
index 4f939323f..243caef61 100644
--- a/app/client/platforms/openai.ts
+++ b/app/client/platforms/openai.ts
@@ -18,6 +18,7 @@ import {
   LLMUsage,
   MultimodalContent,
   SpeechOptions,
+  TranscriptionOptions,
 } from "../api";
 import Locale from "../../locales";
 import {
@@ -124,6 +125,47 @@ export class ChatGPTApi implements LLMApi {
     }
   }
 
+  async transcription(options: TranscriptionOptions): Promise<string> {
+    const formData = new FormData();
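+    // Note: the Blob is uploaded under a fixed "audio.wav" filename, regardless
+    // of the container format the caller actually recorded (e.g. webm/ogg).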
"whisper-1"); + if (options.language) formData.append("language", options.language); + if (options.prompt) formData.append("prompt", options.prompt); + if (options.response_format) + formData.append("response_format", options.response_format); + if (options.temperature) + formData.append("temperature", options.temperature.toString()); + + console.log("[Request] openai audio transcriptions payload: ", options); + + const controller = new AbortController(); + options.onController?.(controller); + + try { + const path = this.path(OpenaiPath.TranscriptionPath, options.model); + const payload = { + method: "POST", + body: formData, + signal: controller.signal, + headers: getHeaders(), + }; + + // make a fetch request + const requestTimeoutId = setTimeout( + () => controller.abort(), + REQUEST_TIMEOUT_MS, + ); + + const res = await fetch(path, payload); + clearTimeout(requestTimeoutId); + const json = await res.json(); + return json.text; + } catch (e) { + console.log("[Request] failed to make a audio transcriptions request", e); + throw e; + } + } + async chat(options: ChatOptions) { const visionModel = isVisionModel(options.config.model); const messages = options.messages.map((v) => ({ diff --git a/app/components/chat.tsx b/app/components/chat.tsx index dffefefc9..0321f9d7f 100644 --- a/app/components/chat.tsx +++ b/app/components/chat.tsx @@ -108,6 +108,11 @@ import { useAllModels } from "../utils/hooks"; import { ClientApi } from "../client/api"; import { createTTSPlayer } from "../utils/audio"; import { MultimodalContent } from "../client/api"; +import { + OpenAITranscriptionApi, + SpeechApi, + WebTranscriptionApi, +} from "../utils/speech"; const ttsPlayer = createTTSPlayer(); @@ -801,17 +806,19 @@ function _Chat() { }; const [isListening, setIsListening] = useState(false); - const [recognition, setRecognition] = useState(null); - const startListening = () => { - if (recognition) { - recognition.start(); + const [speechApi, setSpeechApi] = useState(null); + + const startListening = async () => { + console.log(speechApi); + if (speechApi) { + await speechApi.start(); setIsListening(true); } }; - const stopListening = () => { - if (recognition) { - recognition.stop(); + const stopListening = async () => { + if (speechApi) { + await speechApi.stop(); setIsListening(false); } }; @@ -891,26 +898,11 @@ function _Chat() { } }); // eslint-disable-next-line react-hooks/exhaustive-deps - if (typeof window !== "undefined") { - const SpeechRecognition = - (window as any).SpeechRecognition || - (window as any).webkitSpeechRecognition; - const recognitionInstance = new SpeechRecognition(); - recognitionInstance.continuous = true; - recognitionInstance.interimResults = true; - let lang = getSTTLang(); - recognitionInstance.lang = lang; - recognitionInstance.onresult = (event: any) => { - const result = event.results[event.results.length - 1]; - if (result.isFinal) { - if (!isListening) { - onRecognitionEnd(result[0].transcript); - } - } - }; - - setRecognition(recognitionInstance); - } + setSpeechApi( + new WebTranscriptionApi((transcription) => + onRecognitionEnd(transcription), + ), + ); }, []); // check if should send message @@ -1700,7 +1692,9 @@ function _Chat() { } className={styles["chat-input-send"]} type="primary" - onClick={() => (isListening ? stopListening() : startListening())} + onClick={async () => + isListening ? 
+export abstract class SpeechApi {
+  protected onTranscription: TranscriptionCallback = () => {};
+
+  abstract isListening(): boolean;
+  abstract start(): Promise<void>;
+  abstract stop(): Promise<void>;
+
+  onTranscriptionReceived(callback: TranscriptionCallback) {
+    this.onTranscription = callback;
+  }
+
+  protected async getMediaStream(): Promise<MediaStream | null> {
+    if (navigator.mediaDevices && navigator.mediaDevices.getUserMedia) {
+      return await navigator.mediaDevices.getUserMedia({ audio: true });
+    } else if (navigator.getUserMedia) {
+      return new Promise((resolve, reject) => {
+        navigator.getUserMedia({ audio: true }, resolve, reject);
+      });
+    } else {
+      console.warn("getUserMedia is not supported in this browser");
+      return null;
+    }
+  }
+
+  protected createRecorder(stream: MediaStream): MediaRecorder | null {
+    if (MediaRecorder.isTypeSupported("audio/webm")) {
+      return new MediaRecorder(stream, { mimeType: "audio/webm" });
+    } else if (MediaRecorder.isTypeSupported("audio/ogg")) {
+      return new MediaRecorder(stream, { mimeType: "audio/ogg" });
+    } else {
+      console.warn("MediaRecorder is not supported in this browser");
+      return null;
+    }
+  }
+}
+
+export class OpenAITranscriptionApi extends SpeechApi {
+  private listeningStatus = false;
+  private recorder: MediaRecorder | null = null;
+  private audioChunks: Blob[] = [];
+
+  isListening = () => this.listeningStatus;
+
+  constructor(transcriptionCallback?: TranscriptionCallback) {
+    super();
+    if (transcriptionCallback) {
+      this.onTranscriptionReceived(transcriptionCallback);
+    }
+  }
+
+  async start(): Promise<void> {
+    const stream = await this.getMediaStream();
+    if (!stream) {
+      console.error("Failed to get audio stream");
+      return;
+    }
+
+    this.recorder = this.createRecorder(stream);
+    if (!this.recorder) {
+      console.error("Failed to create MediaRecorder");
+      return;
+    }
+
+    this.audioChunks = [];
+
+    this.recorder.addEventListener("dataavailable", (event) => {
+      this.audioChunks.push(event.data);
+    });
+
+    this.recorder.start();
+    this.listeningStatus = true;
+  }
+
+  async stop(): Promise<void> {
+    if (!this.recorder || !this.listeningStatus) {
+      return;
+    }
+
+    return new Promise((resolve) => {
+      this.recorder!.addEventListener("stop", async () => {
+        const audioBlob = new Blob(this.audioChunks, { type: "audio/wav" });
+        const llm = new ChatGPTApi();
+        const transcription = await llm.transcription({ file: audioBlob });
+        this.onTranscription(transcription);
+        this.listeningStatus = false;
+        resolve();
+      });
+
+      this.recorder!.stop();
+    });
+  }
+}
+
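+// Browser-native recognition via (webkit)SpeechRecognition; final results are
+// forwarded to the registered TranscriptionCallback.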
+export class WebTranscriptionApi extends SpeechApi {
+  private listeningStatus = false;
+  private recognitionInstance: any | null = null;
+
+  isListening = () => this.listeningStatus;
+
+  constructor(transcriptionCallback?: TranscriptionCallback) {
+    super();
+    const SpeechRecognition =
+      (window as any).SpeechRecognition ||
+      (window as any).webkitSpeechRecognition;
+    this.recognitionInstance = new SpeechRecognition();
+    this.recognitionInstance.continuous = true;
+    this.recognitionInstance.interimResults = true;
+    this.recognitionInstance.lang = getSTTLang();
+    if (transcriptionCallback) {
+      this.onTranscriptionReceived(transcriptionCallback);
+    }
+    this.recognitionInstance.onresult = (event: any) => {
+      const result = event.results[event.results.length - 1];
+      if (result.isFinal) {
+        this.onTranscription(result[0].transcript);
+      }
+    };
+  }
+
+  async start(): Promise<void> {
+    await this.recognitionInstance.start();
+    this.listeningStatus = true;
+  }
+
+  async stop(): Promise<void> {
+    await this.recognitionInstance.stop();
+    this.listeningStatus = false;
+  }
+}
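
Usage sketch (not part of the patch): how the OpenAI-backed recorder could be wired in place of the browser recognizer inside an async handler in _Chat. Only names introduced by this diff are used (OpenAITranscriptionApi, onRecognitionEnd); the surrounding component state and button handling are assumed to stay as above.

  // Hypothetical wiring, mirroring the WebTranscriptionApi setup in the useEffect above.
  const speech = new OpenAITranscriptionApi((transcription) =>
    onRecognitionEnd(transcription),
  );

  await speech.start(); // requests the microphone and starts MediaRecorder
  // ...user finishes speaking...
  await speech.stop(); // stops recording, calls ChatGPTApi.transcription(), then fires the callback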