diff --git a/app/client/api.ts b/app/client/api.ts
index c39504139..a44741d93 100644
--- a/app/client/api.ts
+++ b/app/client/api.ts
@@ -53,6 +53,16 @@ export interface SpeechOptions {
   onController?: (controller: AbortController) => void;
 }
 
+export interface TranscriptionOptions {
+  model?: "whisper-1";
+  file: Blob;
+  language?: string;
+  prompt?: string;
+  response_format?: "json" | "text" | "srt" | "verbose_json" | "vtt";
+  temperature?: number;
+  onController?: (controller: AbortController) => void;
+}
+
 export interface ChatOptions {
   messages: RequestMessage[];
   config: LLMConfig;
@@ -94,6 +104,7 @@ export interface LLMModelProvider {
 export abstract class LLMApi {
   abstract chat(options: ChatOptions): Promise<void>;
   abstract speech(options: SpeechOptions): Promise<ArrayBuffer>;
+  abstract transcription(options: TranscriptionOptions): Promise<string>;
   abstract toolAgentChat(options: AgentChatOptions): Promise<void>;
   abstract usage(): Promise<LLMUsage>;
   abstract models(): Promise<LLMModel[]>;
diff --git a/app/client/platforms/google.ts b/app/client/platforms/google.ts
index 65e609a6e..d18b8966b 100644
--- a/app/client/platforms/google.ts
+++ b/app/client/platforms/google.ts
@@ -7,6 +7,7 @@ import {
   LLMModel,
   LLMUsage,
   SpeechOptions,
+  TranscriptionOptions,
 } from "../api";
 import { useAccessStore, useAppConfig, useChatStore } from "@/app/store";
 import { getClientConfig } from "@/app/config/client";
@@ -18,6 +19,9 @@ import {
 } from "@/app/utils";
 
 export class GeminiProApi implements LLMApi {
+  transcription(options: TranscriptionOptions): Promise<string> {
+    throw new Error("Method not implemented.");
+  }
   speech(options: SpeechOptions): Promise<ArrayBuffer> {
     throw new Error("Method not implemented.");
   }
diff --git a/app/client/platforms/openai.ts b/app/client/platforms/openai.ts
index 4f939323f..243caef61 100644
--- a/app/client/platforms/openai.ts
+++ b/app/client/platforms/openai.ts
@@ -18,6 +18,7 @@ import {
   LLMUsage,
   MultimodalContent,
   SpeechOptions,
+  TranscriptionOptions,
 } from "../api";
 import Locale from "../../locales";
 import {
@@ -124,6 +125,47 @@ export class ChatGPTApi implements LLMApi {
     }
   }
 
+  async transcription(options: TranscriptionOptions): Promise<string> {
+    const formData = new FormData();
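+    // Note: the Blob is uploaded under a fixed "audio.wav" filename, regardless
+    // of the container format the caller actually recorded (e.g. webm/ogg).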
"whisper-1"); + if (options.language) formData.append("language", options.language); + if (options.prompt) formData.append("prompt", options.prompt); + if (options.response_format) + formData.append("response_format", options.response_format); + if (options.temperature) + formData.append("temperature", options.temperature.toString()); + + console.log("[Request] openai audio transcriptions payload: ", options); + + const controller = new AbortController(); + options.onController?.(controller); + + try { + const path = this.path(OpenaiPath.TranscriptionPath, options.model); + const payload = { + method: "POST", + body: formData, + signal: controller.signal, + headers: getHeaders(), + }; + + // make a fetch request + const requestTimeoutId = setTimeout( + () => controller.abort(), + REQUEST_TIMEOUT_MS, + ); + + const res = await fetch(path, payload); + clearTimeout(requestTimeoutId); + const json = await res.json(); + return json.text; + } catch (e) { + console.log("[Request] failed to make a audio transcriptions request", e); + throw e; + } + } + async chat(options: ChatOptions) { const visionModel = isVisionModel(options.config.model); const messages = options.messages.map((v) => ({ diff --git a/app/components/chat.tsx b/app/components/chat.tsx index dffefefc9..0321f9d7f 100644 --- a/app/components/chat.tsx +++ b/app/components/chat.tsx @@ -108,6 +108,11 @@ import { useAllModels } from "../utils/hooks"; import { ClientApi } from "../client/api"; import { createTTSPlayer } from "../utils/audio"; import { MultimodalContent } from "../client/api"; +import { + OpenAITranscriptionApi, + SpeechApi, + WebTranscriptionApi, +} from "../utils/speech"; const ttsPlayer = createTTSPlayer(); @@ -801,17 +806,19 @@ function _Chat() { }; const [isListening, setIsListening] = useState(false); - const [recognition, setRecognition] = useState(null); - const startListening = () => { - if (recognition) { - recognition.start(); + const [speechApi, setSpeechApi] = useState(null); + + const startListening = async () => { + console.log(speechApi); + if (speechApi) { + await speechApi.start(); setIsListening(true); } }; - const stopListening = () => { - if (recognition) { - recognition.stop(); + const stopListening = async () => { + if (speechApi) { + await speechApi.stop(); setIsListening(false); } }; @@ -891,26 +898,11 @@ function _Chat() { } }); // eslint-disable-next-line react-hooks/exhaustive-deps - if (typeof window !== "undefined") { - const SpeechRecognition = - (window as any).SpeechRecognition || - (window as any).webkitSpeechRecognition; - const recognitionInstance = new SpeechRecognition(); - recognitionInstance.continuous = true; - recognitionInstance.interimResults = true; - let lang = getSTTLang(); - recognitionInstance.lang = lang; - recognitionInstance.onresult = (event: any) => { - const result = event.results[event.results.length - 1]; - if (result.isFinal) { - if (!isListening) { - onRecognitionEnd(result[0].transcript); - } - } - }; - - setRecognition(recognitionInstance); - } + setSpeechApi( + new WebTranscriptionApi((transcription) => + onRecognitionEnd(transcription), + ), + ); }, []); // check if should send message @@ -1700,7 +1692,9 @@ function _Chat() { } className={styles["chat-input-send"]} type="primary" - onClick={() => (isListening ? stopListening() : startListening())} + onClick={async () => + isListening ? 
+export abstract class SpeechApi {
+  protected onTranscription: TranscriptionCallback = () => {};
+
+  abstract isListening(): boolean;
+  abstract start(): Promise<void>;
+  abstract stop(): Promise<void>;
+
+  onTranscriptionReceived(callback: TranscriptionCallback) {
+    this.onTranscription = callback;
+  }
+
+  protected async getMediaStream(): Promise<MediaStream | null> {
+    if (navigator.mediaDevices && navigator.mediaDevices.getUserMedia) {
+      return await navigator.mediaDevices.getUserMedia({ audio: true });
+    } else if (navigator.getUserMedia) {
+      return new Promise((resolve, reject) => {
+        navigator.getUserMedia({ audio: true }, resolve, reject);
+      });
+    } else {
+      console.warn("getUserMedia is not supported in this browser");
+      return null;
+    }
+  }
+
+  protected createRecorder(stream: MediaStream): MediaRecorder | null {
+    if (MediaRecorder.isTypeSupported("audio/webm")) {
+      return new MediaRecorder(stream, { mimeType: "audio/webm" });
+    } else if (MediaRecorder.isTypeSupported("audio/ogg")) {
+      return new MediaRecorder(stream, { mimeType: "audio/ogg" });
+    } else {
+      console.warn("MediaRecorder is not supported in this browser");
+      return null;
+    }
+  }
+}
+
+export class OpenAITranscriptionApi extends SpeechApi {
+  private listeningStatus = false;
+  private recorder: MediaRecorder | null = null;
+  private audioChunks: Blob[] = [];
+
+  isListening = () => this.listeningStatus;
+
+  constructor(transcriptionCallback?: TranscriptionCallback) {
+    super();
+    if (transcriptionCallback) {
+      this.onTranscriptionReceived(transcriptionCallback);
+    }
+  }
+
+  async start(): Promise<void> {
+    const stream = await this.getMediaStream();
+    if (!stream) {
+      console.error("Failed to get audio stream");
+      return;
+    }
+
+    this.recorder = this.createRecorder(stream);
+    if (!this.recorder) {
+      console.error("Failed to create MediaRecorder");
+      return;
+    }
+
+    this.audioChunks = [];
+
+    this.recorder.addEventListener("dataavailable", (event) => {
+      this.audioChunks.push(event.data);
+    });
+
+    this.recorder.start();
+    this.listeningStatus = true;
+  }
+
+  async stop(): Promise<void> {
+    if (!this.recorder || !this.listeningStatus) {
+      return;
+    }
+
+    return new Promise((resolve) => {
+      this.recorder!.addEventListener("stop", async () => {
+        const audioBlob = new Blob(this.audioChunks, { type: "audio/wav" });
+        const llm = new ChatGPTApi();
+        const transcription = await llm.transcription({ file: audioBlob });
+        this.onTranscription(transcription);
+        this.listeningStatus = false;
+        resolve();
+      });
+
+      this.recorder!.stop();
+    });
+  }
+}
+
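+// Browser-native recognition via (webkit)SpeechRecognition; final results are
+// forwarded to the registered TranscriptionCallback.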
+export class WebTranscriptionApi extends SpeechApi {
+  private listeningStatus = false;
+  private recognitionInstance: any | null = null;
+
+  isListening = () => this.listeningStatus;
+
+  constructor(transcriptionCallback?: TranscriptionCallback) {
+    super();
+    const SpeechRecognition =
+      (window as any).SpeechRecognition ||
+      (window as any).webkitSpeechRecognition;
+    this.recognitionInstance = new SpeechRecognition();
+    this.recognitionInstance.continuous = true;
+    this.recognitionInstance.interimResults = true;
+    this.recognitionInstance.lang = getSTTLang();
+    if (transcriptionCallback) {
+      this.onTranscriptionReceived(transcriptionCallback);
+    }
+    this.recognitionInstance.onresult = (event: any) => {
+      const result = event.results[event.results.length - 1];
+      if (result.isFinal) {
+        this.onTranscription(result[0].transcript);
+      }
+    };
+  }
+
+  async start(): Promise<void> {
+    await this.recognitionInstance.start();
+    this.listeningStatus = true;
+  }
+
+  async stop(): Promise<void> {
+    await this.recognitionInstance.stop();
+    this.listeningStatus = false;
+  }
+}
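
Usage sketch (not part of the patch): how the OpenAI-backed recorder could be wired in place of the browser recognizer inside an async handler in _Chat. Only names introduced by this diff are used (OpenAITranscriptionApi, onRecognitionEnd); the surrounding component state and button handling are assumed to stay as above.

  // Hypothetical wiring, mirroring the WebTranscriptionApi setup in the useEffect above.
  const speech = new OpenAITranscriptionApi((transcription) =>
    onRecognitionEnd(transcription),
  );

  await speech.start(); // requests the microphone and starts MediaRecorder
  // ...user finishes speaking...
  await speech.stop(); // stops recording, calls ChatGPTApi.transcription(), then fires the callback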