Mirror of https://github.com/Yidadaa/ChatGPT-Next-Web.git (synced 2025-08-08 14:02:08 +08:00)
feat: add tts stt
app/utils/audio.ts (new file, 45 lines)
@@ -0,0 +1,45 @@
type TTSPlayer = {
  init: () => void;
  play: (audioBuffer: ArrayBuffer, onended: () => void | null) => Promise<void>;
  stop: () => void;
};

export function createTTSPlayer(): TTSPlayer {
  let audioContext: AudioContext | null = null;
  let audioBufferSourceNode: AudioBufferSourceNode | null = null;

  const init = () => {
    audioContext = new (window.AudioContext || window.webkitAudioContext)();
    audioContext.suspend();
  };

  const play = async (audioBuffer: ArrayBuffer, onended: () => void | null) => {
    if (audioBufferSourceNode) {
      audioBufferSourceNode.stop();
      audioBufferSourceNode.disconnect();
    }

    const buffer = await audioContext!.decodeAudioData(audioBuffer);
    audioBufferSourceNode = audioContext!.createBufferSource();
    audioBufferSourceNode.buffer = buffer;
    audioBufferSourceNode.connect(audioContext!.destination);
    audioContext!.resume().then(() => {
      audioBufferSourceNode!.start();
    });
    audioBufferSourceNode.onended = onended;
  };

  const stop = () => {
    if (audioBufferSourceNode) {
      audioBufferSourceNode.stop();
      audioBufferSourceNode.disconnect();
      audioBufferSourceNode = null;
    }
    if (audioContext) {
      audioContext.close();
      audioContext = null;
    }
  };

  return { init, play, stop };
}
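Usage note (not part of this diff): a minimal sketch of how the player added above might be driven from calling code; the relative import path is an assumption.

import { createTTSPlayer } from "./audio";

const player = createTTSPlayer();

async function speak(audioData: ArrayBuffer) {
  // init() should run after a user gesture so the suspended AudioContext may resume
  player.init();
  await player.play(audioData, () => {
    // onended: playback finished, release the AudioContext
    player.stop();
  });
}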
app/utils/ms_edge_tts.ts (new file, 391 lines)
@@ -0,0 +1,391 @@
// import axios from "axios";
import { Buffer } from "buffer";
import { randomBytes } from "crypto";
import { Readable } from "stream";

// Modified according to https://github.com/Migushthe2nd/MsEdgeTTS

/**
 * https://learn.microsoft.com/en-us/azure/ai-services/speech-service/speech-synthesis-markup-voice#:~:text=Optional-,volume,-Indicates%20the%20volume
 */
export enum VOLUME {
  SILENT = "silent",
  X_SOFT = "x-soft",
  SOFT = "soft",
  MEDIUM = "medium",
  LOUD = "loud",
  X_LOUD = "x-LOUD",
  DEFAULT = "default",
}

/**
 * https://learn.microsoft.com/en-us/azure/ai-services/speech-service/speech-synthesis-markup-voice#:~:text=Optional-,rate,-Indicates%20the%20speaking
 */
export enum RATE {
  X_SLOW = "x-slow",
  SLOW = "slow",
  MEDIUM = "medium",
  FAST = "fast",
  X_FAST = "x-fast",
  DEFAULT = "default",
}

/**
 * https://learn.microsoft.com/en-us/azure/ai-services/speech-service/speech-synthesis-markup-voice#:~:text=Optional-,pitch,-Indicates%20the%20baseline
 */
export enum PITCH {
  X_LOW = "x-low",
  LOW = "low",
  MEDIUM = "medium",
  HIGH = "high",
  X_HIGH = "x-high",
  DEFAULT = "default",
}

/**
 * Only a few of the [possible formats](https://docs.microsoft.com/en-us/azure/cognitive-services/speech-service/rest-text-to-speech#audio-outputs) are accepted.
 */
export enum OUTPUT_FORMAT {
  // Streaming =============================
  // AMR_WB_16000HZ = "amr-wb-16000hz",
  // AUDIO_16KHZ_16BIT_32KBPS_MONO_OPUS = "audio-16khz-16bit-32kbps-mono-opus",
  // AUDIO_16KHZ_32KBITRATE_MONO_MP3 = "audio-16khz-32kbitrate-mono-mp3",
  // AUDIO_16KHZ_64KBITRATE_MONO_MP3 = "audio-16khz-64kbitrate-mono-mp3",
  // AUDIO_16KHZ_128KBITRATE_MONO_MP3 = "audio-16khz-128kbitrate-mono-mp3",
  // AUDIO_24KHZ_16BIT_24KBPS_MONO_OPUS = "audio-24khz-16bit-24kbps-mono-opus",
  // AUDIO_24KHZ_16BIT_48KBPS_MONO_OPUS = "audio-24khz-16bit-48kbps-mono-opus",
  AUDIO_24KHZ_48KBITRATE_MONO_MP3 = "audio-24khz-48kbitrate-mono-mp3",
  AUDIO_24KHZ_96KBITRATE_MONO_MP3 = "audio-24khz-96kbitrate-mono-mp3",
  // AUDIO_24KHZ_160KBITRATE_MONO_MP3 = "audio-24khz-160kbitrate-mono-mp3",
  // AUDIO_48KHZ_96KBITRATE_MONO_MP3 = "audio-48khz-96kbitrate-mono-mp3",
  // AUDIO_48KHZ_192KBITRATE_MONO_MP3 = "audio-48khz-192kbitrate-mono-mp3",
  // OGG_16KHZ_16BIT_MONO_OPUS = "ogg-16khz-16bit-mono-opus",
  // OGG_24KHZ_16BIT_MONO_OPUS = "ogg-24khz-16bit-mono-opus",
  // OGG_48KHZ_16BIT_MONO_OPUS = "ogg-48khz-16bit-mono-opus",
  // RAW_8KHZ_8BIT_MONO_ALAW = "raw-8khz-8bit-mono-alaw",
  // RAW_8KHZ_8BIT_MONO_MULAW = "raw-8khz-8bit-mono-mulaw",
  // RAW_8KHZ_16BIT_MONO_PCM = "raw-8khz-16bit-mono-pcm",
  // RAW_16KHZ_16BIT_MONO_PCM = "raw-16khz-16bit-mono-pcm",
  // RAW_16KHZ_16BIT_MONO_TRUESILK = "raw-16khz-16bit-mono-truesilk",
  // RAW_22050HZ_16BIT_MONO_PCM = "raw-22050hz-16bit-mono-pcm",
  // RAW_24KHZ_16BIT_MONO_PCM = "raw-24khz-16bit-mono-pcm",
  // RAW_24KHZ_16BIT_MONO_TRUESILK = "raw-24khz-16bit-mono-truesilk",
  // RAW_44100HZ_16BIT_MONO_PCM = "raw-44100hz-16bit-mono-pcm",
  // RAW_48KHZ_16BIT_MONO_PCM = "raw-48khz-16bit-mono-pcm",
  // WEBM_16KHZ_16BIT_MONO_OPUS = "webm-16khz-16bit-mono-opus",
  // WEBM_24KHZ_16BIT_24KBPS_MONO_OPUS = "webm-24khz-16bit-24kbps-mono-opus",
  WEBM_24KHZ_16BIT_MONO_OPUS = "webm-24khz-16bit-mono-opus",
  // Non-streaming =============================
  // RIFF_8KHZ_8BIT_MONO_ALAW = "riff-8khz-8bit-mono-alaw",
  // RIFF_8KHZ_8BIT_MONO_MULAW = "riff-8khz-8bit-mono-mulaw",
  // RIFF_8KHZ_16BIT_MONO_PCM = "riff-8khz-16bit-mono-pcm",
  // RIFF_22050HZ_16BIT_MONO_PCM = "riff-22050hz-16bit-mono-pcm",
  // RIFF_24KHZ_16BIT_MONO_PCM = "riff-24khz-16bit-mono-pcm",
  // RIFF_44100HZ_16BIT_MONO_PCM = "riff-44100hz-16bit-mono-pcm",
  // RIFF_48KHZ_16BIT_MONO_PCM = "riff-48khz-16bit-mono-pcm",
}

export type Voice = {
  Name: string;
  ShortName: string;
  Gender: string;
  Locale: string;
  SuggestedCodec: string;
  FriendlyName: string;
  Status: string;
};

export class ProsodyOptions {
  /**
   * The pitch to use.
   * Can be any {@link PITCH}, or a relative frequency in Hz (+50Hz), a relative semitone (+2st), or a relative percentage (+50%).
   * [SSML documentation](https://learn.microsoft.com/en-us/azure/ai-services/speech-service/speech-synthesis-markup-voice#:~:text=Optional-,pitch,-Indicates%20the%20baseline)
   */
  pitch?: PITCH | string = "+0Hz";
  /**
   * The rate to use.
   * Can be any {@link RATE}, or a relative number (0.5), or string with a relative percentage (+50%).
   * [SSML documentation](https://learn.microsoft.com/en-us/azure/ai-services/speech-service/speech-synthesis-markup-voice#:~:text=Optional-,rate,-Indicates%20the%20speaking)
   */
  rate?: RATE | string | number = 1.0;
  /**
   * The volume to use.
   * Can be any {@link VOLUME}, or an absolute number (0, 100), a string with a relative number (+50), or a relative percentage (+50%).
   * [SSML documentation](https://learn.microsoft.com/en-us/azure/ai-services/speech-service/speech-synthesis-markup-voice#:~:text=Optional-,volume,-Indicates%20the%20volume)
   */
  volume?: VOLUME | string | number = 100.0;
}

export class MsEdgeTTS {
  static OUTPUT_FORMAT = OUTPUT_FORMAT;
  private static TRUSTED_CLIENT_TOKEN = "6A5AA1D4EAFF4E9FB37E23D68491D6F4";
  private static VOICES_URL = `https://speech.platform.bing.com/consumer/speech/synthesize/readaloud/voices/list?trustedclienttoken=${MsEdgeTTS.TRUSTED_CLIENT_TOKEN}`;
  private static SYNTH_URL = `wss://speech.platform.bing.com/consumer/speech/synthesize/readaloud/edge/v1?TrustedClientToken=${MsEdgeTTS.TRUSTED_CLIENT_TOKEN}`;
  private static BINARY_DELIM = "Path:audio\r\n";
  private static VOICE_LANG_REGEX = /\w{2}-\w{2}/;
  private readonly _enableLogger;
  private _ws: WebSocket | undefined;
  private _voice: any;
  private _voiceLocale: any;
  private _outputFormat: any;
  private _streams: { [key: string]: Readable } = {};
  private _startTime = 0;

  private _log(...o: any[]) {
    if (this._enableLogger) {
      console.log(...o);
    }
  }

  /**
   * Create a new `MsEdgeTTS` instance.
   *
   * @param agent (optional, **NOT SUPPORTED IN BROWSER**) Use a custom http.Agent implementation like [https-proxy-agent](https://github.com/TooTallNate/proxy-agents) or [socks-proxy-agent](https://github.com/TooTallNate/proxy-agents/tree/main/packages/socks-proxy-agent).
   * @param enableLogger=false whether to enable the built-in logger. This logs connection inits, disconnects, and incoming data to the console
   */
  public constructor(enableLogger: boolean = false) {
    this._enableLogger = enableLogger;
  }

  private async _send(message: any) {
    for (let i = 1; i <= 3 && this._ws!.readyState !== this._ws!.OPEN; i++) {
      if (i == 1) {
        this._startTime = Date.now();
      }
      this._log("connecting: ", i);
      await this._initClient();
    }
    this._ws!.send(message);
  }

  private _initClient() {
    this._ws = new WebSocket(MsEdgeTTS.SYNTH_URL);

    this._ws.binaryType = "arraybuffer";
    return new Promise((resolve, reject) => {
      this._ws!.onopen = () => {
        this._log(
          "Connected in",
          (Date.now() - this._startTime) / 1000,
          "seconds",
        );
        this._send(
          `Content-Type:application/json; charset=utf-8\r\nPath:speech.config\r\n\r\n
          {
            "context": {
              "synthesis": {
                "audio": {
                  "metadataoptions": {
                    "sentenceBoundaryEnabled": "false",
                    "wordBoundaryEnabled": "false"
                  },
                  "outputFormat": "${this._outputFormat}"
                }
              }
            }
          }
        `,
        ).then(resolve);
      };
      this._ws!.onmessage = (m: any) => {
        const buffer = Buffer.from(m.data as ArrayBuffer);
        const message = buffer.toString();
        const requestId = /X-RequestId:(.*?)\r\n/gm.exec(message)![1];
        if (message.includes("Path:turn.start")) {
          // start of turn, ignore
        } else if (message.includes("Path:turn.end")) {
          // end of turn, close stream
          this._streams[requestId].push(null);
        } else if (message.includes("Path:response")) {
          // context response, ignore
        } else if (
          message.includes("Path:audio") &&
          m.data instanceof ArrayBuffer
        ) {
          this._pushAudioData(buffer, requestId);
        } else {
          this._log("UNKNOWN MESSAGE", message);
        }
      };
      this._ws!.onclose = () => {
        this._log(
          "disconnected after:",
          (Date.now() - this._startTime) / 1000,
          "seconds",
        );
        for (const requestId in this._streams) {
          this._streams[requestId].push(null);
        }
      };
      this._ws!.onerror = function (error: any) {
        reject("Connect Error: " + error);
      };
    });
  }

  private _pushAudioData(audioBuffer: Buffer, requestId: string) {
    const audioStartIndex =
      audioBuffer.indexOf(MsEdgeTTS.BINARY_DELIM) +
      MsEdgeTTS.BINARY_DELIM.length;
    const audioData = audioBuffer.subarray(audioStartIndex);
    this._streams[requestId].push(audioData);
    this._log("received audio chunk, size: ", audioData?.length);
  }

  private _SSMLTemplate(input: string, options: ProsodyOptions = {}): string {
    // in case future updates to the edge API block these elements, we'll be concatenating strings.
    options = { ...new ProsodyOptions(), ...options };
    return `<speak version="1.0" xmlns="http://www.w3.org/2001/10/synthesis" xmlns:mstts="https://www.w3.org/2001/mstts" xml:lang="${this._voiceLocale}">
        <voice name="${this._voice}">
            <prosody pitch="${options.pitch}" rate="${options.rate}" volume="${options.volume}">
                ${input}
            </prosody>
        </voice>
    </speak>`;
  }

  /**
   * Fetch the list of voices available in Microsoft Edge.
   * These, however, are not all. The complete list of voices supported by this module [can be found here](https://docs.microsoft.com/en-us/azure/cognitive-services/speech-service/language-support) (neural, standard, and preview).
   */
  // getVoices(): Promise<Voice[]> {
  //   return new Promise((resolve, reject) => {
  //     axios
  //       .get(MsEdgeTTS.VOICES_URL)
  //       .then((res) => resolve(res.data))
  //       .catch(reject);
  //   });
  // }
  getVoices(): Promise<Voice[]> {
    return fetch(MsEdgeTTS.VOICES_URL)
      .then((response) => {
        if (!response.ok) {
          throw new Error("Network response was not ok");
        }
        return response.json();
      })
      .then((data) => data as Voice[])
      .catch((error) => {
        throw error;
      });
  }

  /**
   * Sets the required information for the speech to be synthesised and inits a new WebSocket connection.
   * Must be called at least once before text can be synthesised.
   * Saved in this instance. Can be called at any time to update the metadata.
   *
   * @param voiceName a string with any `ShortName`. A list of all available neural voices can be found [here](https://docs.microsoft.com/en-us/azure/cognitive-services/speech-service/language-support#neural-voices). However, it is not limited to neural voices: standard voices can also be used. A list of standard voices can be found [here](https://docs.microsoft.com/en-us/azure/cognitive-services/speech-service/language-support#standard-voices)
   * @param outputFormat any {@link OUTPUT_FORMAT}
   * @param voiceLocale (optional) any voice locale that is supported by the voice. See the list of all voices for compatibility. If not provided, the locale will be inferred from the `voiceName`
   */
  async setMetadata(
    voiceName: string,
    outputFormat: OUTPUT_FORMAT,
    voiceLocale?: string,
  ) {
    const oldVoice = this._voice;
    const oldVoiceLocale = this._voiceLocale;
    const oldOutputFormat = this._outputFormat;

    this._voice = voiceName;
    this._voiceLocale = voiceLocale;
    if (!this._voiceLocale) {
      const voiceLangMatch = MsEdgeTTS.VOICE_LANG_REGEX.exec(this._voice);
      if (!voiceLangMatch)
        throw new Error("Could not infer voiceLocale from voiceName!");
      this._voiceLocale = voiceLangMatch[0];
    }
    this._outputFormat = outputFormat;

    const changed =
      oldVoice !== this._voice ||
      oldVoiceLocale !== this._voiceLocale ||
      oldOutputFormat !== this._outputFormat;

    // create new client
    if (changed || this._ws!.readyState !== this._ws!.OPEN) {
      this._startTime = Date.now();
      await this._initClient();
    }
  }

  private _metadataCheck() {
    if (!this._ws)
      throw new Error(
        "Speech synthesis not configured yet. Run setMetadata before calling toStream or toFile.",
      );
  }

  /**
   * Close the WebSocket connection.
   */
  close() {
    this._ws!.close();
  }

  /**
   * Writes raw audio synthesised from text in real-time to a {@link Readable}. Uses a basic {@link _SSMLTemplate SSML template}.
   *
   * @param input the text to synthesise. Can include SSML elements.
   * @param options (optional) {@link ProsodyOptions}
   * @returns {Readable} - a `stream.Readable` with the audio data
   */
  toStream(input: string, options?: ProsodyOptions): Readable {
    const { stream } = this._rawSSMLRequest(this._SSMLTemplate(input, options));
    return stream;
  }

  toArrayBuffer(input: string, options?: ProsodyOptions): Promise<ArrayBuffer> {
    return new Promise((resolve, reject) => {
      let data: Uint8Array[] = [];
      const readable = this.toStream(input, options);
      readable.on("data", (chunk) => {
        data.push(chunk);
      });

      readable.on("end", () => {
        resolve(Buffer.concat(data).buffer);
      });

      readable.on("error", (err) => {
        reject(err);
      });
    });
  }

  /**
   * Writes raw audio synthesised from a request in real-time to a {@link Readable}. Has no SSML template. Basic SSML should be provided in the request.
   *
   * @param requestSSML the SSML to send. SSML elements are required for it to work.
   * @returns {Readable} - a `stream.Readable` with the audio data
   */
  rawToStream(requestSSML: string): Readable {
    const { stream } = this._rawSSMLRequest(requestSSML);
    return stream;
  }

  private _rawSSMLRequest(requestSSML: string): {
    stream: Readable;
    requestId: string;
  } {
    this._metadataCheck();

    const requestId = randomBytes(16).toString("hex");
    const request =
      `X-RequestId:${requestId}\r\nContent-Type:application/ssml+xml\r\nPath:ssml\r\n\r\n
    ` + requestSSML.trim();
    // https://docs.microsoft.com/en-us/azure/cognitive-services/speech-service/speech-synthesis-markup
    const self = this;
    const stream = new Readable({
      read() {},
      destroy(error: Error | null, callback: (error: Error | null) => void) {
        delete self._streams[requestId];
        callback(error);
      },
    });
    this._streams[requestId] = stream;
    this._send(request).then();
    return { stream, requestId };
  }
}
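Usage note (not part of this diff): a sketch of how the two new modules could be wired together in the browser, synthesizing with MsEdgeTTS and playing the result with createTTSPlayer. The voice ShortName and the relative import paths are example assumptions.

import { MsEdgeTTS, OUTPUT_FORMAT } from "./ms_edge_tts";
import { createTTSPlayer } from "./audio";

const player = createTTSPlayer();

async function speakWithEdgeTTS(text: string) {
  const tts = new MsEdgeTTS();
  // "en-US-AriaNeural" is an example voice; any ShortName from getVoices() works
  await tts.setMetadata(
    "en-US-AriaNeural",
    OUTPUT_FORMAT.AUDIO_24KHZ_96KBITRATE_MONO_MP3,
  );
  const audio = await tts.toArrayBuffer(text, { rate: 1.0, volume: 100 });
  player.init();
  await player.play(audio, () => player.stop());
}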
app/utils/speech.ts (new file, 126 lines)
@@ -0,0 +1,126 @@
import { ChatGPTApi } from "../client/platforms/openai";
import { getSTTLang } from "../locales";
import { isFirefox } from "../utils";

export type TranscriptionCallback = (transcription: string) => void;

export abstract class SpeechApi {
  protected onTranscription: TranscriptionCallback = () => {};

  abstract isListening(): boolean;
  abstract start(): Promise<void>;
  abstract stop(): Promise<void>;

  onTranscriptionReceived(callback: TranscriptionCallback) {
    this.onTranscription = callback;
  }
}

export class OpenAITranscriptionApi extends SpeechApi {
  private listeningStatus = false;
  private mediaRecorder: MediaRecorder | null = null;
  private stream: MediaStream | null = null;
  private audioChunks: Blob[] = [];

  isListening = () => this.listeningStatus;

  constructor(transcriptionCallback?: TranscriptionCallback) {
    super();
    if (transcriptionCallback) {
      this.onTranscriptionReceived(transcriptionCallback);
    }
  }

  async start(): Promise<void> {
    // @ts-ignore
    navigator.getUserMedia =
      // @ts-ignore
      navigator.getUserMedia ||
      // @ts-ignore
      navigator.webkitGetUserMedia ||
      // @ts-ignore
      navigator.mozGetUserMedia ||
      // @ts-ignore
      navigator.msGetUserMedia;
    if (navigator.mediaDevices) {
      const stream = await navigator.mediaDevices.getUserMedia({ audio: true });
      this.mediaRecorder = new MediaRecorder(stream);
      this.mediaRecorder.ondataavailable = (e) => {
        if (e.data && e.data.size > 0) {
          this.audioChunks.push(e.data);
        }
      };

      this.stream = stream;
    } else {
      console.warn("Media Devices will work only with SSL");
      return;
    }

    this.audioChunks = [];

    // this.recorder.addEventListener("dataavailable", (event) => {
    //   this.audioChunks.push(event.data);
    // });

    this.mediaRecorder.start(1000);
    this.listeningStatus = true;
  }

  async stop(): Promise<void> {
    if (!this.mediaRecorder || !this.listeningStatus) {
      return;
    }

    return new Promise((resolve) => {
      this.mediaRecorder!.addEventListener("stop", async () => {
        const audioBlob = new Blob(this.audioChunks, { type: "audio/wav" });
        const llm = new ChatGPTApi();
        const transcription = await llm.transcription({ file: audioBlob });
        this.onTranscription(transcription);
        this.listeningStatus = false;
        resolve();
      });

      this.mediaRecorder!.stop();
    });
  }
}

export class WebTranscriptionApi extends SpeechApi {
  private listeningStatus = false;
  private recognitionInstance: any | null = null;

  isListening = () => this.listeningStatus;

  constructor(transcriptionCallback?: TranscriptionCallback) {
    super();
    if (isFirefox()) return;
    const SpeechRecognition =
      (window as any).SpeechRecognition ||
      (window as any).webkitSpeechRecognition;
    this.recognitionInstance = new SpeechRecognition();
    this.recognitionInstance.continuous = true;
    this.recognitionInstance.interimResults = true;
    this.recognitionInstance.lang = getSTTLang();
    if (transcriptionCallback) {
      this.onTranscriptionReceived(transcriptionCallback);
    }
    this.recognitionInstance.onresult = (event: any) => {
      const result = event.results[event.results.length - 1];
      if (result.isFinal) {
        this.onTranscription(result[0].transcript);
      }
    };
  }

  async start(): Promise<void> {
    this.listeningStatus = true;
    await this.recognitionInstance.start();
  }

  async stop(): Promise<void> {
    this.listeningStatus = false;
    await this.recognitionInstance.stop();
  }
}
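Usage note (not part of this diff): a sketch of choosing between the two STT backends above. The Firefox check mirrors the isFirefox() guard in WebTranscriptionApi's constructor; the import paths are assumptions.

import { OpenAITranscriptionApi, WebTranscriptionApi, SpeechApi } from "./speech";
import { isFirefox } from "../utils";

// Prefer the browser's built-in SpeechRecognition; fall back to OpenAI
// transcription where it is unavailable (e.g. Firefox).
const stt: SpeechApi = isFirefox()
  ? new OpenAITranscriptionApi()
  : new WebTranscriptionApi();

stt.onTranscriptionReceived((text) => {
  console.log("[STT]", text);
});

async function toggleRecording() {
  if (stt.isListening()) {
    await stt.stop();
  } else {
    await stt.start();
  }
}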