// Source: public/browser-realtime-webrtc.js — Paul-Borisov/minimal-azure-openai-express-html-with-streaming (branch: main), GitHub
(async () => {
  /**
   * Starts a realtime voice session with an OpenAI realtime model over WebRTC.
   *
   * Steps: render the initial prompt, fetch an ephemeral key from
   * `/api/openai/session`, open an RTCPeerConnection that sends the microphone
   * and plays the model's audio, exchange SDP with the OpenAI realtime
   * endpoint, and stream transcripts to the page through the supplied
   * callbacks. When setup succeeds, `window.stopRealtimeSession` is replaced
   * with a function that tears this session down.
   *
   * @param {object} options
   * @param {string} options.model - Realtime model name; names starting with
   *   `gpt-realtime` or `gpt-5-` are treated as GA models.
   * @param {string} options.prompt - Initial text sent as both the response
   *   instructions and the first user message.
   * @param {Array<{role: string, content: string}>} options.chatHistory -
   *   Caller-owned history; a user/assistant pair is appended on every
   *   `response.content_part.done` event.
   * @param {*} options.thinkingHeader - Placeholder rendered as the only output
   *   chunk until the first assistant delta arrives.
   * @param {function(): *} options.getFormattedChatHistory - Returns the
   *   rendered chat history.
   * @param {function(string, boolean): *} options.getFormattedOutput - Renders
   *   a piece of text; always called here with `false` as second argument
   *   (NOTE(review): meaning of the flag is defined by the caller).
   * @param {function(*, *, Array): void} options.updateRealtimeRootInnerHtml -
   *   Receives (formatted history, formatted user request, output chunks).
   * @returns {Promise<void>} Resolves once the remote SDP answer is applied.
   * @throws {Error} When the token endpoint or the SDP endpoint answers with a
   *   non-2xx status, or when no ephemeral key is returned. Any failure after
   *   the peer connection exists releases the acquired resources and rethrows.
   */
  async function startRealtimeSession({
    model,
    prompt,
    chatHistory,
    thinkingHeader,
    getFormattedChatHistory,
    getFormattedOutput,
    updateRealtimeRootInnerHtml
  }) {
    // Latest completed transcript of the user's speech; the typed prompt stands in until one arrives.
    let userVoiceTranscript = prompt;
    let formattedChatHistory = getFormattedChatHistory();
    let formattedUserRequest = getFormattedOutput(prompt, false);
    const rawOutput = [];
    const userVoiceInput = [];
    updateRealtimeRootInnerHtml(formattedChatHistory, formattedUserRequest, [thinkingHeader]);

    const isGAModel = /^(gpt-realtime|gpt-5-)/i.test(model); // Newer GA models use different URLs and start parameters
    // Encode once so that model names with reserved characters cannot break the query string.
    const modelQuery = `model=${encodeURIComponent(model)}`;

    const tokenResponse = await fetch(`/api/openai/session?${modelQuery}`);
    if (!tokenResponse.ok) {
      throw new Error(`Failed to obtain a realtime session token (HTTP ${tokenResponse.status})`);
    }
    const data = await tokenResponse.json();
    if (!data || !data.client_secret || !data.client_secret.value) {
      throw new Error("The realtime session token response does not contain client_secret.value");
    }
    const EPHEMERAL_KEY = data.client_secret.value;

    // Create a peer connection
    let peerConnection = new RTCPeerConnection();
    let dataChannel = null;
    let mediaStream = null;

    // Set up to play remote audio from the model
    const audio = document.createElement("audio");
    audio.autoplay = true;
    peerConnection.ontrack = (e) => (audio.srcObject = e.streams[0]);

    // Tears the session down. Safe to call more than once.
    const releaseResources = () => {
      if (dataChannel) dataChannel.close();
      if (peerConnection) peerConnection.close();
      // Stopping the tracks is what actually releases the microphone.
      if (mediaStream) mediaStream.getTracks().forEach((track) => track.stop());
      audio.srcObject = null;
      dataChannel = null;
      peerConnection = null;
      mediaStream = null;
    };

    // Applies one server event received on the data channel to the transcript state.
    const handleRealtimeEvent = (e) => {
      const realtimeEvent = JSON.parse(e.data);
      switch (realtimeEvent.type) {
        case "conversation.item.created": // Older event, which was used by gpt-4o-realtime-preview
        case "conversation.item.done": { // Newer event name
          console.log(`${realtimeEvent.item.role}`);
          if (realtimeEvent.item.role === "user") {
            // A new user turn starts: drop the buffers of the previous turn.
            rawOutput.length = 0;
            userVoiceInput.length = 0;
          }
          break;
        }
        case "conversation.item.input_audio_transcription.delta": {
          const userVoiceDelta = typeof realtimeEvent.delta === "string" ? realtimeEvent.delta : "";
          // Skip blank deltas at the start of a turn, but keep each delta's own
          // whitespace so that incremental deltas do not run words together;
          // only the joined text is trimmed.
          if (userVoiceInput.length || userVoiceDelta.trim()) {
            userVoiceInput.push(userVoiceDelta);
            formattedUserRequest = getFormattedOutput(userVoiceInput.join("").trim(), false);
          }
          break;
        }
        case "conversation.item.input_audio_transcription.completed": {
          userVoiceTranscript = (realtimeEvent.transcript || "").trim();
          break;
        }
        case "response.audio_transcript.delta": // Older event name
        case "response.output_audio_transcript.delta": { // Newer event name
          if (realtimeEvent.delta) {
            rawOutput.push(realtimeEvent.delta);
            updateRealtimeRootInnerHtml(formattedChatHistory, formattedUserRequest, rawOutput);
          }
          break;
        }
        case "response.content_part.done": {
          if (realtimeEvent.part.transcript) console.log(`${realtimeEvent.part.transcript}`);
          chatHistory.push({ role: "user", content: userVoiceTranscript });
          chatHistory.push({ role: "assistant", content: rawOutput.join("") });
          formattedChatHistory = getFormattedChatHistory();
          break;
        }
        default: {
          // Other event types are ignored.
        }
      }
    };

    // Asks for a first response driven by the prompt, then adds the prompt as a user message.
    const startConversationFromThePrompt = () => {
      const responseCreate = {
        type: "response.create",
        // New GA models must not have the modalities parameter specified
        response: isGAModel
          ? { instructions: prompt }
          : { modalities: ["text", "audio"], instructions: prompt },
      };
      dataChannel.send(JSON.stringify(responseCreate));
      const event = {
        type: "conversation.item.create",
        item: {
          type: "message",
          role: "user",
          content: [
            {
              type: "input_text",
              text: prompt,
            },
          ],
        },
      };
      dataChannel.send(JSON.stringify(event));
    };

    // Sends session.update enabling noise reduction, input transcription and server-side VAD.
    const updateSession = () => {
      const audio_input_noise_reduction = { type: "far_field" };
      const audio_input_transcription = {
        model: "whisper-1",  // or gpt-4o-transcribe if enabled on your account
        // Optional members: language (e.g. "en") and prompt (biasing terms).
      };
      const turn_detection = {
        type: "server_vad",
        threshold: 0.5,
        prefix_padding_ms: 300,
        silence_duration_ms: 500
      };
      const eventEnableUserTranscript = {
        type: "session.update",
        session: isGAModel
          ? {
              type: "realtime",
              model,
              audio: {
                input: {
                  noise_reduction: audio_input_noise_reduction,
                  transcription: audio_input_transcription,
                  turn_detection
                }
              },
            }
          : {
              // The older syntax, which was used for gpt-4o-realtime-preview.
              // This is already set on the server side. Duplicated here just as an example.
              model,
              input_audio_noise_reduction: audio_input_noise_reduction,
              input_audio_transcription: audio_input_transcription,
              turn_detection
            }
      };
      dataChannel.send(JSON.stringify(eventEnableUserTranscript));
    };

    try {
      // Add local audio track for microphone input in the browser
      mediaStream = await navigator.mediaDevices.getUserMedia({ audio: true });
      peerConnection.addTrack(mediaStream.getTracks()[0]);

      // Set up data channel for sending and receiving events.
      // Both listeners are attached right away so no event can be missed.
      dataChannel = peerConnection.createDataChannel("openai-realtime-events");
      dataChannel.addEventListener("message", handleRealtimeEvent);
      dataChannel.addEventListener("open", () => {
        updateSession();
        startConversationFromThePrompt();
      });

      // Start the session using the Session Description Protocol (SDP)
      const offer = await peerConnection.createOffer();
      await peerConnection.setLocalDescription(offer);

      let baseUrl = "https://api.openai.com/v1/realtime";
      if (isGAModel) baseUrl += "/calls";
      const sdpResponse = await fetch(`${baseUrl}?${modelQuery}`, {
        method: "POST",
        body: offer.sdp,
        headers: {
          Authorization: `Bearer ${EPHEMERAL_KEY}`,
          "Content-Type": "application/sdp",
        },
      });
      if (!sdpResponse.ok) {
        throw new Error(`Realtime SDP exchange failed (HTTP ${sdpResponse.status})`);
      }

      const answer = {
        type: "answer",
        sdp: await sdpResponse.text(),
      };
      await peerConnection.setRemoteDescription(answer);
    } catch (error) {
      // Do not leave the microphone or the connection open after a failed start.
      releaseResources();
      throw error;
    }

    window.stopRealtimeSession = releaseResources;
  }
  window.startRealtimeSession = startRealtimeSession;
})();