french-listening-demo/tts.ts at main · CodeWithOz/french-listening-demo · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
import os from 'os';
import { ChatGoogle } from '@langchain/google-webauth';
import { GeminiTTSAudioContent, ProcessedSentenceResult } from './types';
import { HarmBlockThreshold, HarmCategory } from '@google/genai';
import { HumanMessage } from '@langchain/core/messages';
import { pcmBufferToMp3 } from './ffmpeg';

/**
 * The two hosts of the generated dialogue. `name` is the speaker label that
 * prefixes each script line; `voiceId` is the voice name passed to the TTS
 * speech configuration for that speaker.
 */
const PRESENTERS = {
  MARIE: { name: 'Marie', voiceId: 'Sulafat' },
  CLEMENT: { name: 'Clément', voiceId: 'Puck' },
} as const;

/**
 * Catalogue of the square-bracketed audio direction tags that can appear in a
 * script, each paired with a plain-English description of the delivery it
 * asks for. Entries keep the order in which they are written below.
 */
const AUDIO_TAGS: { tag: string; explanation: string }[] = Object.entries({
  '[short pause]': 'speaker inserts a brief pause, around 250ms',
  '[cheerfully]': 'speaker speaks in a cheerful, upbeat manner',
  '[warmly]': 'speaker speaks with warmth and friendliness',
  '[inhales deeply]': 'speaker takes a deep breath before speaking',
  '[very slowly for emphasis]':
    'speaker speaks very slowly to emphasize the words that follow',
  '[English explanation]':
    'speaker uses a strong yet understandable French accent to say the English text that follows',
}).map(([tag, explanation]) => ({ tag, explanation }));

/**
 * Build the two-presenter explanation script for a processed sentence.
 *
 * The script always opens with three turns: Marie reads the sentence, Marie
 * hands over to Clément, and Clément gives the translation. Up to three
 * further turns follow, each emitted only when the corresponding list is
 * present and non-empty: verbs (Marie), vocabulary (Clément) and idiomatic
 * expressions (Marie).
 *
 * @param processedSentence - The sentence together with its translation and
 *   the verbs, vocabulary and idiomatic expressions extracted from it.
 * @returns One string per dialogue turn, each prefixed with the presenter's
 *   name and containing inline audio direction tags.
 */
export function generateExplanationScript(
  processedSentence: ProcessedSentenceResult
): string[] {
  const { sentence, translatedText, verbs, vocabulary, idiomaticExpressions } =
    processedSentence;
  const explanationParts: string[] = [];

  // start by reading out the sentence
  explanationParts.push(
    `${PRESENTERS.MARIE.name}: [cheerfully] Bonjour à tous et à toutes ! Aujourd'hui, nous allons apprendre à traduire la phrase suivante, dans trois, deux, un [short pause]: « [very slowly for emphasis] ${sentence} ».`
  );

  // pass to the other speaker
  explanationParts.push(
    `${PRESENTERS.MARIE.name}: [short pause] [warmly] Alors, Clément, qu'est-ce que tu penses de cette phrase ?`
  );

  // then read out the translation
  explanationParts.push(
    `${PRESENTERS.CLEMENT.name}: [warmly] Bien, cette phrase veut dire [short pause]: "[English explanation] ${translatedText}". [short pause] And what can you tell us about the verbs used in this sentence, Marie?`
  );

  if (verbs && verbs.length > 0) {
    // Singular or plural lead-in depending on how many verbs there are.
    const leadIn =
      verbs.length === 1
        ? 'Le verbe utilisé ici est '
        : 'Les verbes utilisés ici sont ';

    // One clause per verb: conjugated form, its infinitive, then its meaning.
    const verbsWithMeanings = verbs.map(
      v =>
        `"${v.conjugatedForm}" qui est conjugué de "${v.infinitive}" qui signifie "[English explanation] ${replaceSlash(v.meaning)}"`
    );
    const verbsText = leadIn + formatList(verbsWithMeanings) + '.';

    explanationParts.push(`${PRESENTERS.MARIE.name}: [warmly] ${verbsText}`);
  }

  // Guarded like the verbs and idioms sections so that a result without a
  // vocabulary list skips this turn instead of throwing on `.length`.
  if (vocabulary && vocabulary.length > 0) {
    const vocabParts: string[] = [
      `${PRESENTERS.CLEMENT.name}: [warmly] C'est vraiment intéressant! Ensuite, voici des autres mots que vous devez connaître [inhales deeply]:`,
    ];
    for (const vocab of vocabulary) {
      vocabParts.push(
        `"${vocab.word}" signifie "[English explanation] ${replaceSlash(vocab.meaning)}".`
      );
    }
    // The whole vocabulary list is a single turn spoken by Clément.
    explanationParts.push(vocabParts.join(' '));
  }

  if (idiomaticExpressions && idiomaticExpressions.length > 0) {
    const idiomaticParts: string[] = [
      `${PRESENTERS.MARIE.name}: [cheerfully] Bravo Clément! Enfin, il y a aussi des expressions idiomatiques que vous devez connaître [inhales deeply]:`,
    ];
    for (const idiom of idiomaticExpressions) {
      const literal = replaceSlash(idiom.literalMeaning);
      const contextual = replaceSlash(idiom.contextualMeaning);
      idiomaticParts.push(
        `"${idiom.expression}" est une expression idiomatique qui veut littéralement dire "[English explanation] ${literal}". ` +
          `Dans ce contexte, cela exprime l'idée de "[English explanation] ${contextual}" en anglais.`
      );
    }
    // The whole idiom list is a single turn spoken by Marie.
    explanationParts.push(idiomaticParts.join(' '));
  }

  return explanationParts;
}

/**
 * Replace every forward slash with the word "or".
 *
 * Any run of whitespace on either side of a slash is absorbed into the
 * replacement, so "a/b", "a / b" and "a  /  b" all become "a or b" without
 * leaving doubled spaces behind.
 *
 * @param text - Text that may use "/" to separate alternatives.
 * @returns The text with each slash (and its surrounding whitespace) replaced
 *   by " or ".
 */
function replaceSlash(text: string): string {
  return text.replace(/\s*\/\s*/g, ' or ');
}

/**
 * Join items into a French-style enumeration: entries are separated by
 * commas, with "et" placed before the final entry.
 *
 * @param items - The entries to join, in order.
 * @returns An empty string for no items, the item itself for a single item,
 *   otherwise "a, b et c".
 */
function formatList(items: string[]): string {
  switch (items.length) {
    case 0:
      return '';
    case 1:
      return items[0];
    default: {
      // Everything before the last entry is comma-separated; with exactly two
      // entries this is just the first one.
      const leading = items.slice(0, -1).join(', ');
      const final = items[items.length - 1];
      return `${leading} et ${final}`;
    }
  }
}

/**
 * Generate multi-speaker dialogue audio from a script using Google Gemini TTS.
 *
 * @param script - The dialogue script; it is embedded verbatim between triple
 *   backticks in the prompt sent to the model.
 * @returns The first content part of the model response, treated as audio
 *   content.
 * @throws Error if `GOOGLE_API_KEY` is not set. Any failure after that point
 *   (including a response without audio data) is logged and rethrown as
 *   "Failed to generate dialogue: …".
 */
export async function generateDialogue(
  script: string
): Promise<GeminiTTSAudioContent> {
  const apiKey = process.env.GOOGLE_API_KEY;
  if (!apiKey) {
    throw new Error('Google API key not configured');
  }

  try {
    const modelName = 'gemini-2.5-flash-preview-tts';
    const responseModalities = ['audio'];

    // One voice entry per presenter: `speaker` is the label used in the
    // script, `name` is the voice assigned to that speaker.
    const speechConfig = [PRESENTERS.MARIE, PRESENTERS.CLEMENT].map(
      presenter => ({ speaker: presenter.name, name: presenter.voiceId })
    );

    // Every listed harm category gets the BLOCK_NONE threshold.
    const safetySettings = [
      HarmCategory.HARM_CATEGORY_HARASSMENT,
      HarmCategory.HARM_CATEGORY_HATE_SPEECH,
      HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT,
      HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT,
    ].map(category => ({
      category,
      threshold: HarmBlockThreshold.BLOCK_NONE,
    }));

    const model = new ChatGoogle({
      modelName,
      responseModalities,
      speechConfig,
      apiKey,
      safetySettings,
    });

    // The instructions and the script travel together in one human message.
    const prompt = [
      getDialogueSystemPrompt(),
      `Here's the dialogue script inside triple backticks:\n\n\`\`\`\n${script}\n\`\`\`\n\nNow generate the AUDIO, NOT TEXT, for this dialogue script.`,
    ].join('\n\n');

    const response = await model.invoke([new HumanMessage(prompt)]);

    // Only the first content part is inspected.
    const firstPart = response
      ?.content?.[0] as unknown as GeminiTTSAudioContent;
    if (!firstPart?.data) {
      throw new Error('No audio data returned from Gemini TTS');
    }

    console.log(`[generateDialogue] mimeType: ${firstPart.mimeType}`);
    console.log(
      `[generateDialogue] audioPcmData: ${firstPart.data.length} bytes`
    );
    return firstPart;
  } catch (error) {
    console.error('Google Gemini TTS dialogue error:', error);
    throw new Error(`Failed to generate dialogue: ${error}`);
  }
}

/**
 * Build the instruction prompt sent ahead of the dialogue script.
 *
 * The prompt tells the model to produce audio only, lists every known audio
 * tag with its explanation (one "- tag: explanation" line per entry of
 * AUDIO_TAGS), and describes the expected script format.
 *
 * @returns The complete prompt text.
 */
export function getDialogueSystemPrompt(): string {
  const tagLines: string[] = [];
  for (const { tag, explanation } of AUDIO_TAGS) {
    tagLines.push(`- ${tag}: ${explanation}`);
  }
  const tagGuide = tagLines.join('\n');

  return `You are a text-to-speech system that converts dialogue scripts into natural, expressive audio. That means your objective is to GENERATE ONLY AUDIO, NOT TEXT.

  You will receive a dialogue script between two speakers. They are French speakers who are translating French sentences, phrases, words, etc. into English.
  Your task is to read the provided dialogue script and convert it into speech, paying close attention to all audio direction tags and instructions.

  AUDIO TAGS:
  The script may contain square-bracketed tags that provide direction on how to speak certain parts. These tags indicate tone, emotion, pacing, or actions. Here are all the possible tags you may encounter:

  ${tagGuide}

  INSTRUCTIONS:
  1. First and most importantly, GENERATE ONLY AUDIO, NOT TEXT.
  2. When you encounter an audio tag, interpret it and apply it to the following text until the next tag or end of that speaker's line, except when the tag is within quotes, in which case apply it to the text within the quotes
  3. Multiple tags can appear together (e.g., [inhales deeply] [warmly]); apply all of them appropriately
  4. Pay special attention to tags that modify pronunciation (e.g., [English explanation])
  5. Natural pauses should be indicated by punctuation or the context
  6. Speak naturally and expressively, matching the emotion and tone indicated by the tags
  7. Maintain the character and personality of each speaker consistently
  8. Lastly, GENERATE ONLY AUDIO, NOT TEXT.

  The dialogue script will be formatted as:
  {Speaker Name}: {their dialogue text with tags}
  {Another Speaker}: {their dialogue text with tags}
  ... and so on.

  Now generate the audio for the following dialogue script.

  IMPORTANT: Remember to GENERATE ONLY AUDIO, NOT TEXT.`;
}

/**
 * Decode the base64 payload of a TTS audio response and hand it to
 * `pcmBufferToMp3` for MP3 encoding.
 *
 * Sample rate and bit depth are read from the MIME type when it has the form
 * `audio/L16;codec=pcm;rate=24000`; otherwise they default to 24000 Hz and
 * 16 bits. The audio is always encoded as a single channel at 192k.
 *
 * @param audioContent - Audio content whose `data` is base64-encoded audio.
 * @param opts - Optional output location: `workDir` defaults to the OS temp
 *   directory, `baseName` to `dialogue_<timestamp>`.
 * @returns The path `<workDir>/<baseName>.mp3`. NOTE(review): this path is
 *   assembled here rather than reported by `pcmBufferToMp3`; confirm the
 *   helper writes to exactly this location.
 * @throws Error if `audioContent` or its `data` is missing.
 */
export async function convertAudioContentToMp3(
  audioContent: GeminiTTSAudioContent,
  opts?: { workDir?: string; baseName?: string }
): Promise<string> {
  if (!audioContent?.data) {
    throw new Error('No audio content provided');
  }

  const { mimeType, data } = audioContent;
  const pcmBuffer = Buffer.from(data, 'base64');

  // Parse MIME type like: audio/L16;codec=pcm;rate=24000
  let sampleRate = 24000;
  let bitDepth = 16;
  if (typeof mimeType === 'string') {
    const rateText = /(?:^|[;\s])rate=(\d+)/i.exec(mimeType)?.[1];
    const depthText = /^audio\/L(\d+)/i.exec(mimeType)?.[1];
    // A parsed value of 0 (or NaN) falls back to the default.
    if (rateText) sampleRate = parseInt(rateText, 10) || sampleRate;
    if (depthText) bitDepth = parseInt(depthText, 10) || bitDepth;
  }

  const outputDir = opts?.workDir || os.tmpdir();
  const baseName = opts?.baseName || `dialogue_${Date.now()}`;
  const mp3Path = `${outputDir}/${baseName}.mp3`;

  await pcmBufferToMp3(pcmBuffer, outputDir, baseName, {
    sampleRate,
    channels: 1,
    bitrate: '192k',
    codec: 'libmp3lame',
    bitDepth,
  });

  // The logged count is the length of the base64 input string, not the size
  // of the MP3 file.
  console.log(
    `[convertAudioContentToMp3] MP3 written to ${mp3Path} (${data.length} bytes)`
  );

  return mp3Path;
}