# hophacksfall24/speech_to_text.py at main · Loldude0/hophacksfall24 · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
from transformers import Speech2TextProcessor, Speech2TextForConditionalGeneration

import base64
import io
import soundfile as sf
import re
import wave
import os

# Load the pretrained Speech2Text checkpoint and its matching processor once,
# at import time, so every speech_to_text() call reuses the same objects.
# NOTE(review): from_pretrained presumably fetches the checkpoint on first use
# — confirm the runtime environment has network access or a warm cache.
model = Speech2TextForConditionalGeneration.from_pretrained("facebook/s2t-small-librispeech-asr")
processor = Speech2TextProcessor.from_pretrained("facebook/s2t-small-librispeech-asr")

def correct_base64_string(base64_string: str) -> str:
    """Strip a ``data:audio/...;base64,`` data-URI header, if present.

    Args:
        base64_string: Either a bare base64 payload or an audio data URI
            such as ``data:audio/wav;base64,AAAA``. Extra MIME parameters
            before ``;base64,`` (e.g. ``;codecs=opus``) are tolerated.

    Returns:
        The base64 payload that follows the header, or the input unchanged
        when no audio data-URI header is found.
    """
    # [^,]* (rather than [^;]*) lets MIME parameters sit between the media
    # type and ";base64,"; DOTALL keeps a payload that is wrapped over
    # several lines from being cut off at the first newline.
    pattern = r'data:audio/[^,]*;base64,(.*)'
    match = re.search(pattern, base64_string, re.DOTALL)
    if match:
        return match.group(1)
    return base64_string

def base_64_to_audio(base_64_string: str, file_name: str, dst_dir: str) -> str:
    """Decode a base64 (or audio data-URI) string and write the bytes to disk.

    Args:
        base_64_string: Base64 payload, optionally prefixed with a
            ``data:audio/...;base64,`` header.
        file_name: File name used when ``dst_dir`` is an existing directory.
        dst_dir: Destination. When it is an existing directory the audio is
            written to ``dst_dir/file_name``; otherwise it is treated as the
            full output file path.

    Returns:
        The path of the file that was written.

    Raises:
        binascii.Error: If the payload is not valid base64.
        OSError: If the destination cannot be opened for writing.
    """
    audio_bytes = base64.b64decode(correct_base64_string(base_64_string))

    # Only join file_name when dst_dir really is a directory: opening a
    # directory for writing can never succeed, while callers that pass a full
    # file path in dst_dir keep getting exactly that path back.
    if os.path.isdir(dst_dir):
        out_path = os.path.join(dst_dir, file_name)
    else:
        out_path = dst_dir

    with open(out_path, "wb") as f:
        f.write(audio_bytes)

    return out_path

def speech_to_text(audio_path: str) -> str:
    """Transcribe the audio file at ``audio_path`` with the module-level model.

    Args:
        audio_path: Path to an audio file that ``soundfile`` can read.

    Returns:
        The first (and only) decoded transcription string.

    The sample rate reported by the file is forwarded to the processor as-is;
    no resampling is done here.
    NOTE(review): the checkpoint presumably expects 16 kHz input — confirm and
    resample upstream if callers can supply other rates.
    """
    audio, sr = sf.read(audio_path)

    # soundfile yields a (frames, channels) array for multi-channel files.
    # The processor does not treat a 2-D array as one waveform (it is
    # presumably read as a batch), so average the channels down to mono.
    if audio.ndim > 1:
        audio = audio.mean(axis=1)

    inputs = processor(audio, sampling_rate=sr, return_tensors="pt")
    generated_ids = model.generate(inputs["input_features"], attention_mask=inputs["attention_mask"])
    transcription = processor.batch_decode(generated_ids, skip_special_tokens=True)
    return transcription[0]

if __name__ == "__main__":
    # Smoke test: round-trip a sample WAV through base64 and back to disk,
    # then transcribe the file that was written.
    with open("./test/output_audio.wav", "rb") as src:
        base_64_string = base64.b64encode(src.read()).decode()
    # The encoded sample was previously computed but never used; write it to
    # "audio.wav" so the transcription below reads the file produced here.
    audio_path = base_64_to_audio(base_64_string, "audio.wav", "audio.wav")
    print(speech_to_text(audio_path))