CS-753-Project/espnet_decoder.py at main · soham2109/CS-753-Project · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
import os
import time
import string
import math
import configparser

import matplotlib.pyplot as plt
import numpy as np
import librosa
import librosa.display
import torch
from espnet_model_zoo.downloader import ModelDownloader
from espnet2.bin.asr_inference import Speech2Text

# Runtime settings come from config.ini, looked up relative to the current
# working directory. ConfigParser.read() silently skips a missing file, so a
# missing/incomplete config surfaces as a NoSectionError/NoOptionError below.
config = configparser.ConfigParser()
config.read("config.ini")

# [espnet] section: sample rate plus the model/LM artefacts handed to Speech2Text.
Fs = config.getint("espnet", "fs")  # default sample rate used by ESPNet_Decoder.decode
TAG = config.get("espnet", "tag_libri")  # model-zoo tag (only referenced by the commented-out downloader)
LM_TRAIN_CONFIG = config.get("espnet", "lm_train_config")
LM_FILE = config.get("espnet", "lm_file")
ASR_CONFIG_FILE = config.get("espnet", "asr_train_config")
ASR_MODEL_FILE = config.get("espnet", "asr_model_file")

# [streamlit] section: length, in seconds, of each audio chunk fed to the model.
DURATION = config.getint("streamlit", "duration")

class ESPNet_Decoder:
	"""Chunked speech-to-text decoder wrapping an ESPnet2 ``Speech2Text`` model.

	The model is built once in ``__init__`` from the module-level config
	values and reused by every ``decode`` call.
	"""

	# Translation table that deletes every ASCII punctuation character.
	# Built once here instead of on every text_normalizer() call.
	_PUNCT_TABLE = str.maketrans('', '', string.punctuation)

	def __init__(self):
		"""Instantiate ``Speech2Text`` on CPU from the configured ASR/LM files."""
		# First-time setup alternative: download the pretrained model with
		#   d = ModelDownloader("dataset/")
		# and pass **d.download_and_unpack(TAG) to Speech2Text instead of the
		# explicit config/model paths below.
		self.speech2text = Speech2Text(
			asr_train_config=ASR_CONFIG_FILE,
			asr_model_file=ASR_MODEL_FILE,
			lm_train_config=LM_TRAIN_CONFIG,
			lm_file=LM_FILE,
			device="cpu",
			minlenratio=0.0,
			maxlenratio=0.0,
			ctc_weight=0.5,
			beam_size=4,
			batch_size=0,
			nbest=1,
		)
		print("Model is loaded")

	def text_normalizer(self, text):
		"""Return ``text`` lower-cased with all ``string.punctuation`` removed."""
		return text.lower().translate(self._PUNCT_TABLE)

	def decode(self, audio_path, sr=Fs):
		"""Transcribe an audio file in consecutive ``DURATION``-second chunks.

		Args:
			audio_path: Path of the audio file; passed to ``librosa.load``.
			sr: Sample rate to load the audio at. Defaults to the configured
				``Fs``.

		Returns:
			A ``(decoded_text, decoding_time)`` tuple: the normalized chunk
			transcripts joined by single spaces, and the seconds spent in
			the decoding loop (audio loading is not timed).
		"""
		print("Model is loaded. The path to the audio file is {}".format(audio_path))
		# librosa.load returns the sample rate the samples actually have;
		# chunk with that rate rather than the global Fs so that a
		# caller-supplied ``sr`` still yields DURATION-second chunks.
		data, loaded_sr = librosa.load(audio_path, sr=sr)
		# Fixed 10x gain applied before recognition.
		# NOTE(review): presumably compensates for quiet recordings - confirm.
		data = data * 10
		# Number of samples in one DURATION-second chunk.
		chunk_len = int(loaded_sr * DURATION)

		start = time.perf_counter()
		complete_text = []
		# The final chunk may be shorter than chunk_len; slicing handles that.
		for offset in range(0, len(data), chunk_len):
			nbests = self.speech2text(data[offset:offset + chunk_len])
			# Each n-best entry is a tuple whose first element is the text.
			text, *_ = nbests[0]
			complete_text.append(self.text_normalizer(text))
		decoding_time = time.perf_counter() - start

		decoded_text = " ".join(complete_text)
		return decoded_text, decoding_time


if __name__=="__main__":
	# Manual smoke test: transcribe ./audio.wav and report the timing.
	audio_path = "audio.wav"
	decoder = ESPNet_Decoder()
	# decode() returns exactly two values (text, seconds); unpacking a third
	# one raised "ValueError: not enough values to unpack".
	decoded_text, decoding_time = decoder.decode(audio_path)
	print("The text decoded is:")
	print(decoded_text)
	print("Time taken to decode: {}s".format(decoding_time))