How do I properly pass an audio signal to the QuartzNet ASR model without saving the audio to a file?
Describe the bug
I followed the instructions from these notebooks:
https://github.com/NVIDIA/NeMo/blob/main/tutorials/asr/Streaming_ASR.ipynb
https://github.com/NVIDIA/NeMo/blob/main/tutorials/asr/Online_ASR_Microphone_Demo.ipynb
For some reason, the recognized text differs from what the model produces out of the box when it transcribes the saved files directly. But we don't want to write the audio to disk after we have received it.
-
Can you please tell me how NeMo models read audio from files?
-
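My understanding (please correct me if this is wrong) is that the file-based path ends up with a mono float32 signal in [-1, 1], resampled to the model's sample rate, i.e. roughly the equivalent of this (librosa is used here only for illustration):

import librosa

# mono float32 waveform in [-1, 1], resampled to 16 kHz
signal, sr = librosa.load("sample.wav", sr=16000, mono=True)

If that is the case, whatever we send over the wire has to reproduce exactly this signal on the service side.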
Has anyone come across a solution for correctly reading the bytes of an audio file on the STT service side? For example, if you pass it like this:
with open(filename, "rb") as fileb:
    audio_content = base64.b64encode(fileb.read()).decode("utf-8")
data = {
    "audio_content": audio_content,
}
response = requests.post("http://localhost:8557/stt", json=data)
Steps/Code to reproduce bug
import base64
import json

import numpy as np
import requests
import scipy.signal
from scipy.io import wavfile


class AudioEncoder(json.JSONEncoder):
    """JSON encoder for serializing sampled audio."""

    def default(self, obj):
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        elif isinstance(obj, bytes):
            return str(obj, encoding='utf-8')
        return json.JSONEncoder.default(self, obj)
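A quick, purely illustrative check of the two payload types the encoder handles:

payload = {"samples": np.arange(4, dtype=np.float32), "meta": b"16000"}
print(json.dumps(payload, cls=AudioEncoder))
# {"samples": [0.0, 1.0, 2.0, 3.0], "meta": "16000"}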
# Example without passing the raw file bytes
target_sr = 16000
sample_rate_hertz, samples = wavfile.read(filename)  # int16 samples for a 16-bit PCM WAV
number_of_samples = round(len(samples) * float(target_sr) / sample_rate_hertz)
samples = scipy.signal.resample(samples, number_of_samples)  # returns float64
audio_content = base64.b64encode(samples).decode("utf-8")
data = {
    "audio_content": json.dumps(audio_content, cls=AudioEncoder),
}
response = requests.post("http://localhost:8557/stt", json=data)
On the stt service:
audio_content = base64.b64decode(request.audio.audio_content)
# the dtype must match the dtype of the array that was encoded on the client
audio_signal = np.frombuffer(audio_content)  # defaults to float64; use dtype=np.float32 if the client sent float32
asr = ASR(16000)
asr.transcribe(audio_signal)  # does not produce the same output as the out-of-the-box method
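One detail that is easy to get wrong in the round trip above: np.frombuffer has to be given the exact dtype that was encoded on the client (scipy.signal.resample returns float64), and the values stay in the int16 amplitude range, which is what AudioDataLayer.set_signal expects when it divides by 32768. A dtype-explicit sketch of the same round trip, assuming a 16-bit PCM mono WAV and the same hypothetical endpoint:

import base64

import numpy as np
import scipy.signal
from scipy.io import wavfile

# --- client side: fix the dtype before encoding ---
target_sr = 16000
sr, samples = wavfile.read(filename)                 # int16 for a 16-bit PCM WAV
if samples.ndim > 1:                                 # mix down to mono if needed
    samples = samples.mean(axis=1)
n = round(len(samples) * target_sr / sr)
samples = scipy.signal.resample(samples, n).astype(np.float32)
payload = base64.b64encode(samples.tobytes()).decode("utf-8")

# --- service side: decode with the SAME dtype ---
raw = base64.b64decode(payload)
audio_signal = np.frombuffer(raw, dtype=np.float32)  # still in the int16 amplitude range,
                                                     # so set_signal's /32768 applies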
import numpy as np
import torch
from torch.utils.data import DataLoader

from audio_dataset import speech_collate_fn, AudioDataLayer

# _ASR_MODEL and _BEAM_SEARCH_LM are module-level globals initialized elsewhere
# (the loading code is not shown in this issue)


class ASR:
    asr_model = _ASR_MODEL
    beam_search_lm = _BEAM_SEARCH_LM

    def __init__(self, sample_rate_hertz):
        assert self.asr_model is not None, "Model is not initialized"
        self.vocab = list(self.asr_model.decoder.vocabulary)
        self.vocab.append("_")
        self.sr = sample_rate_hertz
        self.data_layer = AudioDataLayer(self.sr)
        self.data_loader = DataLoader(
            self.data_layer, batch_size=1, collate_fn=speech_collate_fn
        )
        self.prev_char = ""

    def _get_batch_pred(self, signal):
        self.data_layer.set_signal(signal)
        batch = next(iter(self.data_loader))
        audio_signal, audio_signal_len = batch
        audio_signal, audio_signal_len = \
            audio_signal.to(self.asr_model.device), \
            audio_signal_len.to(self.asr_model.device)
        log_probs, encoded_len, predictions = self.asr_model.forward(
            input_signal=audio_signal, input_signal_length=audio_signal_len
        )
        return log_probs  # .cpu().numpy()[0]

    @staticmethod
    def _decode_sequence_ctc_with_max_prob(logits, vocab):
        # greedy decoding: pick the most probable symbol at every frame
        s = ""
        for i in range(logits.shape[0]):
            s += vocab[np.argmax(logits[i])]
        return s

    def merge_decoded_sequence_from_ctc(self, sequence):
        # collapse repeated symbols and drop the CTC blank ("_");
        # prev_char persists across calls for streaming
        s_merged = ""
        for i in range(len(sequence)):
            if sequence[i] != self.prev_char:
                self.prev_char = sequence[i]
                if self.prev_char != "_":
                    s_merged += self.prev_char
        return s_merged

    @staticmethod
    def _to_logits_tensor(logits):
        logits = [torch.tensor(logit) for logit in logits]
        log_probs_length = torch.tensor([logit.shape[0] for logit in logits])
        logits_tensor = torch.nn.utils.rnn.pad_sequence(logits, batch_first=True)
        return logits_tensor, log_probs_length

    def _infer_beam_search(self, logits_tensor, log_probs_length):
        assert self.beam_search_lm is not None, "Model is not initialized"
        hyps = list()
        for j in range(logits_tensor.shape[0]):
            hyps.append(
                self.beam_search_lm.forward(
                    log_probs=logits_tensor[j].unsqueeze(0),
                    log_probs_length=log_probs_length[j].unsqueeze(0),
                )[0][0][1]
            )
        return hyps

    @torch.no_grad()
    def transcribe(self, audio_signal, use_lm=False):
        logits = self._get_batch_pred(audio_signal)
        if use_lm:
            logits_tensor, log_probs_length = self._to_logits_tensor(logits)
            transcription = self._infer_beam_search(logits_tensor, log_probs_length)[0]
        else:
            transcription_ctc = self._decode_sequence_ctc_with_max_prob(
                logits.cpu().numpy()[0], self.vocab
            )
            transcription = self.merge_decoded_sequence_from_ctc(transcription_ctc)
        return transcription
audio_dataset.py
# audio_dataset.py
from nemo.core.classes import IterableDataset
import torch
# import numpy as np
from nemo.core.neural_types import NeuralType, AudioSignal, LengthsType
from common_utils.log import get_logger
from typing import Tuple
logger = get_logger(__name__)
def speech_collate_fn(batch: Tuple[torch.FloatTensor, torch.LongTensor]):
    """Collate a batch of (audio signal, audio length) pairs.

    Args:
        batch (FloatTensor, LongTensor): A tuple of tensors of signal, signal lengths.
        This collate func assumes the signals are 1d torch tensors (i.e. mono audio).
    """
    _, audio_lengths = zip(*batch)
    max_audio_len = 0
    has_audio = audio_lengths[0] is not None
    if has_audio:
        max_audio_len = max(audio_lengths).item()
    audio_signal = []
    for sig, sig_len in batch:
        assert sig.dim() == 1, "Mono audio is expected"
        if has_audio:
            sig_len = sig_len.item()
            if sig_len < max_audio_len:
                pad = (0, max_audio_len - sig_len)
                sig = torch.nn.functional.pad(sig, pad)
            audio_signal.append(sig)
    if has_audio:
        audio_signal = torch.stack(audio_signal)
        audio_lengths = torch.stack(audio_lengths)
    else:
        audio_signal, audio_lengths = None, None
    return audio_signal, audio_lengths
# simple data layer to pass audio signal
class AudioDataLayer(IterableDataset):
    @property
    def output_types(self):
        return {
            "audio_signal": NeuralType(("B", "T"), AudioSignal(freq=self._sample_rate)),
            "a_sig_length": NeuralType(tuple("B"), LengthsType()),
        }

    def __init__(self, sample_rate):
        super().__init__()
        self._sample_rate = sample_rate
        self.output = True

    def __iter__(self):
        return self

    def __next__(self):
        if not self.output:
            raise StopIteration
        self.output = False
        return torch.as_tensor(self.signal, dtype=torch.float32), torch.as_tensor(
            self.signal_shape, dtype=torch.int64
        )

    def set_signal(self, signal):
        # normalize the signal to the range [-1, 1] (assumes int16-range amplitudes)
        self.signal = signal.astype('float32') / 32768.
        self.signal_shape = self.signal.size
        self.output = True

    def __len__(self):
        return 1
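A quick way to sanity-check the data layer and collate function together on a synthetic signal (purely illustrative; assumes 16 kHz audio with int16-range amplitudes):

import numpy as np
from torch.utils.data import DataLoader

layer = AudioDataLayer(sample_rate=16000)
loader = DataLoader(layer, batch_size=1, collate_fn=speech_collate_fn)

layer.set_signal(np.zeros(16000, dtype=np.int16))  # one second of silence
audio_signal, audio_len = next(iter(loader))
print(audio_signal.shape, audio_len)               # torch.Size([1, 16000]) tensor([16000])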
Expected behavior
I expect recognition via nemo_asr.models.EncDecCTCModel.restore_from(...).transcribe([file], batch_size=1) to match what our service produces.
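For completeness, the file-based baseline I compare against looks roughly like this (the model path and file name are placeholders, and audio_signal is the in-memory array from the snippets above):

import nemo.collections.asr as nemo_asr

# out-of-the-box transcription straight from the WAV file
model = nemo_asr.models.EncDecCTCModel.restore_from("quartznet.nemo")  # placeholder path
reference_text = model.transcribe(["sample.wav"], batch_size=1)[0]

# in-memory transcription through the service code above
service_text = ASR(16000).transcribe(audio_signal)
print(reference_text)
print(service_text)  # expected to match reference_text, but currently does not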
Environment details
- Run on CPU (docker image: python:3.7-slim)
- OS: Linux
- PyTorch: 1.10
- Python: 3.7
Comments (8)
Also, the code for buffered CTC decoding is available in better form here: https://github.com/NVIDIA/NeMo/blob/main/nemo/collections/asr/parts/utils/streaming_utils.py
Make sure you are following similar steps for chunking, buffering, normalization, etc.
It might be due to the version, but probably not. The greedy CTC transcription is just the argmax of the forward-pass logits; there is no fancy decoding involved.
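In other words, the greedy transcription can be reproduced in a few lines (a minimal sketch; vocab and the "_" blank symbol follow the convention used in the ASR class above, where the blank is the last vocabulary entry):

import numpy as np

def greedy_ctc_decode(log_probs: np.ndarray, vocab: list, blank: str = "_") -> str:
    """log_probs has shape (time, num_classes) for a single utterance."""
    best_ids = log_probs.argmax(axis=-1)        # argmax at every frame
    text, prev = [], None
    for idx in best_ids:
        symbol = vocab[idx]
        if symbol != prev and symbol != blank:  # collapse repeats, drop blanks
            text.append(symbol)
        prev = symbol
    return "".join(text)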