""" Codec Augmentation via torchaudio This library provides codec augmentation techniques in torchaudio for enhanced audio data processing. For detailed guidance and usage examples, refer to the tutorial at: https://pytorch.org/audio/stable/tutorials/audio_data_augmentation_tutorial.html Note: This code is compatible with FFmpeg as the torchaudio backend. When using FFmpeg2, the maximum number of samples for processing is limited to 16. Authors * Mirco Ravanelli 2023 """ import random import torch import torchaudio class CodecAugment(torch.nn.Module): """ Apply random audio codecs to input waveforms using torchaudio. This class provides an interface for applying codec augmentation techniques to audio data. Arguments --------- sample_rate: int The sample rate of the input waveform. Example ------- >>> waveform = torch.rand(4, 16000) >>> if torchaudio.list_audio_backends()[0] == 'ffmpeg': ... augmenter = CodecAugment(16000) ... output_waveform = augmenter(waveform) """ def __init__(self, sample_rate=16000): super().__init__() self.sample_rate = sample_rate self.available_format_encoders = [ ("wav", "pcm_mulaw"), ("mp3", None), ("g722", None), ] def apply_codec(self, waveform, format=None, encoder=None): """ Apply the selected audio codec. Arguments ---------- waveform: torch.Tensor Input waveform of shape `[batch, time]`. format: str The audio format to use (e.g., "wav", "mp3"). Default is None. encoder: str The encoder to use for the format (e.g., "opus", "vorbis"). Default is None. Returns --------- torch.Tensor: Coded version of the input waveform of shape `[batch, time]`. """ audio_effector = torchaudio.io.AudioEffector( format=format, encoder=encoder ) waveform_aug = audio_effector.apply( waveform.transpose(0, 1).to("cpu"), self.sample_rate ) return waveform_aug.transpose(0, 1).to(waveform.device) def forward(self, waveform): """ Apply a random audio codec from the available list. Arguments --------- waveform: torch.Tensor Input waveform of shape `[batch, time]`. Returns ------- torch.Tensor Coded version of the input waveform of shape `[batch, time]`. """ format, encoder = random.choice(self.available_format_encoders) return self.apply_codec(waveform, format=format, encoder=encoder)
Memory