.. DO NOT EDIT. .. THIS FILE WAS AUTOMATICALLY GENERATED BY SPHINX-GALLERY. .. TO MAKE CHANGES, EDIT THE SOURCE PYTHON FILE: .. "tutorials/audio_data_augmentation_tutorial.py" .. LINE NUMBERS ARE GIVEN BELOW. .. only:: html .. note:: :class: sphx-glr-download-link-note Click :ref:`here ` to download the full example code .. rst-class:: sphx-glr-example-title .. _sphx_glr_tutorials_audio_data_augmentation_tutorial.py: Audio Data Augmentation ======================= ``torchaudio`` provides a variety of ways to augment audio data. .. GENERATED FROM PYTHON SOURCE LINES 8-16 .. code-block:: default import torch import torchaudio import torchaudio.functional as F print(torch.__version__) print(torchaudio.__version__) .. rst-class:: sphx-glr-script-out Out: .. code-block:: none 1.11.0+cpu 0.11.0+cpu .. GENERATED FROM PYTHON SOURCE LINES 17-20 Preparing data and utility functions (skip this section) -------------------------------------------------------- .. GENERATED FROM PYTHON SOURCE LINES 20-177 .. code-block:: default # @title Prepare data and utility functions. {display-mode: "form"} # @markdown # @markdown You do not need to look into this cell. # @markdown Just execute once and you are good to go. # @markdown # @markdown In this tutorial, we will use a speech data from [VOiCES dataset](https://iqtlabs.github.io/voices/), # @markdown which is licensed under Creative Commos BY 4.0. # ------------------------------------------------------------------------------- # Preparation of data and helper functions. # ------------------------------------------------------------------------------- import math import os import matplotlib.pyplot as plt import requests from IPython.display import Audio, display _SAMPLE_DIR = "_assets" SAMPLE_WAV_URL = "https://pytorch-tutorial-assets.s3.amazonaws.com/steam-train-whistle-daniel_simon.wav" SAMPLE_WAV_PATH = os.path.join(_SAMPLE_DIR, "steam.wav") SAMPLE_RIR_URL = "https://pytorch-tutorial-assets.s3.amazonaws.com/VOiCES_devkit/distant-16k/room-response/rm1/impulse/Lab41-SRI-VOiCES-rm1-impulse-mc01-stu-clo.wav" # noqa: E501 SAMPLE_RIR_PATH = os.path.join(_SAMPLE_DIR, "rir.wav") SAMPLE_WAV_SPEECH_URL = "https://pytorch-tutorial-assets.s3.amazonaws.com/VOiCES_devkit/source-16k/train/sp0307/Lab41-SRI-VOiCES-src-sp0307-ch127535-sg0042.wav" # noqa: E501 SAMPLE_WAV_SPEECH_PATH = os.path.join(_SAMPLE_DIR, "speech.wav") SAMPLE_NOISE_URL = "https://pytorch-tutorial-assets.s3.amazonaws.com/VOiCES_devkit/distant-16k/distractors/rm1/babb/Lab41-SRI-VOiCES-rm1-babb-mc01-stu-clo.wav" # noqa: E501 SAMPLE_NOISE_PATH = os.path.join(_SAMPLE_DIR, "bg.wav") os.makedirs(_SAMPLE_DIR, exist_ok=True) def _fetch_data(): uri = [ (SAMPLE_WAV_URL, SAMPLE_WAV_PATH), (SAMPLE_RIR_URL, SAMPLE_RIR_PATH), (SAMPLE_WAV_SPEECH_URL, SAMPLE_WAV_SPEECH_PATH), (SAMPLE_NOISE_URL, SAMPLE_NOISE_PATH), ] for url, path in uri: with open(path, "wb") as file_: file_.write(requests.get(url).content) _fetch_data() def _get_sample(path, resample=None): effects = [["remix", "1"]] if resample: effects.extend( [ ["lowpass", f"{resample // 2}"], ["rate", f"{resample}"], ] ) return torchaudio.sox_effects.apply_effects_file(path, effects=effects) def get_sample(*, resample=None): return _get_sample(SAMPLE_WAV_PATH, resample=resample) def get_speech_sample(*, resample=None): return _get_sample(SAMPLE_WAV_SPEECH_PATH, resample=resample) def plot_waveform(waveform, sample_rate, title="Waveform", xlim=None, ylim=None): waveform = waveform.numpy() num_channels, num_frames = waveform.shape time_axis = torch.arange(0, num_frames) / sample_rate figure, axes = plt.subplots(num_channels, 1) if num_channels == 1: axes = [axes] for c in range(num_channels): axes[c].plot(time_axis, waveform[c], linewidth=1) axes[c].grid(True) if num_channels > 1: axes[c].set_ylabel(f"Channel {c+1}") if xlim: axes[c].set_xlim(xlim) if ylim: axes[c].set_ylim(ylim) figure.suptitle(title) plt.show(block=False) def print_stats(waveform, sample_rate=None, src=None): if src: print("-" * 10) print("Source:", src) print("-" * 10) if sample_rate: print("Sample Rate:", sample_rate) print("Shape:", tuple(waveform.shape)) print("Dtype:", waveform.dtype) print(f" - Max: {waveform.max().item():6.3f}") print(f" - Min: {waveform.min().item():6.3f}") print(f" - Mean: {waveform.mean().item():6.3f}") print(f" - Std Dev: {waveform.std().item():6.3f}") print() print(waveform) print() def plot_specgram(waveform, sample_rate, title="Spectrogram", xlim=None): waveform = waveform.numpy() num_channels, num_frames = waveform.shape figure, axes = plt.subplots(num_channels, 1) if num_channels == 1: axes = [axes] for c in range(num_channels): axes[c].specgram(waveform[c], Fs=sample_rate) if num_channels > 1: axes[c].set_ylabel(f"Channel {c+1}") if xlim: axes[c].set_xlim(xlim) figure.suptitle(title) plt.show(block=False) def play_audio(waveform, sample_rate): waveform = waveform.numpy() num_channels, num_frames = waveform.shape if num_channels == 1: return Audio(waveform[0], rate=sample_rate) elif num_channels == 2: return Audio((waveform[0], waveform[1]), rate=sample_rate) else: raise ValueError("Waveform with more than 2 channels are not supported.") def get_rir_sample(*, resample=None, processed=False): rir_raw, sample_rate = _get_sample(SAMPLE_RIR_PATH, resample=resample) if not processed: return rir_raw, sample_rate rir = rir_raw[:, int(sample_rate * 1.01) : int(sample_rate * 1.3)] rir = rir / torch.norm(rir, p=2) rir = torch.flip(rir, [1]) return rir, sample_rate def get_noise_sample(*, resample=None): return _get_sample(SAMPLE_NOISE_PATH, resample=resample) .. GENERATED FROM PYTHON SOURCE LINES 178-212 Applying effects and filtering ------------------------------ :py:func:`torchaudio.sox_effects` allows for directly applying filters similar to those available in ``sox`` to Tensor objects and file object audio sources. There are two functions for this: - :py:func:`torchaudio.sox_effects.apply_effects_tensor` for applying effects to Tensor. - :py:func:`torchaudio.sox_effects.apply_effects_file` for applying effects to other audio sources. Both functions accept effect definitions in the form ``List[List[str]]``. This is mostly consistent with how ``sox`` command works, but one caveat is that ``sox`` adds some effects automatically, whereas ``torchaudio``’s implementation does not. For the list of available effects, please refer to `the sox documentation `__. **Tip** If you need to load and resample your audio data on the fly, then you can use :py:func:`torchaudio.sox_effects.apply_effects_file` with effect ``"rate"``. **Note** :py:func:`torchaudio.sox_effects.apply_effects_file` accepts a file-like object or path-like object. Similar to :py:func:`torchaudio.load`, when the audio format cannot be inferred from either the file extension or header, you can provide argument ``format`` to specify the format of the audio source. **Note** This process is not differentiable. .. GENERATED FROM PYTHON SOURCE LINES 212-233 .. code-block:: default # Load the data waveform1, sample_rate1 = get_sample(resample=16000) # Define effects effects = [ ["lowpass", "-1", "300"], # apply single-pole lowpass filter ["speed", "0.8"], # reduce the speed # This only changes sample rate, so it is necessary to # add `rate` effect with original sample rate after this. ["rate", f"{sample_rate1}"], ["reverb", "-w"], # Reverbration gives some dramatic feeling ] # Apply effects waveform2, sample_rate2 = torchaudio.sox_effects.apply_effects_tensor(waveform1, sample_rate1, effects) print_stats(waveform1, sample_rate=sample_rate1, src="Original") print_stats(waveform2, sample_rate=sample_rate2, src="Effects Applied") .. rst-class:: sphx-glr-script-out Out: .. code-block:: none ---------- Source: Original ---------- Sample Rate: 16000 Shape: (1, 39680) Dtype: torch.float32 - Max: 0.507 - Min: -0.448 - Mean: -0.000 - Std Dev: 0.122 tensor([[ 0.0007, 0.0076, 0.0122, ..., -0.0049, -0.0025, 0.0020]]) ---------- Source: Effects Applied ---------- Sample Rate: 16000 Shape: (2, 49600) Dtype: torch.float32 - Max: 0.091 - Min: -0.091 - Mean: -0.000 - Std Dev: 0.021 tensor([[0.0000, 0.0000, 0.0000, ..., 0.0069, 0.0058, 0.0045], [0.0000, 0.0000, 0.0000, ..., 0.0085, 0.0085, 0.0085]]) .. GENERATED FROM PYTHON SOURCE LINES 234-238 Note that the number of frames and number of channels are different from those of the original after the effects are applied. Let’s listen to the audio. .. GENERATED FROM PYTHON SOURCE LINES 240-243 Original: ~~~~~~~~~ .. GENERATED FROM PYTHON SOURCE LINES 243-248 .. code-block:: default plot_waveform(waveform1, sample_rate1, title="Original", xlim=(-0.1, 3.2)) plot_specgram(waveform1, sample_rate1, title="Original", xlim=(0, 3.04)) play_audio(waveform1, sample_rate1) .. rst-class:: sphx-glr-horizontal * .. image-sg:: /tutorials/images/sphx_glr_audio_data_augmentation_tutorial_001.png :alt: Original :srcset: /tutorials/images/sphx_glr_audio_data_augmentation_tutorial_001.png :class: sphx-glr-multi-img * .. image-sg:: /tutorials/images/sphx_glr_audio_data_augmentation_tutorial_002.png :alt: Original :srcset: /tutorials/images/sphx_glr_audio_data_augmentation_tutorial_002.png :class: sphx-glr-multi-img .. raw:: html


.. GENERATED FROM PYTHON SOURCE LINES 249-252 Effects applied: ~~~~~~~~~~~~~~~~ .. GENERATED FROM PYTHON SOURCE LINES 252-257 .. code-block:: default plot_waveform(waveform2, sample_rate2, title="Effects Applied", xlim=(-0.1, 3.2)) plot_specgram(waveform2, sample_rate2, title="Effects Applied", xlim=(0, 3.04)) play_audio(waveform2, sample_rate2) .. rst-class:: sphx-glr-horizontal * .. image-sg:: /tutorials/images/sphx_glr_audio_data_augmentation_tutorial_003.png :alt: Effects Applied :srcset: /tutorials/images/sphx_glr_audio_data_augmentation_tutorial_003.png :class: sphx-glr-multi-img * .. image-sg:: /tutorials/images/sphx_glr_audio_data_augmentation_tutorial_004.png :alt: Effects Applied :srcset: /tutorials/images/sphx_glr_audio_data_augmentation_tutorial_004.png :class: sphx-glr-multi-img .. raw:: html


.. GENERATED FROM PYTHON SOURCE LINES 258-260 Doesn’t it sound more dramatic? .. GENERATED FROM PYTHON SOURCE LINES 262-277 Simulating room reverberation ----------------------------- `Convolution reverb `__ is a technique that's used to make clean audio sound as though it has been produced in a different environment. Using Room Impulse Response (RIR), for instance, we can make clean speech sound as though it has been uttered in a conference room. For this process, we need RIR data. The following data are from the VOiCES dataset, but you can record your own — just turn on your microphone and clap your hands. .. GENERATED FROM PYTHON SOURCE LINES 277-287 .. code-block:: default sample_rate = 8000 rir_raw, _ = get_rir_sample(resample=sample_rate) plot_waveform(rir_raw, sample_rate, title="Room Impulse Response (raw)", ylim=None) plot_specgram(rir_raw, sample_rate, title="Room Impulse Response (raw)") play_audio(rir_raw, sample_rate) .. rst-class:: sphx-glr-horizontal * .. image-sg:: /tutorials/images/sphx_glr_audio_data_augmentation_tutorial_005.png :alt: Room Impulse Response (raw) :srcset: /tutorials/images/sphx_glr_audio_data_augmentation_tutorial_005.png :class: sphx-glr-multi-img * .. image-sg:: /tutorials/images/sphx_glr_audio_data_augmentation_tutorial_006.png :alt: Room Impulse Response (raw) :srcset: /tutorials/images/sphx_glr_audio_data_augmentation_tutorial_006.png :class: sphx-glr-multi-img .. raw:: html


.. GENERATED FROM PYTHON SOURCE LINES 288-291 First, we need to clean up the RIR. We extract the main impulse, normalize the signal power, then flip along the time axis. .. GENERATED FROM PYTHON SOURCE LINES 291-299 .. code-block:: default rir = rir_raw[:, int(sample_rate * 1.01) : int(sample_rate * 1.3)] rir = rir / torch.norm(rir, p=2) rir = torch.flip(rir, [1]) print_stats(rir) plot_waveform(rir, sample_rate, title="Room Impulse Response", ylim=None) .. image-sg:: /tutorials/images/sphx_glr_audio_data_augmentation_tutorial_007.png :alt: Room Impulse Response :srcset: /tutorials/images/sphx_glr_audio_data_augmentation_tutorial_007.png :class: sphx-glr-single-img .. rst-class:: sphx-glr-script-out Out: .. code-block:: none Shape: (1, 2320) Dtype: torch.float32 - Max: 0.395 - Min: -0.286 - Mean: -0.000 - Std Dev: 0.021 tensor([[-0.0052, -0.0076, -0.0071, ..., 0.0184, 0.0173, 0.0070]]) .. GENERATED FROM PYTHON SOURCE LINES 300-302 Then, we convolve the speech signal with the RIR filter. .. GENERATED FROM PYTHON SOURCE LINES 302-308 .. code-block:: default speech, _ = get_speech_sample(resample=sample_rate) speech_ = torch.nn.functional.pad(speech, (rir.shape[1] - 1, 0)) augmented = torch.nn.functional.conv1d(speech_[None, ...], rir[None, ...])[0] .. GENERATED FROM PYTHON SOURCE LINES 309-312 Original: ~~~~~~~~~ .. GENERATED FROM PYTHON SOURCE LINES 312-317 .. code-block:: default plot_waveform(speech, sample_rate, title="Original", ylim=None) plot_specgram(speech, sample_rate, title="Original") play_audio(speech, sample_rate) .. rst-class:: sphx-glr-horizontal * .. image-sg:: /tutorials/images/sphx_glr_audio_data_augmentation_tutorial_008.png :alt: Original :srcset: /tutorials/images/sphx_glr_audio_data_augmentation_tutorial_008.png :class: sphx-glr-multi-img * .. image-sg:: /tutorials/images/sphx_glr_audio_data_augmentation_tutorial_009.png :alt: Original :srcset: /tutorials/images/sphx_glr_audio_data_augmentation_tutorial_009.png :class: sphx-glr-multi-img .. raw:: html


.. GENERATED FROM PYTHON SOURCE LINES 318-321 RIR applied: ~~~~~~~~~~~~ .. GENERATED FROM PYTHON SOURCE LINES 321-327 .. code-block:: default plot_waveform(augmented, sample_rate, title="RIR Applied", ylim=None) plot_specgram(augmented, sample_rate, title="RIR Applied") play_audio(augmented, sample_rate) .. rst-class:: sphx-glr-horizontal * .. image-sg:: /tutorials/images/sphx_glr_audio_data_augmentation_tutorial_010.png :alt: RIR Applied :srcset: /tutorials/images/sphx_glr_audio_data_augmentation_tutorial_010.png :class: sphx-glr-multi-img * .. image-sg:: /tutorials/images/sphx_glr_audio_data_augmentation_tutorial_011.png :alt: RIR Applied :srcset: /tutorials/images/sphx_glr_audio_data_augmentation_tutorial_011.png :class: sphx-glr-multi-img .. raw:: html


.. GENERATED FROM PYTHON SOURCE LINES 328-340 Adding background noise ----------------------- To add background noise to audio data, you can simply add a noise Tensor to the Tensor representing the audio data. A common method to adjust the intensity of noise is changing the Signal-to-Noise Ratio (SNR). [`wikipedia `__] $$ \\mathrm{SNR} = \\frac{P_{signal}}{P_{noise}} $$ $$ \\mathrm{SNR_{dB}} = 10 \\log _{{10}} \\mathrm {SNR} $$ .. GENERATED FROM PYTHON SOURCE LINES 340-357 .. code-block:: default sample_rate = 8000 speech, _ = get_speech_sample(resample=sample_rate) noise, _ = get_noise_sample(resample=sample_rate) noise = noise[:, : speech.shape[1]] speech_power = speech.norm(p=2) noise_power = noise.norm(p=2) snr_dbs = [20, 10, 3] noisy_speeches = [] for snr_db in snr_dbs: snr = math.exp(snr_db / 10) scale = snr * noise_power / speech_power noisy_speeches.append((scale * speech + noise) / 2) .. GENERATED FROM PYTHON SOURCE LINES 358-361 Background noise: ~~~~~~~~~~~~~~~~~ .. GENERATED FROM PYTHON SOURCE LINES 361-366 .. code-block:: default plot_waveform(noise, sample_rate, title="Background noise") plot_specgram(noise, sample_rate, title="Background noise") play_audio(noise, sample_rate) .. rst-class:: sphx-glr-horizontal * .. image-sg:: /tutorials/images/sphx_glr_audio_data_augmentation_tutorial_012.png :alt: Background noise :srcset: /tutorials/images/sphx_glr_audio_data_augmentation_tutorial_012.png :class: sphx-glr-multi-img * .. image-sg:: /tutorials/images/sphx_glr_audio_data_augmentation_tutorial_013.png :alt: Background noise :srcset: /tutorials/images/sphx_glr_audio_data_augmentation_tutorial_013.png :class: sphx-glr-multi-img .. raw:: html


.. GENERATED FROM PYTHON SOURCE LINES 367-370 SNR 20 dB: ~~~~~~~~~~ .. GENERATED FROM PYTHON SOURCE LINES 370-376 .. code-block:: default snr_db, noisy_speech = snr_dbs[0], noisy_speeches[0] plot_waveform(noisy_speech, sample_rate, title=f"SNR: {snr_db} [dB]") plot_specgram(noisy_speech, sample_rate, title=f"SNR: {snr_db} [dB]") play_audio(noisy_speech, sample_rate) .. rst-class:: sphx-glr-horizontal * .. image-sg:: /tutorials/images/sphx_glr_audio_data_augmentation_tutorial_014.png :alt: SNR: 20 [dB] :srcset: /tutorials/images/sphx_glr_audio_data_augmentation_tutorial_014.png :class: sphx-glr-multi-img * .. image-sg:: /tutorials/images/sphx_glr_audio_data_augmentation_tutorial_015.png :alt: SNR: 20 [dB] :srcset: /tutorials/images/sphx_glr_audio_data_augmentation_tutorial_015.png :class: sphx-glr-multi-img .. raw:: html


.. GENERATED FROM PYTHON SOURCE LINES 377-380 SNR 10 dB: ~~~~~~~~~~ .. GENERATED FROM PYTHON SOURCE LINES 380-386 .. code-block:: default snr_db, noisy_speech = snr_dbs[1], noisy_speeches[1] plot_waveform(noisy_speech, sample_rate, title=f"SNR: {snr_db} [dB]") plot_specgram(noisy_speech, sample_rate, title=f"SNR: {snr_db} [dB]") play_audio(noisy_speech, sample_rate) .. rst-class:: sphx-glr-horizontal * .. image-sg:: /tutorials/images/sphx_glr_audio_data_augmentation_tutorial_016.png :alt: SNR: 10 [dB] :srcset: /tutorials/images/sphx_glr_audio_data_augmentation_tutorial_016.png :class: sphx-glr-multi-img * .. image-sg:: /tutorials/images/sphx_glr_audio_data_augmentation_tutorial_017.png :alt: SNR: 10 [dB] :srcset: /tutorials/images/sphx_glr_audio_data_augmentation_tutorial_017.png :class: sphx-glr-multi-img .. raw:: html


.. GENERATED FROM PYTHON SOURCE LINES 387-390 SNR 3 dB: ~~~~~~~~~~ .. GENERATED FROM PYTHON SOURCE LINES 390-396 .. code-block:: default snr_db, noisy_speech = snr_dbs[2], noisy_speeches[2] plot_waveform(noisy_speech, sample_rate, title=f"SNR: {snr_db} [dB]") plot_specgram(noisy_speech, sample_rate, title=f"SNR: {snr_db} [dB]") play_audio(noisy_speech, sample_rate) .. rst-class:: sphx-glr-horizontal * .. image-sg:: /tutorials/images/sphx_glr_audio_data_augmentation_tutorial_018.png :alt: SNR: 3 [dB] :srcset: /tutorials/images/sphx_glr_audio_data_augmentation_tutorial_018.png :class: sphx-glr-multi-img * .. image-sg:: /tutorials/images/sphx_glr_audio_data_augmentation_tutorial_019.png :alt: SNR: 3 [dB] :srcset: /tutorials/images/sphx_glr_audio_data_augmentation_tutorial_019.png :class: sphx-glr-multi-img .. raw:: html


.. GENERATED FROM PYTHON SOURCE LINES 397-405 Applying codec to Tensor object ------------------------------- :py:func:`torchaudio.functional.apply_codec` can apply codecs to a Tensor object. **Note** This process is not differentiable. .. GENERATED FROM PYTHON SOURCE LINES 405-423 .. code-block:: default waveform, sample_rate = get_speech_sample(resample=8000) plot_specgram(waveform, sample_rate, title="Original") configs = [ ({"format": "wav", "encoding": "ULAW", "bits_per_sample": 8}, "8 bit mu-law"), ({"format": "gsm"}, "GSM-FR"), ({"format": "mp3", "compression": -9}, "MP3"), ({"format": "vorbis", "compression": -1}, "Vorbis"), ] waveforms = [] for param, title in configs: augmented = F.apply_codec(waveform, sample_rate, **param) plot_specgram(augmented, sample_rate, title=title) waveforms.append(augmented) .. rst-class:: sphx-glr-horizontal * .. image-sg:: /tutorials/images/sphx_glr_audio_data_augmentation_tutorial_020.png :alt: Original :srcset: /tutorials/images/sphx_glr_audio_data_augmentation_tutorial_020.png :class: sphx-glr-multi-img * .. image-sg:: /tutorials/images/sphx_glr_audio_data_augmentation_tutorial_021.png :alt: 8 bit mu-law :srcset: /tutorials/images/sphx_glr_audio_data_augmentation_tutorial_021.png :class: sphx-glr-multi-img * .. image-sg:: /tutorials/images/sphx_glr_audio_data_augmentation_tutorial_022.png :alt: GSM-FR :srcset: /tutorials/images/sphx_glr_audio_data_augmentation_tutorial_022.png :class: sphx-glr-multi-img * .. image-sg:: /tutorials/images/sphx_glr_audio_data_augmentation_tutorial_023.png :alt: MP3 :srcset: /tutorials/images/sphx_glr_audio_data_augmentation_tutorial_023.png :class: sphx-glr-multi-img * .. image-sg:: /tutorials/images/sphx_glr_audio_data_augmentation_tutorial_024.png :alt: Vorbis :srcset: /tutorials/images/sphx_glr_audio_data_augmentation_tutorial_024.png :class: sphx-glr-multi-img .. GENERATED FROM PYTHON SOURCE LINES 424-427 Original: ~~~~~~~~~ .. GENERATED FROM PYTHON SOURCE LINES 427-430 .. code-block:: default play_audio(waveform, sample_rate) .. raw:: html


.. GENERATED FROM PYTHON SOURCE LINES 431-434 8 bit mu-law: ~~~~~~~~~~~~~ .. GENERATED FROM PYTHON SOURCE LINES 434-437 .. code-block:: default play_audio(waveforms[0], sample_rate) .. raw:: html


.. GENERATED FROM PYTHON SOURCE LINES 438-441 GSM-FR: ~~~~~~~ .. GENERATED FROM PYTHON SOURCE LINES 441-444 .. code-block:: default play_audio(waveforms[1], sample_rate) .. raw:: html


.. GENERATED FROM PYTHON SOURCE LINES 445-448 MP3: ~~~~ .. GENERATED FROM PYTHON SOURCE LINES 448-451 .. code-block:: default play_audio(waveforms[2], sample_rate) .. raw:: html


.. GENERATED FROM PYTHON SOURCE LINES 452-455 Vorbis: ~~~~~~~ .. GENERATED FROM PYTHON SOURCE LINES 455-458 .. code-block:: default play_audio(waveforms[3], sample_rate) .. raw:: html


.. GENERATED FROM PYTHON SOURCE LINES 459-466 Simulating a phone recoding --------------------------- Combining the previous techniques, we can simulate audio that sounds like a person talking over a phone in a echoey room with people talking in the background. .. GENERATED FROM PYTHON SOURCE LINES 466-518 .. code-block:: default sample_rate = 16000 original_speech, _ = get_speech_sample(resample=sample_rate) plot_specgram(original_speech, sample_rate, title="Original") # Apply RIR rir, _ = get_rir_sample(resample=sample_rate, processed=True) speech_ = torch.nn.functional.pad(original_speech, (rir.shape[1] - 1, 0)) rir_applied = torch.nn.functional.conv1d(speech_[None, ...], rir[None, ...])[0] plot_specgram(rir_applied, sample_rate, title="RIR Applied") # Add background noise # Because the noise is recorded in the actual environment, we consider that # the noise contains the acoustic feature of the environment. Therefore, we add # the noise after RIR application. noise, _ = get_noise_sample(resample=sample_rate) noise = noise[:, : rir_applied.shape[1]] snr_db = 8 scale = math.exp(snr_db / 10) * noise.norm(p=2) / rir_applied.norm(p=2) bg_added = (scale * rir_applied + noise) / 2 plot_specgram(bg_added, sample_rate, title="BG noise added") # Apply filtering and change sample rate filtered, sample_rate2 = torchaudio.sox_effects.apply_effects_tensor( bg_added, sample_rate, effects=[ ["lowpass", "4000"], [ "compand", "0.02,0.05", "-60,-60,-30,-10,-20,-8,-5,-8,-2,-8", "-8", "-7", "0.05", ], ["rate", "8000"], ], ) plot_specgram(filtered, sample_rate2, title="Filtered") # Apply telephony codec codec_applied = F.apply_codec(filtered, sample_rate2, format="gsm") plot_specgram(codec_applied, sample_rate2, title="GSM Codec Applied") .. rst-class:: sphx-glr-horizontal * .. image-sg:: /tutorials/images/sphx_glr_audio_data_augmentation_tutorial_025.png :alt: Original :srcset: /tutorials/images/sphx_glr_audio_data_augmentation_tutorial_025.png :class: sphx-glr-multi-img * .. image-sg:: /tutorials/images/sphx_glr_audio_data_augmentation_tutorial_026.png :alt: RIR Applied :srcset: /tutorials/images/sphx_glr_audio_data_augmentation_tutorial_026.png :class: sphx-glr-multi-img * .. image-sg:: /tutorials/images/sphx_glr_audio_data_augmentation_tutorial_027.png :alt: BG noise added :srcset: /tutorials/images/sphx_glr_audio_data_augmentation_tutorial_027.png :class: sphx-glr-multi-img * .. image-sg:: /tutorials/images/sphx_glr_audio_data_augmentation_tutorial_028.png :alt: Filtered :srcset: /tutorials/images/sphx_glr_audio_data_augmentation_tutorial_028.png :class: sphx-glr-multi-img * .. image-sg:: /tutorials/images/sphx_glr_audio_data_augmentation_tutorial_029.png :alt: GSM Codec Applied :srcset: /tutorials/images/sphx_glr_audio_data_augmentation_tutorial_029.png :class: sphx-glr-multi-img .. GENERATED FROM PYTHON SOURCE LINES 519-522 Original speech: ~~~~~~~~~~~~~~~~ .. GENERATED FROM PYTHON SOURCE LINES 522-525 .. code-block:: default play_audio(original_speech, sample_rate) .. raw:: html


.. GENERATED FROM PYTHON SOURCE LINES 526-529 RIR applied: ~~~~~~~~~~~~ .. GENERATED FROM PYTHON SOURCE LINES 529-532 .. code-block:: default play_audio(rir_applied, sample_rate) .. raw:: html


.. GENERATED FROM PYTHON SOURCE LINES 533-536 Background noise added: ~~~~~~~~~~~~~~~~~~~~~~~ .. GENERATED FROM PYTHON SOURCE LINES 536-539 .. code-block:: default play_audio(bg_added, sample_rate) .. raw:: html


.. GENERATED FROM PYTHON SOURCE LINES 540-543 Filtered: ~~~~~~~~~ .. GENERATED FROM PYTHON SOURCE LINES 543-546 .. code-block:: default play_audio(filtered, sample_rate2) .. raw:: html


.. GENERATED FROM PYTHON SOURCE LINES 547-550 Codec aplied: ~~~~~~~~~~~~~ .. GENERATED FROM PYTHON SOURCE LINES 550-552 .. code-block:: default play_audio(codec_applied, sample_rate2) .. raw:: html


.. rst-class:: sphx-glr-timing **Total running time of the script:** ( 0 minutes 7.864 seconds) .. _sphx_glr_download_tutorials_audio_data_augmentation_tutorial.py: .. only :: html .. container:: sphx-glr-footer :class: sphx-glr-footer-example .. container:: sphx-glr-download sphx-glr-download-python :download:`Download Python source code: audio_data_augmentation_tutorial.py ` .. container:: sphx-glr-download sphx-glr-download-jupyter :download:`Download Jupyter notebook: audio_data_augmentation_tutorial.ipynb ` .. only:: html .. rst-class:: sphx-glr-signature `Gallery generated by Sphinx-Gallery `_