In [1]:
import pandas as pd
import random
import matplotlib.pyplot as plt
from IPython.display import Audio

from dataset import AudioUtils
from utils import plot_label_distribution, plot_audio_waveform, plot_audio_spectogram
In [2]:
DATA_PATH = 'data'
METADATA_FILE = 'data/metadata_compiled.csv'
In [3]:
metadata_df = pd.read_csv(METADATA_FILE)

Part 1: Explore the dataset through code¶

a. How many samples does the dataset contain?¶

In [4]:
sample_count = metadata_df.shape[0]
labeled_count = metadata_df[metadata_df['status'].notna()].shape[0]
unlabeled_count = metadata_df[metadata_df['status'].isna()].shape[0]

print(f'Dataset contains {sample_count} samples in total. However, only {labeled_count} of those are labeled and {unlabeled_count} are unlabeled.')
Dataset contains 27550 samples in total. However, only 16224 of those are labeled and 11326 are unlabeled.

b. How many classes? How many samples per class? Show a histogram of the number of instances per class.¶

In [5]:
print('Columns in metadata file:', len(metadata_df.columns))
metadata_df.columns
Columns in metadata file: 51
Out[5]:
Index(['uuid', 'datetime', 'cough_detected', 'SNR', 'latitude', 'longitude',
       'age', 'gender', 'respiratory_condition', 'fever_muscle_pain', 'status',
       'quality_1', 'cough_type_1', 'dyspnea_1', 'wheezing_1', 'stridor_1',
       'choking_1', 'congestion_1', 'nothing_1', 'diagnosis_1', 'severity_1',
       'quality_2', 'cough_type_2', 'dyspnea_2', 'wheezing_2', 'stridor_2',
       'choking_2', 'congestion_2', 'nothing_2', 'diagnosis_2', 'severity_2',
       'quality_3', 'cough_type_3', 'dyspnea_3', 'wheezing_3', 'stridor_3',
       'choking_3', 'congestion_3', 'nothing_3', 'diagnosis_3', 'severity_3',
       'quality_4', 'cough_type_4', 'dyspnea_4', 'wheezing_4', 'stridor_4',
       'choking_4', 'congestion_4', 'nothing_4', 'diagnosis_4', 'severity_4'],
      dtype='object')

Of the 51 columns, we focused only on the status column. From it we can see that there are 3 classes (healthy, symptomatic, and COVID-19) in the dataset.

In [6]:
metadata_df['status'].value_counts()
Out[6]:
healthy        12479
symptomatic     2590
COVID-19        1155
Name: status, dtype: int64

The number of samples per class is shown in the histogram below.

In [7]:
plot_label_distribution(metadata_df['status'].value_counts(), title='Distribution of samples in dataset')

c. Play a random sample from each class.¶

Healthy sample¶

In [8]:
healthy_sample_uuid = random.choice(metadata_df[metadata_df['status'] == 'healthy']['uuid'].to_list())
signal, sample_rate, info = AudioUtils.load_audio_file(healthy_sample_uuid, DATA_PATH)

print(f'Loaded sample with sample rate {sample_rate} Hz and duration {info.num_frames / info.sample_rate:.2f} seconds.')

plot_audio_waveform(signal, sample_rate, title='Audio waveform of healthy sample')

Audio(signal, rate=sample_rate, embed=True)
Loaded sample with sample rate 48000 Hz and duration 8.70 seconds.
Out[8]:
(audio player)

Symptomatic sample¶

In [9]:
symptomatic_sample_uuid = random.choice(metadata_df[metadata_df['status'] == 'symptomatic']['uuid'].to_list())
signal, sample_rate, info = AudioUtils.load_audio_file(symptomatic_sample_uuid, DATA_PATH)

print(f'Loaded sample with sample rate {sample_rate} Hz and duration {info.num_frames / info.sample_rate:.2f} seconds.')

plot_audio_waveform(signal, sample_rate, title='Audio waveform of symptomatic sample')

Audio(signal, rate=sample_rate, embed=True)
Loaded sample with sample rate 48000 Hz and duration 5.70 seconds.
Out[9]:
(audio player)

COVID-19 sample¶

In [10]:
covid_sample_uuid = random.choice(metadata_df[metadata_df['status'] == 'COVID-19']['uuid'].to_list())
signal, sample_rate, info = AudioUtils.load_audio_file(covid_sample_uuid, DATA_PATH)

print(f'Loaded sample with sample rate {sample_rate} Hz and duration {info.num_frames / info.sample_rate:.2f} seconds.')

plot_audio_waveform(signal, sample_rate, title='Audio waveform of COVID-19 sample')

Audio(signal, rate=sample_rate, embed=True)
Loaded sample with sample rate 48000 Hz and duration 9.90 seconds.
Out[10]:
(audio player)

d. Describe if/how you think the data distribution will affect training of a classifier.¶

The dataset distribution is heavily imbalanced across the three classes: there are significantly more healthy samples than symptomatic or COVID-19 samples. This can skew the training of a classifier, biasing it toward predicting healthy more often than it should.
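To make the imbalance concrete, a trivial classifier that always predicts healthy already reaches the majority-class share as its accuracy. A quick sanity check on the labeled samples:

labeled_df = metadata_df[metadata_df['status'].notna()]
baseline_accuracy = (labeled_df['status'] == 'healthy').mean()
print(f'Always-predict-healthy baseline: {baseline_accuracy:.1%}')  # ~77% of labeled samples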

e. Decide what part of the dataset to use; all, some classes, some samples. Motivate your choice.¶

For training we selected only the samples whose status value was manually labeled by a professional, and we applied a threshold to the cough_detected value, trying threshold values from 0.6 to 0.9. We kept all three status labels: healthy, symptomatic, and COVID-19. (You can see the distribution of the labels after filtering in the histogram below.)
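For reference, one quick way to compare the candidate thresholds is to tabulate how many samples of each label survive each value; a minimal sketch:

for threshold in (0.6, 0.7, 0.8, 0.9):
    kept = metadata_df[metadata_df['cough_detected'] > threshold]
    print(threshold, kept['status'].value_counts().to_dict())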

Because of the high percentage of healthy labels in the dataset, most of our models overfitted to the healthy class and predicted only healthy labels. That is why we consistently got ~75% accuracy, which is roughly the share of healthy labels in the dataset.

To avoid a model that only predicts healthy, we downsampled the healthy and symptomatic labels so that each class has the same number of samples. However, equalizing across all classes leaves only about 750 samples per class for training, validation, and testing, which is why we also implemented data augmentation; see the augmentation appendix.

In [11]:
COUGH_DETECTED_THRESHOLD = 0.6
filtered_df = metadata_df[metadata_df['cough_detected'] > COUGH_DETECTED_THRESHOLD]

plot_label_distribution(filtered_df['status'].value_counts(), title='Distribution of samples after cough_detected filtering')
In [12]:
covid_samples_count = filtered_df[filtered_df['status'] == 'COVID-19'].shape[0]

equalized_df = pd.concat([
    filtered_df[filtered_df['status'] == 'healthy'].sample(n=covid_samples_count, random_state=42),
    filtered_df[filtered_df['status'] == 'symptomatic'].sample(n=covid_samples_count, random_state=42),
    filtered_df[filtered_df['status'] == 'COVID-19'],
])

equalized_df['status'].value_counts()
Out[12]:
healthy        756
symptomatic    756
COVID-19       756
Name: status, dtype: int64

After equalization we get the following label distribution.

In [13]:
plot_label_distribution(equalized_df['status'].value_counts(), title='Distribution of samples after equalization')

Appendix - Augmentation techniques¶

We used spectrogram augmentation techniques to increase the number of samples in the dataset: masking in both the time and frequency domain, and pitch shifting. We tried different mask sizes and pitch-shift values, as well as different combinations of the techniques.

Data augmentation like this is useful when a dataset is small and more samples are needed; here it let us equalize the number of samples across the classes at a higher count instead of only downsampling.

By training the model on inputs with parts of the frequency and time axes masked out, frequency and time masking help it learn features that are more robust to noise and to missing or corrupted regions of the input. Pitch shifting plays a similar role for pitch variation: the model sees the same cough at different pitches and becomes less sensitive to speaker-dependent differences.
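Our AudioUtils helpers are project-specific wrappers, but roughly equivalent masking and pitch shifting can be sketched with torchaudio's built-in transforms; the parameter values below are illustrative assumptions, not the exact ones inside AudioUtils:

import torch
import torchaudio.transforms as T

SR = 16000
waveform = torch.randn(1, SR * 10)  # stand-in for a loaded, resampled 10 s clip
mel_spec = T.MelSpectrogram(sample_rate=SR, n_mels=64, n_fft=1024)(waveform)

# SpecAugment-style masking: zero out a random band of mel bins and a random run of frames
freq_mask = T.FrequencyMasking(freq_mask_param=8)  # masks up to 8 consecutive mel bins
time_mask = T.TimeMasking(time_mask_param=25)      # masks up to 25 consecutive frames
masked_spec = time_mask(freq_mask(mel_spec))

# Pitch shifting operates on the raw waveform, here shifting up by 2 semitones
shifted_waveform = T.PitchShift(sample_rate=SR, n_steps=2)(waveform)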

In [14]:
LENGTH = 10.0
SAMPLE_RATE = 16000

signal, sample_rate, info = AudioUtils.load_audio_file(covid_sample_uuid, DATA_PATH)

signal = AudioUtils.resample(signal, sample_rate, target_sample_rate=SAMPLE_RATE)
signal = AudioUtils.rechannel(signal, channels=1)

signal = AudioUtils.resize(signal, duration=LENGTH, sample_rate=SAMPLE_RATE)

print(f'Signal shape: {signal.shape}')
Signal shape: torch.Size([1, 160000])

Spectrogram before augmentation¶

In [15]:
NUM_MEL_BINS = 64
NUM_FFTS = 1024

spectrogram = AudioUtils.get_spectrogram(signal, sample_rate=SAMPLE_RATE, n_mels=NUM_MEL_BINS, n_fft=NUM_FFTS)
print(f'Spectrogram shape: {spectrogram.shape}')

plot_audio_spectogram(spectrogram, LENGTH, title='Spectrogram of COVID-19 sample (dB)')
Spectrogram shape: torch.Size([1, 64, 313])

Applying time and frequency masking - min of the spectrogram¶

In [16]:
N_FREQ_MASKS = 3
N_TIME_MASKS = 2
MASKING_VAL = 'min'

masked_spectrogram = AudioUtils.spectrogram_masking(spectrogram, n_freq_masks=N_FREQ_MASKS, n_time_masks=N_TIME_MASKS, masking_val=MASKING_VAL)

plot_audio_spectogram(masked_spectrogram, LENGTH, title='Augmented spectrogram of COVID-19 sample (dB)')

Applying time and frequency masking - mean of the spectrogram¶

In [17]:
N_FREQ_MASKS = 3
N_TIME_MASKS = 2
MASKING_VAL = 'mean'

masked_spectrogram = AudioUtils.spectrogram_masking(spectrogram, n_freq_masks=N_FREQ_MASKS, n_time_masks=N_TIME_MASKS, masking_val=MASKING_VAL)

plot_audio_spectogram(masked_spectrogram, LENGTH, title='Augmented spectrogram of COVID-19 sample (dB)')

Applying pitch shifting to the signal¶

In [18]:
PITCH_SHIFT_MAX_VALUE = 20

pitched_signal = AudioUtils.signal_pitch_shift(signal, sample_rate=SAMPLE_RATE, pitch_max_value=PITCH_SHIFT_MAX_VALUE)
pitched_spectrogram = AudioUtils.get_spectrogram(pitched_signal, sample_rate=SAMPLE_RATE, n_mels=NUM_MEL_BINS, n_fft=NUM_FFTS)

plot_audio_spectogram(pitched_spectrogram, LENGTH, title='Augmented spectrogram of COVID-19 sample (dB)')

Combining all augmentation techniques¶

In [19]:
combined_signal = AudioUtils.signal_pitch_shift(signal, sample_rate=SAMPLE_RATE, pitch_max_value=PITCH_SHIFT_MAX_VALUE)
combined_spectrogram = AudioUtils.get_spectrogram(combined_signal, sample_rate=SAMPLE_RATE, n_mels=NUM_MEL_BINS, n_fft=NUM_FFTS)
combined_spectrogram = AudioUtils.spectrogram_masking(combined_spectrogram, n_freq_masks=N_FREQ_MASKS, n_time_masks=N_TIME_MASKS, masking_val=MASKING_VAL)

plot_audio_spectogram(combined_spectrogram, LENGTH, title='Augmented spectrogram of COVID-19 sample (dB)')

After the data augmentation we ended up with 4536 samples in total, or 1512 samples per class.
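As a sanity check, the final tally follows directly from the equalized counts, assuming one augmented copy per original sample (which is what the numbers imply):

originals_per_class = 756
samples_per_class = originals_per_class * 2  # originals plus one augmented copy = 1512
total_samples = samples_per_class * 3        # three classes = 4536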