python/load_MITBIH.py

#!/usr/bin/env python

"""
load_MITBIH.py

Download .csv files and annotations from:
    kaggle.com/mondejar/mitbih-database

VARPA, University of Coruna
Mondejar Guerra, Victor M.
23 Oct 2017
"""

import os
import csv
import gc
import cPickle as pickle

import numpy as np
import matplotlib.pyplot as plt
from scipy.signal import medfilt
import scipy.stats
import pywt
import time
import sklearn
from sklearn import decomposition
from sklearn.decomposition import PCA, IncrementalPCA

from features_ECG import *

from numpy.polynomial.hermite import hermfit, hermval


def create_features_labels_name(DS, winL, winR, do_preprocess, maxRR, use_RR, norm_RR, compute_morph, db_path, reduced_DS, leads_flag):
    
    features_labels_name = db_path + 'features/' + 'w_' + str(winL) + '_' + str(winR) + '_' + DS 

    if do_preprocess:
        features_labels_name += '_rm_bsline'

    if maxRR:
        features_labels_name += '_maxRR'

    if use_RR:
        features_labels_name += '_RR'
    
    if norm_RR:
        features_labels_name += '_norm_RR'

    for descp in compute_morph:
        features_labels_name += '_' + descp

    if reduced_DS:
        features_labels_name += '_reduced'
        
    if leads_flag[0] == 1:
        features_labels_name += '_MLII'

    if leads_flag[1] == 1:
        features_labels_name += '_V1'

    features_labels_name += '.p'

    return features_labels_name


def save_wvlt_PCA(PCA, pca_k, family, level):
    f = open('Wvlt_' + family + '_' + str(level) + '_PCA_' + str(pca_k) + '.p', 'wb')
    pickle.dump(PCA, f, 2)
    f.close


def load_wvlt_PCA(pca_k, family, level):
    f = open('Wvlt_' + family + '_' + str(level) + '_PCA_' + str(pca_k) + '.p', 'rb')
    # disable garbage collector       
    gc.disable()# this improve the required loading time!
    PCA = pickle.load(f)
    gc.enable()
    f.close()

    return PCA
# Load the data with the configuration and features selected
# Params:
# - leads_flag = [MLII, V1] set the value to 0 or 1 to reference if that lead is used
# - reduced_DS = load DS1, DS2 patients division (Chazal) or reduced version, 
#                i.e., only patients in common that contains both MLII and V1 
def load_mit_db(DS, winL, winR, do_preprocess, maxRR, use_RR, norm_RR, compute_morph, db_path, reduced_DS, leads_flag):

    features_labels_name = create_features_labels_name(DS, winL, winR, do_preprocess, maxRR, use_RR, norm_RR, compute_morph, db_path, reduced_DS, leads_flag) 

    if os.path.isfile(features_labels_name):
        print("Loading pickle: " + features_labels_name + "...")
        f = open(features_labels_name, 'rb')
        # disable garbage collector       
        gc.disable()# this improve the required loading time!
        features, labels, patient_num_beats = pickle.load(f)
        gc.enable()
        f.close()


    else:
        print("Loading MIT BIH arr (" + DS + ") ...")

        # ML-II
        if reduced_DS == False:
            DS1 = [101, 106, 108, 109, 112, 114, 115, 116, 118, 119, 122, 124, 201, 203, 205, 207, 208, 209, 215, 220, 223, 230]
            DS2 = [100, 103, 105, 111, 113, 117, 121, 123, 200, 202, 210, 212, 213, 214, 219, 221, 222, 228, 231, 232, 233, 234]

        # ML-II + V1
        else:
            DS1 = [101, 106, 108, 109, 112, 115, 118, 119, 201, 203, 205, 207, 208, 209, 215, 220, 223, 230]
            DS2 = [105, 111, 113, 121, 200, 202, 210, 212, 213, 214, 219, 221, 222, 228, 231, 232, 233, 234]

        mit_pickle_name = db_path + 'python_mit'
        if reduced_DS:
            mit_pickle_name = mit_pickle_name + '_reduced_'

        if do_preprocess:
            mit_pickle_name = mit_pickle_name + '_rm_bsline'

        mit_pickle_name = mit_pickle_name + '_wL_' + str(winL) + '_wR_' + str(winR)
        mit_pickle_name = mit_pickle_name + '_' + DS + '.p'

        # If the data with that configuration has been already computed Load pickle
        if os.path.isfile(mit_pickle_name):
            f = open(mit_pickle_name, 'rb')
            # disable garbage collector       
            gc.disable()# this improve the required loading time!
            my_db = pickle.load(f)
            gc.enable()
            f.close()
        else: # Load data and compute de RR features 
            if DS == 'DS1':
                my_db = load_signal(DS1, winL, winR, do_preprocess)
            else:
                my_db = load_signal(DS2, winL, winR, do_preprocess)

            print("Saving signal processed data ...")
            # Save data
            # Protocol version 0 itr_features_balanceds the original ASCII protocol and is backwards compatible with earlier versions of Python.
            # Protocol version 1 is the old binary format which is also compatible with earlier versions of Python.
            # Protocol version 2 was introduced in Python 2.3. It provides much more efficient pickling of new-style classes.
            f = open(mit_pickle_name, 'wb')
            pickle.dump(my_db, f, 2)
            f.close

        
        features = np.array([], dtype=float)
        labels = np.array([], dtype=np.int32)

        # This array contains the number of beats for each patient (for cross_val)
        patient_num_beats = np.array([], dtype=np.int32)
        for p in range(len(my_db.beat)):
            patient_num_beats = np.append(patient_num_beats, len(my_db.beat[p]))

        # Compute RR features
        if use_RR or norm_RR:
            if DS == 'DS1':
                RR = [RR_intervals() for i in range(len(DS1))]
            else:
                RR = [RR_intervals() for i in range(len(DS2))]

            print("Computing RR intervals ...")

            for p in range(len(my_db.beat)):
                if maxRR:
                    RR[p] = compute_RR_intervals(my_db.R_pos[p])
                else:
                    RR[p] = compute_RR_intervals(my_db.orig_R_pos[p])
                    
                RR[p].pre_R = RR[p].pre_R[(my_db.valid_R[p] == 1)]
                RR[p].post_R = RR[p].post_R[(my_db.valid_R[p] == 1)]
                RR[p].local_R = RR[p].local_R[(my_db.valid_R[p] == 1)]
                RR[p].global_R = RR[p].global_R[(my_db.valid_R[p] == 1)]


        if use_RR:
            f_RR = np.empty((0,4))
            for p in range(len(RR)):
                row = np.column_stack((RR[p].pre_R, RR[p].post_R, RR[p].local_R, RR[p].global_R))
                f_RR = np.vstack((f_RR, row))

            features = np.column_stack((features, f_RR)) if features.size else f_RR
        
        if norm_RR:
            f_RR_norm = np.empty((0,4))
            for p in range(len(RR)):
                # Compute avg values!
                avg_pre_R = np.average(RR[p].pre_R)
                avg_post_R = np.average(RR[p].post_R)
                avg_local_R = np.average(RR[p].local_R)
                avg_global_R = np.average(RR[p].global_R)

                row = np.column_stack((RR[p].pre_R / avg_pre_R, RR[p].post_R / avg_post_R, RR[p].local_R / avg_local_R, RR[p].global_R / avg_global_R))
                f_RR_norm = np.vstack((f_RR_norm, row))

            features = np.column_stack((features, f_RR_norm))  if features.size else f_RR_norm

        #########################################################################################
        # Compute morphological features
        print("Computing morphological features (" + DS + ") ...")

        num_leads = np.sum(leads_flag)

        # Raw
        if 'resample_10' in compute_morph:
            print("Resample_10 ...")
            start = time.time()

            f_raw = np.empty((0, 10 * num_leads))

            for p in range(len(my_db.beat)):
                for beat in my_db.beat[p]:
                    f_raw_lead = np.empty([])
                    for s in range(2):
                        if leads_flag[s] == 1:
                            resamp_beat = scipy.signal.resample(beat[s], 10)
                            if f_raw_lead.size == 1:
                                f_raw_lead =  resamp_beat
                            else:
                                f_raw_lead = np.hstack((f_raw_lead, resamp_beat))
                    f_raw = np.vstack((f_raw, f_raw_lead))

            features = np.column_stack((features, f_raw))  if features.size else f_raw

            end = time.time()
            print("Time resample: " + str(format(end - start, '.2f')) + " sec" )

        if 'raw' in compute_morph:
            print("Raw ...")
            start = time.time()

            f_raw = np.empty((0, (winL + winR) * num_leads))

            for p in range(len(my_db.beat)):
                for beat in my_db.beat[p]:
                    f_raw_lead = np.empty([])
                    for s in range(2):
                        if leads_flag[s] == 1:
                            if f_raw_lead.size == 1:
                                f_raw_lead =  beat[s]
                            else:
                                f_raw_lead = np.hstack((f_raw_lead, beat[s]))
                    f_raw = np.vstack((f_raw, f_raw_lead))

            features = np.column_stack((features, f_raw))  if features.size else f_raw

            end = time.time()
            print("Time raw: " + str(format(end - start, '.2f')) + " sec" )
        # LBP 1D
        # 1D-local binary pattern based feature extraction for classification of epileptic EEG signals: 2014, unas 55 citas, Q2-Q1 Matematicas
        # https://ac.els-cdn.com/S0096300314008285/1-s2.0-S0096300314008285-main.pdf?_tid=8a8433a6-e57f-11e7-98ec-00000aab0f6c&acdnat=1513772341_eb5d4d26addb6c0b71ded4fd6cc23ed5

        # 1D-LBP method, which derived from implementation steps of 2D-LBP, was firstly proposed by Chatlani et al. for detection of speech signals that is non-stationary in nature [23]

        # From Raw signal

        # TODO: Some kind of preprocesing or clean high frequency noise?

        # Compute 2 Histograms: LBP or Uniform LBP
        # LBP 8 = 0-255
        # U-LBP 8 = 0-58
        # Uniform LBP are only those pattern wich only presents 2 (or less) transitions from 0-1 or 1-0
        # All the non-uniform patterns are asigned to the same value in the histogram

        if 'u-lbp' in compute_morph:
            print("u-lbp ...")

            f_lbp = np.empty((0, 59 * num_leads))

            for p in range(len(my_db.beat)):
                for beat in my_db.beat[p]:
                    f_lbp_lead = np.empty([])
                    for s in range(2):
                        if leads_flag[s] == 1:
                            if f_lbp_lead.size == 1:

                                f_lbp_lead = compute_Uniform_LBP(beat[s], 8)
                            else:
                                f_lbp_lead = np.hstack((f_lbp_lead, compute_Uniform_LBP(beat[s], 8)))
                    f_lbp = np.vstack((f_lbp, f_lbp_lead))

            features = np.column_stack((features, f_lbp))  if features.size else f_lbp
            print(features.shape)

        if 'lbp' in compute_morph:
            print("lbp ...")

            f_lbp = np.empty((0, 16 * num_leads))

            for p in range(len(my_db.beat)):
                for beat in my_db.beat[p]:
                    f_lbp_lead = np.empty([])
                    for s in range(2):
                        if leads_flag[s] == 1:
                            if f_lbp_lead.size == 1:

                                f_lbp_lead = compute_LBP(beat[s], 4)
                            else:
                                f_lbp_lead = np.hstack((f_lbp_lead, compute_LBP(beat[s], 4)))
                    f_lbp = np.vstack((f_lbp, f_lbp_lead))

            features = np.column_stack((features, f_lbp))  if features.size else f_lbp
            print(features.shape)


        if 'hbf5' in compute_morph:
            print("hbf ...")

            f_hbf = np.empty((0, 15 * num_leads))

            for p in range(len(my_db.beat)):
                for beat in my_db.beat[p]:
                    f_hbf_lead = np.empty([])
                    for s in range(2):
                        if leads_flag[s] == 1:
                            if f_hbf_lead.size == 1:

                                f_hbf_lead = compute_HBF(beat[s])
                            else:
                                f_hbf_lead = np.hstack((f_hbf_lead, compute_HBF(beat[s])))
                    f_hbf = np.vstack((f_hbf, f_hbf_lead))

            features = np.column_stack((features, f_hbf))  if features.size else f_hbf
            print(features.shape)

        # Wavelets
        if 'wvlt' in compute_morph:
            print("Wavelets ...")

            f_wav = np.empty((0, 23 * num_leads))

            for p in range(len(my_db.beat)):
                for b in my_db.beat[p]:
                    f_wav_lead = np.empty([])
                    for s in range(2):
                        if leads_flag[s] == 1:
                            if f_wav_lead.size == 1:
                                f_wav_lead =  compute_wavelet_descriptor(b[s], 'db1', 3)
                            else:
                                f_wav_lead = np.hstack((f_wav_lead, compute_wavelet_descriptor(b[s], 'db1', 3)))
                    f_wav = np.vstack((f_wav, f_wav_lead))
                    #f_wav = np.vstack((f_wav, compute_wavelet_descriptor(b,  'db1', 3)))

            features = np.column_stack((features, f_wav))  if features.size else f_wav


        # Wavelets
        if 'wvlt+pca' in compute_morph:
            pca_k = 7
            print("Wavelets + PCA ("+ str(pca_k) + "...")
            
            family = 'db1'
            level = 3

            f_wav = np.empty((0, 23 * num_leads))

            for p in range(len(my_db.beat)):
                for b in my_db.beat[p]:
                    f_wav_lead = np.empty([])
                    for s in range(2):
                        if leads_flag[s] == 1:
                            if f_wav_lead.size == 1:
                                f_wav_lead =  compute_wavelet_descriptor(b[s], family, level)
                            else:
                                f_wav_lead = np.hstack((f_wav_lead, compute_wavelet_descriptor(b[s], family, level)))
                    f_wav = np.vstack((f_wav, f_wav_lead))
                    #f_wav = np.vstack((f_wav, compute_wavelet_descriptor(b,  'db1', 3)))


            if DS == 'DS1':
                # Compute PCA
                #PCA = sklearn.decomposition.KernelPCA(pca_k) # gamma_pca
                IPCA = IncrementalPCA(n_components=pca_k, batch_size=10)# NOTE: due to memory errors, we employ IncrementalPCA
                IPCA.fit(f_wav) 

                # Save PCA
                save_wvlt_PCA(IPCA, pca_k, family, level)
            else:
                # Load PCAfrom sklearn.decomposition import PCA, IncrementalPCA
                IPCA = load_wvlt_PCA( pca_k, family, level)
            # Extract the PCA
            #f_wav_PCA = np.empty((0, pca_k * num_leads))
            f_wav_PCA = IPCA.transform(f_wav)
            features = np.column_stack((features, f_wav_PCA))  if features.size else f_wav_PCA

        # HOS
        if 'HOS' in compute_morph:
            print("HOS ...")
            n_intervals = 6
            lag = int(round( (winL + winR )/ n_intervals))

            f_HOS = np.empty((0, (n_intervals-1) * 2 * num_leads))
            for p in range(len(my_db.beat)):
                for b in my_db.beat[p]:
                    f_HOS_lead = np.empty([])
                    for s in range(2):
                        if leads_flag[s] == 1:
                            if f_HOS_lead.size == 1:
                                f_HOS_lead =  compute_hos_descriptor(b[s], n_intervals, lag)
                            else:
                                f_HOS_lead = np.hstack((f_HOS_lead, compute_hos_descriptor(b[s], n_intervals, lag)))
                    f_HOS = np.vstack((f_HOS, f_HOS_lead))
                    #f_HOS = np.vstack((f_HOS, compute_hos_descriptor(b, n_intervals, lag)))

            features = np.column_stack((features, f_HOS))  if features.size else f_HOS
            print(features.shape)

        # My morphological descriptor
        if 'myMorph' in compute_morph:
            print("My Descriptor ...")
            f_myMorhp = np.empty((0,4 * num_leads))
            for p in range(len(my_db.beat)):
                for b in my_db.beat[p]:
                    f_myMorhp_lead = np.empty([])
                    for s in range(2):
                        if leads_flag[s] == 1:
                            if f_myMorhp_lead.size == 1:
                                f_myMorhp_lead =  compute_my_own_descriptor(b[s], winL, winR)
                            else:
                                f_myMorhp_lead = np.hstack((f_myMorhp_lead, compute_my_own_descriptor(b[s], winL, winR)))
                    f_myMorhp = np.vstack((f_myMorhp, f_myMorhp_lead))
                    #f_myMorhp = np.vstack((f_myMorhp, compute_my_own_descriptor(b, winL, winR)))
                    
            features = np.column_stack((features, f_myMorhp))  if features.size else f_myMorhp

        
        labels = np.array(sum(my_db.class_ID, [])).flatten()
        print("labels")

        # Set labels array!
        print('writing pickle: ' +  features_labels_name + '...')
        f = open(features_labels_name, 'wb')
        pickle.dump([features, labels, patient_num_beats], f, 2)
        f.close

    return features, labels, patient_num_beats


# DS: contains the patient list for load
# winL, winR: indicates the size of the window centred at R-peak at left and right side
# do_preprocess: indicates if preprocesing of remove baseline on signal is performed
def load_signal(DS, winL, winR, do_preprocess):

    class_ID = [[] for i in range(len(DS))]
    beat = [[] for i in range(len(DS))] # record, beat, lead
    R_poses = [ np.array([]) for i in range(len(DS))]
    Original_R_poses = [ np.array([]) for i in range(len(DS))]   
    valid_R = [ np.array([]) for i in range(len(DS))]
    my_db = mit_db()
    patients = []

    # Lists 
    # beats = []
    # classes = []
    # valid_R = np.empty([])
    # R_poses = np.empty([])
    # Original_R_poses = np.empty([])

    size_RR_max = 20

    pathDB = '/home/mondejar/dataset/ECG/'
    DB_name = 'mitdb'
    fs = 360
    jump_lines = 1

    # Read files: signal (.csv )  annotations (.txt)    
    fRecords = list()
    fAnnotations = list()

    lst = os.listdir(pathDB + DB_name + "/csv")
    lst.sort()
    for file in lst:
        if file.endswith(".csv"):
            if int(file[0:3]) in DS:
                fRecords.append(file)
        elif file.endswith(".txt"):
            if int(file[0:3]) in DS:
                fAnnotations.append(file)        

    MITBIH_classes = ['N', 'L', 'R', 'e', 'j', 'A', 'a', 'J', 'S', 'V', 'E', 'F']#, 'P', '/', 'f', 'u']
    AAMI_classes = []
    AAMI_classes.append(['N', 'L', 'R'])                    # N
    AAMI_classes.append(['A', 'a', 'J', 'S', 'e', 'j'])     # SVEB 
    AAMI_classes.append(['V', 'E'])                         # VEB
    AAMI_classes.append(['F'])                              # F
    #AAMI_classes.append(['P', '/', 'f', 'u'])              # Q

    RAW_signals = []
    r_index = 0

    #for r, a in zip(fRecords, fAnnotations):
    for r in range(0, len(fRecords)):

        print("Processing signal " + str(r) + " / " + str(len(fRecords)) + "...")

        # 1. Read signalR_poses
        filename = pathDB + DB_name + "/csv/" + fRecords[r]
        print filename
        f = open(filename, 'rb')
        reader = csv.reader(f, delimiter=',')
        next(reader) # skip first line!
        MLII_index = 1
        V1_index = 2
        if int(fRecords[r][0:3]) == 114:
            MLII_index = 2
            V1_index = 1

        MLII = []
        V1 = []
        for row in reader:
            MLII.append((int(row[MLII_index])))
            V1.append((int(row[V1_index])))
        f.close()


        RAW_signals.append((MLII, V1)) ## NOTE a copy must be created in order to preserve the original signal
        # display_signal(MLII)

        # 2. Read annotations
        filename = pathDB + DB_name + "/csv/" + fAnnotations[r]
        print filename
        f = open(filename, 'rb')
        next(f) # skip first line!

        annotations = []
        for line in f:
            annotations.append(line)
        f.close
        # 3. Preprocessing signal!
        if do_preprocess:
            #scipy.signal
            # median_filter1D
            baseline = medfilt(MLII, 71) 
            baseline = medfilt(baseline, 215) 

            # Remove Baseline
            for i in range(0, len(MLII)):
                MLII[i] = MLII[i] - baseline[i]

            # TODO Remove High Freqs

            # median_filter1D
            baseline = medfilt(V1, 71) 
            baseline = medfilt(baseline, 215) 

            # Remove Baseline
            for i in range(0, len(V1)):
                V1[i] = V1[i] - baseline[i]


        # Extract the R-peaks from annotations
        for a in annotations:
            aS = a.split()
            
            pos = int(aS[1])
            originalPos = int(aS[1])
            classAnttd = aS[2]
            if pos > size_RR_max and pos < (len(MLII) - size_RR_max):
                index, value = max(enumerate(MLII[pos - size_RR_max : pos + size_RR_max]), key=operator.itemgetter(1))
                pos = (pos - size_RR_max) + index

            peak_type = 0
            #pos = pos-1
            
            if classAnttd in MITBIH_classes:
                if(pos > winL and pos < (len(MLII) - winR)):
                    beat[r].append( (MLII[pos - winL : pos + winR], V1[pos - winL : pos + winR]))
                    for i in range(0,len(AAMI_classes)):
                        if classAnttd in AAMI_classes[i]:
                            class_AAMI = i
                            break #exit loop
                    #convert class
                    class_ID[r].append(class_AAMI)

                    valid_R[r] = np.append(valid_R[r], 1)
                else:
                    valid_R[r] = np.append(valid_R[r], 0)
            else:
                valid_R[r] = np.append(valid_R[r], 0)
            
            R_poses[r] = np.append(R_poses[r], pos)
            Original_R_poses[r] = np.append(Original_R_poses[r], originalPos)
        
        #R_poses[r] = R_poses[r][(valid_R[r] == 1)]
        #Original_R_poses[r] = Original_R_poses[r][(valid_R[r] == 1)]

        
    # Set the data into a bigger struct that keep all the records!
    my_db.filename = fRecords

    my_db.raw_signal = RAW_signals
    my_db.beat = beat # record, beat, lead
    my_db.class_ID = class_ID
    my_db.valid_R = valid_R
    my_db.R_pos = R_poses
    my_db.orig_R_pos = Original_R_poses

    return my_db