Big Data Analysis / Python Applications

UCSD_ANOMALY_DATASET + LRCN

cheonbi 2019. 11. 11. 14:09

# default
import io
import pickle
import pandas as pd
import numpy as np
import re
import glob
import itertools

# dataset
from sklearn.datasets import load_iris, make_moons, load_breast_cancer
from keras.datasets import imdb, cifar10

# preprocessing
from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder, LabelBinarizer
from sklearn.model_selection import train_test_split
from keras.utils import np_utils, multi_gpu_model
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing import image
from keras.applications.imagenet_utils import preprocess_input
from sklearn.manifold import TSNE

# graphics
%matplotlib inline
import pydot
from IPython.core.display import Image
# from PIL import Image
from sklearn.tree import export_graphviz
import seaborn as sns
import mglearn
import matplotlib as mpl
import matplotlib.pyplot as plt

# general
import tensorflow as tf
from keras import layers
import keras.backend as K
from keras.layers import Input, Dense, Activation, Flatten, Dropout, BatchNormalization, Embedding, Reshape
from keras.layers.convolutional import Conv2D, Convolution2D, MaxPooling2D, AveragePooling2D, UpSampling2D
from keras.layers.convolutional import Conv3D, Convolution3D, MaxPooling3D, AveragePooling3D, UpSampling3D
from keras.layers import Conv2D, MaxPooling2D, GlobalMaxPooling2D, ZeroPadding2D, AveragePooling2D, GlobalAveragePooling2D
from keras.layers import LSTM, TimeDistributed, CuDNNLSTM
from keras.models import Model, load_model
from keras.optimizers import SGD
from keras.callbacks import EarlyStopping, ModelCheckpoint

# classification
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier

# evaluation
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from keras.metrics import top_k_categorical_accuracy

# customize
def plot_decision_regions(X, y, model, title):
    resolution = 0.01
    markers = ('s', '^', 'o')
    colors = ('red', 'blue', 'lightgreen')
    cmap = mpl.colors.ListedColormap(colors)

    x1_min, x1_max = X[:, 0].min() - 1, X[:, 0].max() + 1
    x2_min, x2_max = X[:, 1].min() - 1, X[:, 1].max() + 1
    xx1, xx2 = np.meshgrid(np.arange(x1_min, x1_max, resolution), np.arange(x2_min, x2_max, resolution))
    Z = model.predict(np.array([xx1.ravel(), xx2.ravel()]).T).reshape(xx1.shape)
    plt.contour(xx1, xx2, Z, cmap=mpl.colors.ListedColormap(['k']))
    plt.contourf(xx1, xx2, Z, alpha=0.4, cmap=cmap)
    plt.xlim(xx1.min(), xx1.max())
    plt.ylim(xx2.min(), xx2.max())

    for idx, cl in enumerate(np.unique(y)):
        plt.scatter(x=X[y == cl, 0], y=X[y == cl, 1], alpha=0.8,
        c=[cmap(idx)], marker=markers[idx], s=80, label=cl)
    # NOTE: axis labels assume a global `iris = load_iris()` object and that
    # X holds features 2 and 3 (see the usage sketch below)
    plt.xlabel(iris.feature_names[2])
    plt.ylabel(iris.feature_names[3])
    plt.legend(loc='upper left')
    plt.title(title)
    return Z
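
# Hedged usage sketch (illustrative, not part of the original post): the function
# expects a classifier fitted on exactly two features, and its axis labels assume
# the global `iris` object created here.
iris = load_iris()
X2 = iris.data[:, 2:4]  # petal length / petal width
tree = DecisionTreeClassifier(max_depth=3).fit(X2, iris.target)
plot_decision_regions(X2, iris.target, tree, 'Decision Tree on Iris')
plt.show()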

def plot_confusion_matrix(model, X_target, Y_target, label_info):
    Y_pred = model.predict(X_target)
    # rows = actual class, columns = predicted class
    cnf_matrix = confusion_matrix(np.argmax(Y_target, axis=1), np.argmax(Y_pred, axis=1))
    plt.imshow(cnf_matrix, interpolation='nearest', cmap=plt.cm.Blues)
    marks = np.arange(len(label_info))
    plt.xticks(marks, label_info, rotation=45)
    plt.yticks(marks, label_info)
    thresh = cnf_matrix.max()/2.
    for i in range(cnf_matrix.shape[0]):
        for j in range(cnf_matrix.shape[1]):
            plt.text(j, i, cnf_matrix[i, j], horizontalalignment='center',
                     color='white' if cnf_matrix[i, j] > thresh else 'black')
    plt.ylabel('Actual'), plt.xlabel('Predicted')
    plt.tight_layout()
    plt.show()
 
# Preprocessing Method
def unpickle(file):
    with open(file, 'rb') as fo:
        data = pickle.load(fo, encoding='bytes')
    return data
def load_cifar_10_data(data_dir, negatives=False):
    """Load the python-version CIFAR-10 batches from data_dir and return
    (N, 32, 32, 3) train/test images plus filenames, labels, and label names."""
    meta_data_dict = unpickle(data_dir+'/batches.meta')
    cifar_label_names = meta_data_dict[b'label_names']
    cifar_label_names = np.array(cifar_label_names)
    cifar_train_data = None
    cifar_train_filenames = []
    cifar_train_labels = []
    for i in range(1, 6):
        cifar_train_data_dict = unpickle(data_dir+'/data_batch_{}'.format(i))
        if i==1:
            cifar_train_data = cifar_train_data_dict[b'data']
        else:
            cifar_train_data = np.vstack((cifar_train_data, cifar_train_data_dict[b'data']))
        cifar_train_filenames += cifar_train_data_dict[b'filenames']
        cifar_train_labels += cifar_train_data_dict[b'labels']
    cifar_train_data = cifar_train_data.reshape((len(cifar_train_data), 3, 32, 32))
    if negatives:
        cifar_train_data = cifar_train_data.transpose(0, 2, 3, 1).astype(np.float32)
    else:
        cifar_train_data = np.rollaxis(cifar_train_data, 1, 4)
    cifar_train_filenames = np.array(cifar_train_filenames)
    cifar_train_labels = np.array(cifar_train_labels)
    cifar_test_data_dict = unpickle(data_dir+'/test_batch')
    cifar_test_data = cifar_test_data_dict[b'data']
    cifar_test_filenames = cifar_test_data_dict[b'filenames']
    cifar_test_labels = cifar_test_data_dict[b'labels']
    cifar_test_data = cifar_test_data.reshape((len(cifar_test_data), 3, 32, 32))
    if negatives:
        cifar_test_data = cifar_test_data.transpose(0, 2, 3, 1).astype(np.float32)
    else:
        cifar_test_data = np.rollaxis(cifar_test_data, 1, 4)
    cifar_test_filenames = np.array(cifar_test_filenames)
    cifar_test_labels = np.array(cifar_test_labels)
    return cifar_train_data, cifar_train_filenames, cifar_train_labels, cifar_test_data, cifar_test_filenames, cifar_test_labels, cifar_label_names
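
# Hedged usage sketch: the path below is an assumed location for the extracted
# "CIFAR-10 python version" archive; it is not specified in the original post.
train_data, train_filenames, train_labels, test_data, test_filenames, \
    test_labels, label_names = load_cifar_10_data('./dataset/cifar-10-batches-py')
print(train_data.shape, test_data.shape)  # (50000, 32, 32, 3) (10000, 32, 32, 3)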

# Handcrafted Function Def.
def wrapped_confusion_matrix(modelInput, feature, label, class_list=["first", "second"], normalize=False, fig_size=(5,5)):
    pred = modelInput.predict(feature)
    cnf_matrix = confusion_matrix(np.argmax(label, axis=1), np.argmax(pred, axis=1))
    np.set_printoptions(precision=2)
    if normalize:
        # normalize before drawing so the heatmap matches the annotated values
        cnf_matrix = cnf_matrix.astype('float') / cnf_matrix.sum(axis=1)[:, np.newaxis]
    plt.figure(figsize=fig_size)
    plt.imshow(cnf_matrix, interpolation='nearest', cmap=plt.cm.Blues)
    plt.title('Confusion Matrix')
    plt.xticks(np.arange(len(class_list)), class_list, rotation=45)
    plt.yticks(np.arange(len(class_list)), class_list)
    thresh = cnf_matrix.max() / 2.
    for i, j in itertools.product(range(cnf_matrix.shape[0]), range(cnf_matrix.shape[1])):
        plt.text(j, i, cnf_matrix[i, j],
                 horizontalalignment='center',
                 color='white' if cnf_matrix[i, j] > thresh else 'black')
    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.show()
def plot_history(model_input, mode='loss', val_mode=True, title='Entropy'):
    # model_input is a fitted Keras model; its .history attribute is the
    # History callback returned by fit()
    history = model_input.history
    plt.plot(history.history[mode])
    if val_mode:
        plt.plot(history.history['val_' + mode])
        plt.legend(['train', 'validation'], loc='upper right')
    else:
        plt.legend(['train'], loc='upper right')
    plt.title(title)
    plt.show()




# Analysis: LRCN(2-classification)

# data loading and preprocessing
X = np.load('./dataset/ped2_X_win10.npy')
Y = np.load('./dataset/ped2_Y_win10.npy')
## one-hot encode the labels
Y = np_utils.to_categorical(Y)
print('\nRaw set of X and Y: ', X.shape, Y.shape)
## split of data
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, train_size=0.8, random_state=123)
print('\nTrain set: ', X_train.shape, Y_train.shape, 'Test set: ', X_test.shape, Y_test.shape)
## scaling of data
X_train, X_test = X_train/255., X_test/255.
print('\nTrain set: ', X_train.shape, Y_train.shape, 'Test set: ', X_test.shape, Y_test.shape)
## sample: last frame of the first window
plt.imshow(X[0][9])
plt.show()
## frames of randomly chosen samples
## each sample is a 10-frame window of 30x40 pictures
sample_num = 4
random_window_idx = np.random.randint(0, X_train.shape[0], sample_num)
X_sample = X_train[random_window_idx]
X_sample_label = Y_train[random_window_idx]
fig, ax = plt.subplots(sample_num, 10,
                       subplot_kw={'xticks': [], 'yticks': []},
                       figsize=(15, 6))
for r in range(sample_num):
    for j in range(10):
        ax[r][j].imshow(X_sample[r][j], cmap=plt.cm.gray)
plt.show()

# modeling
X_input = Input(X_train.shape[1:])
X_hidden = Reshape((X_train.shape[1], X_train.shape[2], X_train.shape[3], 1))(X_input)
X_hidden = TimeDistributed(Convolution2D(filters=32, kernel_size=(2, 2), padding='same', activation='relu'))(X_hidden)
X_hidden = TimeDistributed(MaxPooling2D((2,2)))(X_hidden)
X_hidden = TimeDistributed(Convolution2D(filters=32, kernel_size=(2, 2), padding='same', activation='relu'))(X_hidden)
X_hidden = TimeDistributed(MaxPooling2D((2,2)))(X_hidden)
X_hidden = TimeDistributed(Convolution2D(filters=32, kernel_size=(2, 2), padding='same', activation='relu'))(X_hidden)
X_hidden = TimeDistributed(MaxPooling2D((2,2)))(X_hidden)
X_hidden = Dropout(0.5)(X_hidden)
X_hidden = Reshape((10, 3*5*32))(X_hidden)
# equivalent to: X_hidden = TimeDistributed(Flatten())(X_hidden)
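# Hedged sanity check (illustrative): on a time-distributed (10, 3, 5, 32) tensor,
# Reshape((10, 3*5*32)) and TimeDistributed(Flatten()) yield the same (None, 10, 480).
_probe = Input((10, 3, 5, 32))
print(K.int_shape(Reshape((10, 3*5*32))(_probe)))       # (None, 10, 480)
print(K.int_shape(TimeDistributed(Flatten())(_probe)))  # (None, 10, 480)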
X_hidden = LSTM(256, return_sequences=True)(X_hidden)
X_hidden = LSTM(256, return_sequences=False)(X_hidden)
X_hidden = Dropout(0.5)(X_hidden)
X_hidden = Dense(64, activation='relu')(X_hidden)
X_output = Dense(Y_train.shape[1], activation='softmax')(X_hidden)
model_lrcn = Model(X_input, X_output)
model_lrcn.summary()
## optimization step
model_lrcn.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['acc'])
es = EarlyStopping(monitor='val_loss', patience=5, verbose=0, mode='auto')
chk = ModelCheckpoint('./model/lrcn.h5', monitor='val_loss', verbose=1, save_best_only=False, mode='min')
model_lrcn_history = model_lrcn.fit(X_train, Y_train, validation_split=0.1, epochs=5, batch_size=64, verbose=2, shuffle=True, callbacks=[es, chk])

# evaluation
plot_history(model_lrcn, mode='loss', title='Categorical Crossentropy')
plot_history(model_lrcn, mode='acc', title='Accuracy')
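# Hedged extra check (not in the original post): hold-out loss/accuracy.
test_loss, test_acc = model_lrcn.evaluate(X_test, Y_test, verbose=0)
print('test loss: %.4f, test acc: %.4f' % (test_loss, test_acc))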
model_lrcn.save('./model/lrcn_eval.h5')

# result
label_info = ['Normal', 'Abnormal']
plot_confusion_matrix(model_lrcn, X_test, Y_test, label_info)
wrapped_confusion_matrix(model_lrcn, X_test, Y_test, class_list=label_info, normalize=False, fig_size=(5,5))
model_lrcn.save('./model/lrcn_result.h5')




# Analysis: Convolutional Autoencoder

# modeling
## Encoder
X_input = Input(X_train.shape[1:])
X_hidden = Reshape((X_train.shape[1], X_train.shape[2], X_train.shape[3], 1))(X_input)
X_hidden = TimeDistributed(Convolution2D(filters=16, kernel_size=(2, 2), padding='same', activation='relu'))(X_hidden)
X_hidden = TimeDistributed(MaxPooling2D((2, 2)))(X_hidden)
## Decoder
X_hidden = TimeDistributed(UpSampling2D(size=(2,2)))(X_hidden)
X_hidden = TimeDistributed(Convolution2D(filters=1, kernel_size=(2,2), padding='same', activation='relu'))(X_hidden)
X_output = Reshape((X_train.shape[1], X_train.shape[2], X_train.shape[3]))(X_hidden)
model_ae = Model(X_input, X_output)
model_ae.summary()
## optimization step
model_ae.compile(loss='mse', optimizer='adadelta', metrics=['kullback_leibler_divergence', 'accuracy'])
es = EarlyStopping(monitor='val_loss', patience=5, verbose=0, mode='auto')
chk = ModelCheckpoint('./model/ae.h5', monitor='val_loss', verbose=1, save_best_only=False, mode='min')
model_ae_history = model_ae.fit(X_train, X_train, validation_split=0.1, epochs=5, batch_size=64, verbose=2, shuffle=True, callbacks=[es, chk])

# evaluation
plot_history(model_ae, mode='loss', title='MSE')
plot_history(model_ae, mode='kullback_leibler_divergence', title='KL Divergence')
plot_history(model_ae, mode='acc', title='Accuracy')
model_ae.save('./model/ae_eval.h5')

# result
## each sample is a 10-frame window of 30x40 pictures
sample_num = 4
random_window_idx = np.random.randint(0, X_train.shape[0], sample_num)
X_sample = X_train[random_window_idx]

model_ae = load_model('./model/ae_eval.h5')
X_sample_output = model_ae.predict(X_sample)
print(X_sample.shape, X_sample_output.shape)

fig, ax = plt.subplots(sample_num, 2, subplot_kw={'xticks': [], 'yticks': []}, figsize=(6, 8))
for r in range(sample_num):
    ax[r][0].imshow(X_sample[r][0], cmap=plt.cm.gray)
    ax[r][1].imshow(X_sample_output[r][0], cmap=plt.cm.gray)
plt.show()
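
# Hedged follow-up sketch (illustrative): per-window reconstruction error is the
# usual anomaly score for an autoencoder; larger error suggests a more abnormal window.
recon = model_ae.predict(X_test)
recon_error = np.mean((X_test - recon) ** 2, axis=(1, 2, 3))
print(recon_error[:5])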




# Analysis: Pretrained LRCN(2-classification)

# modeling
## Freeze Weight
model_ae = load_model('./model/ae_eval.h5')
for layer in model_ae.layers:
    layer.trainable = False
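# Hedged quick check (illustrative) that the autoencoder layers really are frozen:
print([(layer.name, layer.trainable) for layer in model_ae.layers])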
## LRCN(2-classification)
X_hidden = model_ae.layers[3].output  # encoder output: the TimeDistributed MaxPooling2D layer
X_hidden = TimeDistributed(Convolution2D(filters=32, kernel_size=(2, 2), padding='same', activation='tanh'))(X_hidden)
X_hidden = TimeDistributed(MaxPooling2D((2, 2)))(X_hidden)
X_hidden = TimeDistributed(Convolution2D(filters=32, kernel_size=(2, 2), padding='same', activation='tanh'))(X_hidden)
X_hidden = TimeDistributed(MaxPooling2D((2, 2)))(X_hidden)
X_hidden = Dropout(0.5)(X_hidden)
X_hidden = Reshape((10, 3*5*32))(X_hidden)
X_hidden = LSTM(256, return_sequences=True)(X_hidden)
X_hidden = LSTM(256, return_sequences=False)(X_hidden)
X_hidden = Dropout(0.5)(X_hidden)
X_hidden = Dense(64, activation='tanh')(X_hidden)
X_output = Dense(Y_train.shape[1], activation='softmax')(X_hidden)
model_lrcn_fw = Model(model_ae.input, X_output)
model_lrcn_fw.summary()
## optimization step
model_lrcn_fw.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
es = EarlyStopping(monitor='val_loss', patience=5, verbose=0, mode='auto')
chk = ModelCheckpoint('./model/lrcn_fw.h5', monitor='val_loss', verbose=1, save_best_only=False, mode='min')  # distinct path so the autoencoder's ae.h5 is not overwritten
model_lrcn_fw_history = model_lrcn_fw.fit(X_train, Y_train, validation_data=(X_test, Y_test), epochs=5, batch_size=64, verbose=2, shuffle=True, callbacks=[es, chk])

# evaluation
plot_history(model_lrcn_fw, mode='loss', title='Categorical Crossentropy')
plot_history(model_lrcn_fw, mode='acc', title='Accuracy')
model_lrcn_fw.save('./model/lrcn_fw_eval.h5')

# result
label_info = ['Normal', 'Abnormal']
plot_confusion_matrix(model_lrcn_fw, X_test, Y_test, label_info)
wrapped_confusion_matrix(model_lrcn_fw, X_test, Y_test, class_list=label_info, normalize=False, fig_size=(5,5))
model_lrcn_fw.save('./model/lrcn_fw_result.h5')
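
# Hedged inference sketch (illustrative; reloads the file saved just above). Reading
# 0 = Normal / 1 = Abnormal assumes the label encoding matches the label_info order.
model_final = load_model('./model/lrcn_fw_result.h5')
probs = model_final.predict(X_test[:4])
print(np.argmax(probs, axis=1))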