Commit 4e913ea3 authored by Billy Amélie's avatar Billy Amélie
Browse files

competition score + transfer learning modification + model retraining

parent 3f0a09d4
This source diff could not be displayed because it is too large. You can view the blob instead.
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# CNN superposition + MLP"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Reference for the multi-input (images + tabular attributes) architecture: https://www.pyimagesearch.com/2019/02/04/keras-multiple-inputs-and-mixed-data/"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"import pandas as pd\n",
"import os\n",
"import logging\n",
"logging.basicConfig(format='%(levelname)s:%(message)s', level=logging.INFO)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## A - Preprocessing : Reading Data"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"# Step up to the project root so the local packages (e.g. `preprocessing`) can be imported.\n",
"# Guarded so that re-running this cell does not climb one directory further each time.\n",
"if not os.path.isdir('preprocessing'):\n",
"    os.chdir('../')"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Patient</th>\n",
" <th>Weeks</th>\n",
" <th>FVC</th>\n",
" <th>Percent</th>\n",
" <th>Age</th>\n",
" <th>Sex</th>\n",
" <th>SmokingStatus</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>ID00007637202177411956430</td>\n",
" <td>-4</td>\n",
" <td>2315</td>\n",
" <td>58.253649</td>\n",
" <td>79</td>\n",
" <td>Male</td>\n",
" <td>Ex-smoker</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>ID00007637202177411956430</td>\n",
" <td>5</td>\n",
" <td>2214</td>\n",
" <td>55.712129</td>\n",
" <td>79</td>\n",
" <td>Male</td>\n",
" <td>Ex-smoker</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>ID00007637202177411956430</td>\n",
" <td>7</td>\n",
" <td>2061</td>\n",
" <td>51.862104</td>\n",
" <td>79</td>\n",
" <td>Male</td>\n",
" <td>Ex-smoker</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>ID00007637202177411956430</td>\n",
" <td>9</td>\n",
" <td>2144</td>\n",
" <td>53.950679</td>\n",
" <td>79</td>\n",
" <td>Male</td>\n",
" <td>Ex-smoker</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>ID00007637202177411956430</td>\n",
" <td>11</td>\n",
" <td>2069</td>\n",
" <td>52.063412</td>\n",
" <td>79</td>\n",
" <td>Male</td>\n",
" <td>Ex-smoker</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Patient Weeks FVC Percent Age Sex SmokingStatus\n",
"0 ID00007637202177411956430 -4 2315 58.253649 79 Male Ex-smoker\n",
"1 ID00007637202177411956430 5 2214 55.712129 79 Male Ex-smoker\n",
"2 ID00007637202177411956430 7 2061 51.862104 79 Male Ex-smoker\n",
"3 ID00007637202177411956430 9 2144 53.950679 79 Male Ex-smoker\n",
"4 ID00007637202177411956430 11 2069 52.063412 79 Male Ex-smoker"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from preprocessing.read_load_data import read_data\n",
"\n",
"# Data folder, given relative to the current working directory.\n",
"input_directory = '../osic-pulmonary-fibrosis-progression'\n",
"\n",
"# Unpack the three objects returned by read_data and preview the first one.\n",
"train_df, test_df, sample_df = read_data(input_directory)\n",
"train_df.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## B - Preprocessing : Loading Data"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"# Unique patient identifiers of each table; the training ids are kept as a plain list.\n",
"patients_train_ids = list(train_df.Patient.unique())\n",
"patient_test_list = test_df.Patient.unique()"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"INFO:loading attributes...\n",
"INFO:loading images...\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Array shape: (176, 240, 240, 4)\n",
"min value: -0.1251496147096971\n",
"max value: 0.16921848376183674\n"
]
}
],
"source": [
"from preprocessing.read_load_data import load_images\n",
"\n",
"logging.info(\"loading attributes...\")\n",
"# Keep only the first row of every patient so the table holds exactly one row\n",
"# per id passed to load_images.\n",
"df = pd.read_csv(f'{input_directory}/train.csv')\n",
"df = df.drop_duplicates(subset='Patient', keep='first')\n",
"patients_train_ids = df.Patient.unique().tolist()\n",
"\n",
"logging.info(\"loading images...\")\n",
"images = load_images(input_directory,\n",
"                     'train',\n",
"                     patients_train_ids,\n",
"                     option='superposition',\n",
"                     outputH=240,\n",
"                     outputW=240)\n",
"\n",
"# Attribute rows and image stacks are paired by position further on, so their counts must agree.\n",
"assert len(images) == len(df), f'expected {len(df)} image stacks, got {len(images)}'\n",
"\n",
"print(\"Array shape: \", images.shape)\n",
"# Sanity check: values are expected to lie between -1 and 1.\n",
"print('min value: ', np.amin(images))\n",
"print('max value: ', np.amax(images))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## C - Preprocessing : Shuffle and train/test split"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.model_selection import train_test_split\n",
"\n",
"# Hold out 20% of the samples; attribute rows and image stacks are split together\n",
"# so that they stay aligned. The fixed random_state keeps the split repeatable.\n",
"split = train_test_split(\n",
"    df,\n",
"    images,\n",
"    test_size=0.2,\n",
"    random_state=42,\n",
")\n",
"trainAttrX, testAttrX, trainImagesX, testImagesX = split"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## D - Preprocessing : Scaling + Encoding"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"from preprocessing.scale_data import scale_variable\n",
"\n",
"# Scale the 'FVC' column of both splits; `sc` is presumably the fitted scaler (it is passed on to the evaluation helper later).\n",
"sc, trainAttrX, testAttrX = scale_variable(trainAttrX, testAttrX, 'FVC')\n",
"\n",
"# Regression targets: the 'FVC_scaled' column of each split.\n",
"trainY = trainAttrX['FVC_scaled']\n",
"testY = testAttrX['FVC_scaled']"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"from preprocessing.scale_data import encode_variable\n",
"\n",
"# Encode each categorical attribute on both splits.\n",
"for categorical_col in ('Sex', 'SmokingStatus'):\n",
"    trainAttrX, testAttrX = encode_variable(trainAttrX, testAttrX, categorical_col)\n",
"\n",
"# Columns that must not reach the model: the raw categoricals, the target\n",
"# (raw and scaled) and the patient identifier. One shared list keeps the two\n",
"# splits consistent; drop() returns new frames instead of mutating in place.\n",
"non_feature_cols = ['Sex', 'SmokingStatus', 'FVC', 'FVC_scaled', 'Patient']\n",
"trainAttrX = trainAttrX.drop(columns=non_feature_cols)\n",
"testAttrX = testAttrX.drop(columns=non_feature_cols)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## E - Processing : Create models"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"from processing.models import create_hybrid2"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"from processing.models import create_hybrid2\n",
"from keras.optimizers import Adam\n",
"\n",
"# Size both inputs from the data instead of repeating literals: one attribute\n",
"# input per column, and the per-sample shape of the image array.\n",
"model = create_hybrid2(trainAttrX.shape[1], shape=tuple(trainImagesX.shape[1:]))\n",
"\n",
"opt = Adam(lr=1e-3, decay=1e-3 / 200)\n",
"# NOTE(review): the loss is a percentage error measured against the scaled FVC target.\n",
"# The logged losses are in the hundreds to thousands of percent, which suggests targets\n",
"# close to zero - confirm the scaler used and whether this loss is appropriate.\n",
"model.compile(loss=\"mean_absolute_percentage_error\", optimizer=opt)"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Model: \"model_1\"\n",
"__________________________________________________________________________________________________\n",
"Layer (type) Output Shape Param # Connected to \n",
"==================================================================================================\n",
"input_1 (InputLayer) [(None, 240, 240, 4) 0 \n",
"__________________________________________________________________________________________________\n",
"conv2d (Conv2D) (None, 240, 240, 32) 1184 input_1[0][0] \n",
"__________________________________________________________________________________________________\n",
"activation (Activation) (None, 240, 240, 32) 0 conv2d[0][0] \n",
"__________________________________________________________________________________________________\n",
"batch_normalization (BatchNorma (None, 240, 240, 32) 128 activation[0][0] \n",
"__________________________________________________________________________________________________\n",
"max_pooling2d (MaxPooling2D) (None, 120, 120, 32) 0 batch_normalization[0][0] \n",
"__________________________________________________________________________________________________\n",
"conv2d_1 (Conv2D) (None, 120, 120, 64) 18496 max_pooling2d[0][0] \n",
"__________________________________________________________________________________________________\n",
"activation_1 (Activation) (None, 120, 120, 64) 0 conv2d_1[0][0] \n",
"__________________________________________________________________________________________________\n",
"batch_normalization_1 (BatchNor (None, 120, 120, 64) 256 activation_1[0][0] \n",
"__________________________________________________________________________________________________\n",
"max_pooling2d_1 (MaxPooling2D) (None, 60, 60, 64) 0 batch_normalization_1[0][0] \n",
"__________________________________________________________________________________________________\n",
"conv2d_2 (Conv2D) (None, 60, 60, 128) 73856 max_pooling2d_1[0][0] \n",
"__________________________________________________________________________________________________\n",
"activation_2 (Activation) (None, 60, 60, 128) 0 conv2d_2[0][0] \n",
"__________________________________________________________________________________________________\n",
"batch_normalization_2 (BatchNor (None, 60, 60, 128) 512 activation_2[0][0] \n",
"__________________________________________________________________________________________________\n",
"max_pooling2d_2 (MaxPooling2D) (None, 30, 30, 128) 0 batch_normalization_2[0][0] \n",
"__________________________________________________________________________________________________\n",
"flatten (Flatten) (None, 115200) 0 max_pooling2d_2[0][0] \n",
"__________________________________________________________________________________________________\n",
"dense_3 (Dense) (None, 16) 1843216 flatten[0][0] \n",
"__________________________________________________________________________________________________\n",
"gaussian_noise_input (InputLaye [(None, 5)] 0 \n",
"__________________________________________________________________________________________________\n",
"activation_3 (Activation) (None, 16) 0 dense_3[0][0] \n",
"__________________________________________________________________________________________________\n",
"gaussian_noise (GaussianNoise) (None, 5) 0 gaussian_noise_input[0][0] \n",
"__________________________________________________________________________________________________\n",
"batch_normalization_3 (BatchNor (None, 16) 64 activation_3[0][0] \n",
"__________________________________________________________________________________________________\n",
"dense (Dense) (None, 8) 48 gaussian_noise[0][0] \n",
"__________________________________________________________________________________________________\n",
"dropout (Dropout) (None, 16) 0 batch_normalization_3[0][0] \n",
"__________________________________________________________________________________________________\n",
"dense_1 (Dense) (None, 4) 36 dense[0][0] \n",
"__________________________________________________________________________________________________\n",
"dense_4 (Dense) (None, 4) 68 dropout[0][0] \n",
"__________________________________________________________________________________________________\n",
"dense_2 (Dense) (None, 1) 5 dense_1[0][0] \n",
"__________________________________________________________________________________________________\n",
"activation_4 (Activation) (None, 4) 0 dense_4[0][0] \n",
"__________________________________________________________________________________________________\n",
"concatenate (Concatenate) (None, 5) 0 dense_2[0][0] \n",
" activation_4[0][0] \n",
"__________________________________________________________________________________________________\n",
"dense_5 (Dense) (None, 4) 24 concatenate[0][0] \n",
"__________________________________________________________________________________________________\n",
"dense_6 (Dense) (None, 1) 5 dense_5[0][0] \n",
"==================================================================================================\n",
"Total params: 1,937,898\n",
"Trainable params: 1,937,418\n",
"Non-trainable params: 480\n",
"__________________________________________________________________________________________________\n"
]
}
],
"source": [
"model.summary()"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Epoch 1/10\n",
"18/18 [==============================] - 9s 521ms/step - loss: 6067.2251 - val_loss: 2709.1370\n",
"Epoch 2/10\n",
"18/18 [==============================] - 9s 519ms/step - loss: 3588.0178 - val_loss: 1215.3599\n",
"Epoch 3/10\n",
"18/18 [==============================] - 9s 523ms/step - loss: 1767.9198 - val_loss: 575.0400\n",
"Epoch 4/10\n",
"18/18 [==============================] - 9s 519ms/step - loss: 921.6010 - val_loss: 212.0080\n",
"Epoch 5/10\n",
"18/18 [==============================] - 9s 511ms/step - loss: 495.6474 - val_loss: 94.8844\n",
"Epoch 6/10\n",
"18/18 [==============================] - 9s 525ms/step - loss: 394.5772 - val_loss: 164.4984\n",
"Epoch 7/10\n",
"18/18 [==============================] - 9s 512ms/step - loss: 245.1468 - val_loss: 201.1050\n",
"Epoch 8/10\n",
"18/18 [==============================] - 9s 514ms/step - loss: 264.4300 - val_loss: 231.4748\n",
"Epoch 9/10\n",
"18/18 [==============================] - 10s 529ms/step - loss: 232.9314 - val_loss: 210.3131\n",
"Epoch 10/10\n",
"18/18 [==============================] - 9s 514ms/step - loss: 189.8296 - val_loss: 199.3550\n",
"CPU times: user 9min 39s, sys: 24.3 s, total: 10min 4s\n",
"Wall time: 1min 39s\n"
]
}
],
"source": [
"%%time\n",
"# Train on the [attributes, images] input pair; the held-out split is evaluated after every epoch.\n",
"hist = model.fit(\n",
"    x=[trainAttrX, trainImagesX],\n",
"    y=trainY,\n",
"    validation_data=([testAttrX, testImagesX], testY),\n",
"    epochs=10,\n",
"    batch_size=8,\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"data": {
"image/png": "\n",
"text/plain": [
"<Figure size 432x288 with 1 Axes>"
]
},
"metadata": {
"needs_background": "light"
},
"output_type": "display_data"
}
],
"source": [
"from postprocessing.plot_history import plot_history\n",
"\n",
"plot_history(hist)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## F - Evaluation"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Training set"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
"from postprocessing.evaluate import evaluate_hybrid, compute_score"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"INFO:predicting ...\n",
"INFO:NumExpr defaulting to 8 threads.\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"avg. FVC: 2771.744318181818, std FVC 835.5745106360505\n",
"mean difference : 36.84%, std: 45.71%\n",
"competition score : -4.617284040908737\n"
]
}
],
"source": [
"preds = evaluate_hybrid(model, df, trainAttrX, trainImagesX, trainY, sc)\n",
"conf, score = compute_score(trainY,preds.flatten())\n",
"print('competition score :', score)"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"5/5 [==============================] - 1s 273ms/step - loss: 240.9136\n"
]
},
{
"data": {
"text/plain": [
"240.91358947753906"
]
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"model.evaluate([trainAttrX, trainImagesX], trainY)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Test set"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"INFO:predicting ...\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"avg. FVC: 2771.744318181818, std FVC 835.5745106360505\n",
"mean difference : 46.17%, std: 39.98%\n",
"competition score : -4.619806995461278\n"
]
}
],
"source": [
"preds = evaluate_hybrid(model, df, testAttrX, testImagesX, testY, sc)\n",
"conf, score = compute_score(testY,preds.flatten())\n",
"print('competition score :', score)"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"2/2 [==============================] - 0s 31ms/step - loss: 199.3550\n"
]
},
{
"data": {
"text/plain": [
"199.35498046875"
]
},
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"model.evaluate([testAttrX, testImagesX], testY)"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {},
"outputs": [],
"source": [
"_a=model.predict([trainAttrX, trainImagesX])"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {},
"outputs": [],
"source": [
"q=0.5\n",
"a = np.quantile(_a, q)"
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {},
"outputs": [
{
"ename": "IndexError",
"evalue": "index 1 is out of bounds for axis 1 with size 1",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mIndexError\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m<ipython-input-31-8ed85e029818>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0m_a\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
"\u001b[0;31mIndexError\u001b[0m: index 1 is out of bounds for axis 1 with size 1"
]