Commit f5cf4314 authored by Billy Amélie's avatar Billy Amélie
Browse files

Merge branch 'AmelieBranch' into 'master'

competition score + transfer learning modification + model retraining

See merge request !21
parents f631da4a 4e913ea3
This source diff could not be displayed because it is too large. You can view the blob instead.
%% Cell type:markdown id: tags:
# CNN superposition + MLP
%% Cell type:markdown id: tags:
https://www.pyimagesearch.com/2019/02/04/keras-multiple-inputs-and-mixed-data/
%% Cell type:code id: tags:
``` python
import numpy as np
import pandas as pd
import os
import logging
logging.basicConfig(format='%(levelname)s:%(message)s', level=logging.INFO)
```
%% Cell type:markdown id: tags:
## A - Preprocessing : Reading Data
%% Cell type:code id: tags:
``` python
# Step up one directory before the project imports that follow
# (preprocessing, processing, postprocessing); presumably the notebook lives
# one level below the project root — TODO confirm.
os.chdir('../')
```
%% Cell type:code id: tags:
``` python
from preprocessing.read_load_data import read_data
# Relative path to the competition data; resolved against the current working directory.
input_directory='../osic-pulmonary-fibrosis-progression'
# read_data is a project helper; its result is unpacked into three objects
# (presumably the train, test and sample-submission tables — TODO confirm).
train_df, test_df, sample_df = read_data(input_directory)
# Preview the first rows of the training table.
train_df.head()
```
%%%% Output: execute_result
Patient Weeks FVC Percent Age Sex SmokingStatus
0 ID00007637202177411956430 -4 2315 58.253649 79 Male Ex-smoker
1 ID00007637202177411956430 5 2214 55.712129 79 Male Ex-smoker
2 ID00007637202177411956430 7 2061 51.862104 79 Male Ex-smoker
3 ID00007637202177411956430 9 2144 53.950679 79 Male Ex-smoker
4 ID00007637202177411956430 11 2069 52.063412 79 Male Ex-smoker
%% Cell type:markdown id: tags:
## B - Preprocessing : Loading Data
%% Cell type:code id: tags:
``` python
# Unique patient identifiers: a plain list for the training table, the raw
# result of unique() for the test table.
patients_train_ids = list(train_df.Patient.unique())
patient_test_list = test_df.Patient.unique()
```
%% Cell type:code id: tags:
``` python
from preprocessing.read_load_data import load_images
logging.info("loading attributes...")
# Re-read the raw training table and keep one row per patient: the first one
# in file order (keep='first').
df = pd.read_csv(f'{input_directory}/train.csv')
df = df.drop_duplicates(subset = 'Patient', keep='first')
patients_train_ids= df.Patient.unique().tolist()
# The ids were just taken from df itself, so this filter keeps every row.
df = df[df['Patient'].isin(patients_train_ids)]
logging.info("loading images...")
# load_images is a project helper; presumably it returns one 240x240 image
# stack per patient id, in the order of patients_train_ids — TODO confirm.
images = load_images(input_directory,
'train',
patients_train_ids,
option='superposition',
outputH = 240,
outputW = 240)
print("Array shape: ", images.shape)
# Sanity check: pixel values are expected to lie in [-1, 1].
print('min value: ', np.amin(images))
print('max value: ', np.amax(images))
```
%% Cell type:markdown id: tags:
## C - Preprocessing : shuffle
%% Cell type:code id: tags:
``` python
from sklearn.model_selection import train_test_split
# Split the attribute table and the image array in a single call (20% held
# out, fixed seed) so that both receive the same shuffle and stay aligned.
split = train_test_split(df, images, test_size=0.2, random_state=42)
(trainAttrX, testAttrX, trainImagesX, testImagesX) = split
```
%% Cell type:markdown id: tags:
## D - Preprocessing : Scaling + Encoding
%% Cell type:code id: tags:
``` python
from preprocessing.scale_data import scale_variable
# scale_variable is a project helper; it hands back `sc` (presumably the
# fitted scaler — TODO confirm) and both frames, from which an 'FVC_scaled'
# column is read right after.
sc, trainAttrX, testAttrX = scale_variable(trainAttrX, testAttrX,'FVC')
# The regression target is the scaled FVC column.
trainY = trainAttrX.loc[:,'FVC_scaled']
testY = testAttrX.loc[:,'FVC_scaled']
```
%% Cell type:code id: tags:
``` python
from preprocessing.scale_data import encode_variable
# Encode the two categorical columns with the project helper.
trainAttrX, testAttrX = encode_variable(trainAttrX, testAttrX,'Sex')
trainAttrX, testAttrX = encode_variable(trainAttrX, testAttrX,'SmokingStatus')
# Remove the raw categorical columns, the target columns and the patient id
# from both frames (in place) so they are not passed to the model as features.
trainAttrX.drop(columns = ['Sex','SmokingStatus','FVC','FVC_scaled','Patient'], inplace = True)
testAttrX.drop(columns = ['Sex','SmokingStatus','FVC','FVC_scaled','Patient'], inplace = True)
```
%% Cell type:markdown id: tags:
## E - Processing : Create models
%% Cell type:code id: tags:
``` python
from processing.models import create_hybrid2
```
%% Cell type:code id: tags:
``` python
from processing.models import create_hybrid2
from keras.optimizers import Adam
# Build the hybrid model from the number of attribute columns and a
# 240x240 image shape with 4 channels.
model = create_hybrid2(trainAttrX.shape[1], shape = (240,240,4))
# NOTE(review): `lr` and `decay` are legacy Adam argument names; newer Keras
# releases expect `learning_rate` — confirm against the installed version.
opt = Adam(lr=1e-3, decay=1e-3 / 200)
# Optimise the mean absolute percentage error.
model.compile(loss="mean_absolute_percentage_error", optimizer=opt)
```
%% Cell type:code id: tags:
``` python
model.summary()
```
%% Cell type:code id: tags:
``` python
%%time
# Fit on [attributes, images] input pairs; the held-out split is used as
# validation data during training.
hist = model.fit(
x=[trainAttrX, trainImagesX], y=trainY,
validation_data=([testAttrX, testImagesX], testY),
epochs=10, batch_size=8)
```
%% Cell type:code id: tags:
``` python
from postprocessing.plot_history import plot_history
plot_history(hist)
```
%%%% Output: display_data
![]()
%% Cell type:markdown id: tags:
# F - Evaluation
%% Cell type:markdown id: tags:
### Training set
%% Cell type:code id: tags:
``` python
from postprocessing.evaluate import evaluate_hybrid, compute_score
```
%% Cell type:code id: tags:
``` python
# Training split: evaluate_hybrid and compute_score are project helpers; the
# predictions are flattened and scored against the scaled targets.
preds = evaluate_hybrid(model, df, trainAttrX, trainImagesX, trainY, sc)
conf, score = compute_score(trainY,preds.flatten())
print('competition score :', score)
```
%% Cell type:code id: tags:
``` python
model.evaluate([trainAttrX, trainImagesX], trainY)
```
%%%% Output: execute_result
240.91358947753906
%% Cell type:markdown id: tags:
### Test set
%% Cell type:code id: tags:
``` python
# Held-out split: evaluate_hybrid and compute_score are project helpers; the
# predictions are flattened and scored against the scaled targets.
preds = evaluate_hybrid(model, df, testAttrX, testImagesX, testY, sc)
conf, score = compute_score(testY,preds.flatten())
print('competition score :', score)
```
%% Cell type:code id: tags:
``` python
model.evaluate([testAttrX, testImagesX], testY)
```
%%%% Output: execute_result
199.35498046875
%% Cell type:code id: tags:
``` python
_a=model.predict([trainAttrX, trainImagesX])
```
%% Cell type:code id: tags:
``` python
q=0.5
a = np.quantile(_a, q)
```
%% Cell type:code id: tags:
``` python
# NOTE(review): this raises IndexError — axis 1 of the prediction array has
# size 1 (see the recorded cell output), so 0 is the only valid column index.
_a[:,1]
```
%%%% Output: error
---------------------------------------------------------------------------
IndexError Traceback (most recent call last)
<ipython-input-31-8ed85e029818> in <module>
----> 1 _a[:,1]
IndexError: index 1 is out of bounds for axis 1 with size 1
%% Cell type:code id: tags:
``` python
```
%% Cell type:markdown id: tags:
# G - Sample submission file
%% Cell type:code id: tags:
``` python
from tensorflow.keras.models import Sequential
from tensorflow.keras.models import Model
from tensorflow.keras.layers import BatchNormalization
from tensorflow.keras.layers import Conv2D
from tensorflow.keras.layers import MaxPooling2D
from tensorflow.keras.layers import Activation
from tensorflow.keras.layers import Dropout
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Flatten
from tensorflow.keras.layers import Input
from tensorflow.keras.layers import concatenate
from tensorflow.keras.layers import GaussianNoise
```
%% Cell type:code id: tags:
``` python
def create_mlp2(dim, regress=True):
    """Build a small MLP over the tabular attributes.

    The network applies Gaussian noise (0.2) to the ``dim`` input features,
    followed by two ReLU dense layers of 8 and 4 units.

    Args:
        dim: Number of input features.
        regress: When true (the default), append a single linear unit so the
            model outputs one regression value. When false, the model ends at
            the 4-unit layer, e.g. to be used as a branch of a larger network.

    Returns:
        The uncompiled ``Sequential`` model.
    """
    model = Sequential()
    model.add(GaussianNoise(0.2, input_dim=dim))
    model.add(Dense(8, activation="relu"))
    model.add(Dense(4, activation="relu"))
    # The regression head used to be added unconditionally, which made the
    # `regress` flag a no-op; it is now honoured.
    if regress:
        model.add(Dense(1, activation="linear"))
    return model
```
%% Cell type:code id: tags:
``` python
```
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
......@@ -64,21 +64,21 @@
``` python
from preprocessing.read_load_data import load_images
logging.info("loading attributes...")
df = pd.read_csv(f'{input_directory}/train.csv')
df = df.drop_duplicates(subset = 'Patient', keep='first')
df = df.sort_values("Weeks").drop_duplicates(subset = 'Patient', keep='first')
patients_train_ids= df.Patient.unique().tolist()
df = df[df['Patient'].isin(patients_train_ids)]
logging.info("loading images...")
images = load_images(input_directory,
'train',
patients_train_ids,
option='superposition',
outputH = 240,
outputW = 240)
outputH = 240,
outputW = 240)
print("Array shape: ", images.shape)
#check value between -1,1
print('min value: ', np.amin(images))
print('max value: ', np.amax(images))
......@@ -116,93 +116,674 @@
## E - Processing : Create models
%% Cell type:code id: tags:
``` python
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping
# Early-stopping criterion, monitored on the validation loss.
pat = 10 # number of epochs with no improvement after which training stops
es = EarlyStopping(monitor='val_loss', patience=pat, verbose=1)
# Checkpoint callback: saves the model to a file during training, keeping
# only the best one (save_best_only=True).
cp = ModelCheckpoint('clean_notebooks/cnn_transfer_learning_weights.h5', verbose=1, save_best_only=True)
```
%% Cell type:code id: tags:
``` python
def custom_shuffle_split(trainAttrX, train_dataset, trainY, test_size=0.1):
    """Randomly split aligned inputs and targets into train and validation parts.

    Args:
        trainAttrX: Unused; kept so existing callers keep working.
        train_dataset: Inputs, indexable along the first axis (e.g. a NumPy
            array of images). pandas objects are split row-wise.
        trainY: Targets with the same length as ``train_dataset``; a NumPy
            array or a pandas Series/DataFrame.
        test_size: Fraction of the samples (rounded down) that goes to the
            validation part.

    Returns:
        ``(train_x, val_x, train_y, val_y)``.
    """
    n_samples = len(trainY)
    cut = int(n_samples * test_size)
    order = np.arange(n_samples)
    np.random.shuffle(order)  # draws from NumPy's global RNG
    val_idx, train_idx = order[:cut], order[cut:]

    def _take(data, positions):
        # On pandas objects, `[]` with a list of ints is label-based (and
        # column-based on a DataFrame). Going through `.iloc` makes the
        # selection positional, so targets whose index was not reset are
        # still split correctly. NumPy arrays have no `.iloc` and are
        # indexed directly.
        return getattr(data, "iloc", data)[positions]

    return (
        _take(train_dataset, train_idx),
        _take(train_dataset, val_idx),
        _take(trainY, train_idx),
        _take(trainY, val_idx),
    )
```
%% Cell type:code id: tags:
``` python
# Take a copy of the scaled FVC column as the target and renumber it from 0.
trainY = trainAttrX.loc[:,'FVC_scaled'].copy()
trainY = trainY.reset_index( drop = True)
# In-place index reset of the attribute frame; since drop=True is not passed,
# the previous index is kept as a new column.
trainAttrX.reset_index(inplace=True)
```
%% Cell type:code id: tags:
``` python
from processing.models import create_transfer_learning
from keras.optimizers import Adam
from tensorflow.keras.models import Model
import efficientnet.tfkeras as efn
# EfficientNetB1 with ImageNet weights and without its top layers.
new_model =efn.EfficientNetB1(weights='imagenet',include_top=False)
input_channel = 4
# Copy the architecture through its config, changing the first layer so it
# accepts 240x240 inputs with `input_channel` channels.
config = new_model.get_config()
config["layers"][0]["config"]["batch_input_shape"] = (None, 240, 240, input_channel)
# Name of the second layer in the config; handed to create_transfer_learning
# (presumably the layer whose weights must be adapted to the new channel
# count — TODO confirm).
modify_name = config["layers"][1]["config"]["name"]
# Build a new model from the edited config.
custom_model = Model.from_config(config)
model = create_transfer_learning(new_model,custom_model,modify_name, input_channel)
# NOTE(review): `lr` and `decay` are legacy Adam argument names; newer Keras
# releases expect `learning_rate` — confirm against the installed version.
opt = Adam(lr=1e-3, decay=1e-3 / 200)
model.compile(loss="mean_absolute_percentage_error", optimizer=opt)
```
%% Cell type:code id: tags:
``` python
%%time
hist = model.fit(x=trainImagesX, y=trainY,
validation_data=(testImagesX, testY),
epochs=20, batch_size=8)
from time import time
from processing.models import fit_and_evaluate
t0 = time()  # start time, used for the elapsed-time report after training
# Training settings shared by every fold.
n_folds = 3
epochs = 30
batch_size = 8
# Collect the history returned for each fold so the curves can be plotted later.
model_history = []
for i in range(n_folds):
print("Training on Fold: ",i+1)
new_model = efn.EfficientNetB1(weights='imagenet',include_top=False)
input_channel = 4
config = new_model.get_config()
config["layers"][0]["config"]["batch_input_shape"] = (None, 240, 240, input_channel)
modify_name = config["layers"][1]["config"]["name"]
custom_model = Model.from_config(config)
model = create_transfer_learning(new_model,custom_model,modify_name, input_channel,weights=True)
opt = Adam(lr=1e-3, decay=1e-3 / 200)
model.compile(loss="mean_squared_error", optimizer=opt)
t_x, val_x, t_y, val_y = custom_shuffle_split(trainAttrX ,trainImagesX,trainY,test_size = 0.1)
model_history.append(fit_and_evaluate(t_x, val_x, t_y, val_y, epochs, batch_size,model,es,cp))
print("======="*12, end="\n\n\n")
print("Computation time : ", round((time() - t0)/60,3), "min")
```
%% Cell type:code id: tags:
``` python
from postprocessing.plot_history import plot_history
import matplotlib.pyplot as plt
plot_history(hist)
plt.title('Loss vs Epochs')
plt.plot(model_history[0].history['loss'], label='Training Fold 1')
plt.plot(model_history[1].history['loss'], label='Training Fold 2')
plt.plot(model_history[2].history['loss'], label='Training Fold 3')
plt.legend()
plt.show()
```
%%%% Output: display_data
![]()
![]()
%% Cell type:markdown id: tags:
# F - Evaluation
# F - Postprocessing : Evaluation
%% Cell type:code id: tags:
``` python
from keras.models import load_model
# NOTE(review): EfficientNetB1 is not referenced in this cell; presumably the
# import is needed so the efficientnet layers are known to load_model — confirm.
from efficientnet.tfkeras import EfficientNetB1
model = None
# Reload the file written by the ModelCheckpoint callback.
model = load_model('clean_notebooks/cnn_transfer_learning_weights.h5')
```
%% Cell type:code id: tags:
``` python
from postprocessing.evaluate import evaluate_cnn, compute_score
```
%% Cell type:code id: tags:
``` python
preds = evaluate_cnn(model, df, trainImagesX, trainY, sc)
conf, score = compute_score(trainY,preds.flatten())
print('competition score :', score)
def evaluate_cnn(model, df, trainImagesX, trainY, sc):
    """Predict on a set of images and report the absolute percentage error.

    Prints the mean and standard deviation of ``df["FVC"]`` and of the
    absolute percentage difference between the predictions and ``trainY``.

    Args:
        model: Object exposing ``predict``.
        df: Table with an ``"FVC"`` column, used only for the printed summary.
        trainImagesX: Input passed to ``model.predict``.
        trainY: Ground-truth targets, aligned with the predictions.
        sc: Not used by this function.

    Returns:
        ``(preds, diff)``: the raw predictions and ``preds.flatten() - trainY``.
    """
    logging.info("predicting ...")
    preds = model.predict(trainImagesX)
    diff = preds.flatten() - trainY

    # Error relative to the target, as an absolute percentage.
    abs_pct_error = np.abs((diff / (trainY)) * 100)
    mean_error, std_error = np.mean(abs_pct_error), np.std(abs_pct_error)

    fvc = df["FVC"]
    print("avg. FVC: {}, std FVC {}".format(fvc.mean(), fvc.std()))
    print("mean difference : {:.2f}%, std: {:.2f}%".format(mean_error, std_error))
    return preds, diff
```
%% Cell type:code id: tags:
``` python
preds_train,diff = evaluate_cnn(model, df, trainImagesX, trainY, sc)
# Root-mean-square of the prediction errors (same units as trainY, i.e. scaled FVC).
np.sqrt(np.mean(diff*diff))
```
%%%% Output: execute_result
1.0000517762438685
%% Cell type:code id: tags:
``` python
model.evaluate(trainImagesX, trainY)
```
%%%% Output: execute_result
121.15312194824219
1.0001035928726196
%% Cell type:markdown id: tags:
### Test set
%% Cell type:code id: tags:
``` python
preds = evaluate_cnn(model, df, testImagesX, testY, sc)
conf, score = compute_score(testY,preds.flatten())
print('competition score :', score)
preds,diff = evaluate_cnn(model, df, testImagesX, testY, sc)
```
%% Cell type:code id: tags:
``` python
model.evaluate(testImagesX, testY)
```
%%%% Output: execute_result
94.78245544433594
0.8139927387237549
%% Cell type:markdown id: tags:
# G - Postprocessing : Competition score
%% Cell type:markdown id: tags:
## Dropout 0.5
%% Cell type:markdown id: tags:
### Train set
%% Cell type:code id: tags: