Commit 8b826bad authored by Lafnoune Imane's avatar Lafnoune Imane
Browse files

CNN_inj_transfer erreurs

parent f0621fb5
%% Cell type:markdown id: tags:
# CNN with transfer learning with weights (efficientnet) and data injection
This notebook contains the configurations required to train an efficientnet model for K-folds.
It is possible to hit -0.6910 LB by tweaking parameters in this notebook!
https://www.kaggle.com/khoongweihao/k-fold-tf-efficientnet-models-training
%% Cell type:code id: tags:
``` python
import numpy as np
import pandas as pd
import os
import logging
logging.basicConfig(format='%(levelname)s:%(message)s', level=logging.INFO)
```
%% Cell type:markdown id: tags:
## A - Preprocessing : Reading Data
%% Cell type:code id: tags:
``` python
os.chdir('../')
```
%% Cell type:code id: tags:
``` python
from preprocessing.read_load_data import read_data
input_directory='../osic-pulmonary-fibrosis-progression'
train_df, test_df, sample_df = read_data(input_directory)
train_df.head()
```
%% Output
Patient Weeks FVC Percent Age Sex SmokingStatus
0 ID00007637202177411956430 -4 2315 58.253649 79 Male Ex-smoker
1 ID00007637202177411956430 5 2214 55.712129 79 Male Ex-smoker
2 ID00007637202177411956430 7 2061 51.862104 79 Male Ex-smoker
3 ID00007637202177411956430 9 2144 53.950679 79 Male Ex-smoker
4 ID00007637202177411956430 11 2069 52.063412 79 Male Ex-smoker
%% Cell type:markdown id: tags:
## B - Preprocessing : Loading Data
%% Cell type:code id: tags:
``` python
# Collect unique patient identifiers for the test split.
patient_test_list = test_df.Patient.unique()
# Materialise the unique train IDs as a plain Python list
# (equivalent to the original element-by-element comprehension).
patients_train_ids = list(train_df.Patient.unique())
```
%% Cell type:code id: tags:
``` python
from preprocessing.read_load_data import load_images
logging.info("loading attributes...")
# Keep one row per patient: only the first recorded measurement is retained.
df = pd.read_csv(f'{input_directory}/train.csv')
df = df.drop_duplicates(subset = 'Patient', keep='first')
patients_train_ids= df.Patient.unique().tolist()
# NOTE(review): this filter is a no-op — `patients_train_ids` was just derived
# from `df` itself, so every row already matches.
df = df[df['Patient'].isin(patients_train_ids)]
logging.info("loading images...")
# Load one stacked 240x240 image per patient ('superposition' mode — see
# preprocessing.read_load_data for its exact semantics).
images = load_images(input_directory,
'train',
patients_train_ids,
option='superposition',
outputH = 240,
outputW = 240)
print("Array shape: ", images.shape)
# Sanity check: pixel values are expected to stay within [-1, 1].
print('min value: ', np.amin(images))
print('max value: ', np.amax(images))
```
%% Output
INFO:loading attributes...
INFO:loading images...
Array shape: (176, 240, 240, 4)
min value: -0.1251496147096971
max value: 0.1692184837618322
%% Cell type:markdown id: tags:
## C - Preprocessing : shuffle
%% Cell type:code id: tags:
``` python
from sklearn.model_selection import train_test_split

# Shuffle and split attributes and images together (80/20), with a fixed
# seed so the split is reproducible across runs.
trainAttrX, testAttrX, trainImagesX, testImagesX = train_test_split(
    df, images, test_size=0.2, random_state=42)
```
%% Cell type:markdown id: tags:
## D - Preprocessing : Scaling + Encoding
%% Cell type:code id: tags:
``` python
from preprocessing.scale_data import scale_variable
# Scale the 'FVC' column (adds an 'FVC_scaled' column to both splits —
# presumably fitted on the train split; see preprocessing.scale_data).
# `sc` is kept so predictions can be un-scaled at evaluation time.
sc, trainAttrX, testAttrX = scale_variable(trainAttrX, testAttrX,'FVC')
# The regression targets are the scaled FVC values.
trainY = trainAttrX.loc[:,'FVC_scaled']
testY = testAttrX.loc[:,'FVC_scaled']
```
%% Cell type:code id: tags:
``` python
from preprocessing.scale_data import encode_variable
# Encode the categorical variables (adds encoded columns — see
# preprocessing.scale_data for the encoding used).
trainAttrX, testAttrX = encode_variable(trainAttrX, testAttrX,'Sex')
trainAttrX, testAttrX = encode_variable(trainAttrX, testAttrX,'SmokingStatus')
# Drop the raw categorical columns, the target ('FVC'/'FVC_scaled') and the
# identifier, leaving only the model's tabular input features.
trainAttrX.drop(columns = ['Sex','SmokingStatus','FVC','FVC_scaled','Patient'], inplace = True)
testAttrX.drop(columns = ['Sex','SmokingStatus','FVC','FVC_scaled','Patient'], inplace = True)
```
%% Cell type:markdown id: tags:
## E - Processing : Create models
%% Cell type:code id: tags:
``` python
from processing.models import create_hybrid_transfer
# Use tf.keras consistently: the original imported Adam from standalone
# `keras.optimizers` while using `tensorflow.keras` models — mixing the two
# backends is fragile and can break optimizer/model compatibility.
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model
import efficientnet.tfkeras as efn

# EfficientNetB1 backbone pre-trained on ImageNet (3-channel RGB input).
new_model = efn.EfficientNetB1(weights='imagenet', include_top=False)
input_channel = 4
# Rebuild the network from its config with the input layer patched to accept
# 4-channel 240x240 images (the stacked CT representation).
config = new_model.get_config()
config["layers"][0]["config"]["batch_input_shape"] = (None, 240, 240, input_channel)
# Name of the first layer after the input, whose weights must be adapted
# to the extra channel inside create_hybrid_transfer.
modify_name = config["layers"][1]["config"]["name"]
custom_model = Model.from_config(config)
model = create_hybrid_transfer(trainAttrX.shape[1], new_model, custom_model, modify_name, input_channel)
# `learning_rate` replaces the deprecated `lr` keyword.
opt = Adam(learning_rate=1e-3, decay=1e-3 / 200)
model.compile(loss="mean_absolute_percentage_error", optimizer=opt)
```
%% Cell type:code id: tags:
``` python
%%time
# Train the hybrid model on [tabular attributes, stacked CT images].
# NOTE(review): val_loss rises after epoch 1 (see output below) — consider
# early stopping, a lower learning rate, or freezing more backbone layers.
hist = model.fit(
x=[trainAttrX, trainImagesX], y=trainY,
validation_data=([testAttrX, testImagesX], testY),
epochs=10, batch_size=8)
```
%% Output
Epoch 1/10
18/18 [==============================] - 15s 573ms/step - loss: 124.2311 - val_loss: 96.4176
Epoch 2/10
18/18 [==============================] - 9s 477ms/step - loss: 105.6208 - val_loss: 99.0469
Epoch 3/10
18/18 [==============================] - 8s 476ms/step - loss: 97.7730 - val_loss: 100.0893
Epoch 4/10
18/18 [==============================] - 9s 502ms/step - loss: 99.6281 - val_loss: 100.3717
Epoch 5/10
18/18 [==============================] - 9s 492ms/step - loss: 100.0902 - val_loss: 100.8106
Epoch 6/10
18/18 [==============================] - 9s 517ms/step - loss: 99.1478 - val_loss: 101.0832
Epoch 7/10
18/18 [==============================] - 8s 475ms/step - loss: 98.8431 - val_loss: 101.3206
Epoch 8/10
18/18 [==============================] - 9s 478ms/step - loss: 96.1802 - val_loss: 101.6055
Epoch 9/10
18/18 [==============================] - 8s 473ms/step - loss: 97.2021 - val_loss: 101.5631
Epoch 10/10
18/18 [==============================] - 8s 470ms/step - loss: 96.3316 - val_loss: 101.3311
CPU times: user 7min 59s, sys: 21.3 s, total: 8min 21s
Wall time: 1min 33s
%% Cell type:code id: tags:
``` python
from postprocessing.plot_history import plot_history
plot_history(hist)
```
%% Output
%% Cell type:markdown id: tags:
# F - Evaluation
%% Cell type:code id: tags:
``` python
from postprocessing.evaluate import evaluate_hybrid, compute_score
```
%% Cell type:code id: tags:
``` python
# Predict on the TRAIN split, un-scale the FVC predictions with `sc`, and
# compute the competition score (see postprocessing.evaluate for the metric).
preds = evaluate_hybrid(model, df, trainAttrX, trainImagesX, trainY, sc)
conf, score = compute_score(trainY,preds.flatten())
print('competition score :', score)
```
%% Output
INFO:predicting ...
avg. FVC: 2771.744318181818, std FVC 835.5745106360505
mean difference : 26.62%, std: 28.28%
competition score : -4.6106589656078985
%% Cell type:code id: tags:
``` python
model.evaluate([trainAttrX, trainImagesX], trainY)
```
%% Output
5/5 [==============================] - 7s 1s/step - loss: 109.3192
109.31915283203125
%% Cell type:markdown id: tags:
### Test set
%% Cell type:code id: tags:
``` python
preds = evaluate_hybrid(model, df, testAttrX, testImagesX, testY, sc)
conf, score = compute_score(testY,preds.flatten())
print('competition score :', score)
```
%% Output
INFO:predicting ...
avg. FVC: 2771.744318181818, std FVC 835.5745106360505
mean difference : 32.22%, std: 26.99%
competition score : -4.6118354434598965
%% Cell type:code id: tags:
``` python
model.evaluate([testAttrX, testImagesX], testY)
```
%% Output
2/2 [==============================] - 2s 204ms/step - loss: 101.3311
101.33109283447266
%% Cell type:code id: tags:
``` python
```
......
%% Cell type:markdown id: tags:
# CNN with transfer learning with no weights (efficientnet)
This notebook contains the configurations required to train an efficientnet model for K-folds.
It is possible to hit -0.6910 LB by tweaking parameters in this notebook!
https://www.kaggle.com/khoongweihao/k-fold-tf-efficientnet-models-training
%% Cell type:code id: tags:
``` python
import numpy as np
import pandas as pd
import os
import logging
logging.basicConfig(format='%(levelname)s:%(message)s', level=logging.INFO)
```
%% Cell type:markdown id: tags:
## A - Preprocessing : Reading Data
%% Cell type:code id: tags:
``` python
os.chdir('../')
```
%% Cell type:code id: tags:
``` python
from preprocessing.read_load_data import read_data
input_directory='../osic-pulmonary-fibrosis-progression'
train_df, test_df, sample_df = read_data(input_directory)
train_df.head()
```
%% Output
Patient Weeks FVC Percent Age Sex SmokingStatus
0 ID00007637202177411956430 -4 2315 58.253649 79 Male Ex-smoker
1 ID00007637202177411956430 5 2214 55.712129 79 Male Ex-smoker
2 ID00007637202177411956430 7 2061 51.862104 79 Male Ex-smoker
3 ID00007637202177411956430 9 2144 53.950679 79 Male Ex-smoker
4 ID00007637202177411956430 11 2069 52.063412 79 Male Ex-smoker
%% Cell type:markdown id: tags:
## B - Preprocessing : Loading Data
%% Cell type:code id: tags:
``` python
patients_train_ids= train_df.Patient.unique()
patient_test_list= test_df.Patient.unique()
patients_train_ids = [pat for pat in patients_train_ids]
```
%% Cell type:code id: tags:
``` python
from preprocessing.read_load_data import load_images
logging.info("loading attributes...")
df = pd.read_csv(f'{input_directory}/train.csv')
df = df.drop_duplicates(subset = 'Patient', keep='first')
patients_train_ids= df.Patient.unique().tolist()
df = df[df['Patient'].isin(patients_train_ids)]
logging.info("loading images...")
images = load_images(input_directory,
'train',
patients_train_ids,
option='superposition',
outputH = 240,
outputW = 240)
print("Array shape: ", images.shape)
#check value between -1,1
print('min value: ', np.amin(images))
print('max value: ', np.amax(images))
```
%% Output
INFO:loading attributes...
INFO:loading images...
Array shape: (176, 240, 240, 4)
min value: -0.1251496147096971
max value: 0.1692184837618322
%% Cell type:code id: tags:
``` python
df.head()
```
%% Output
Patient Weeks FVC Percent Age Sex \
0 ID00007637202177411956430 -4 2315 58.253649 79 Male
9 ID00009637202177434476278 8 3660 85.282878 69 Male
18 ID00010637202177584971671 0 3523 94.724672 60 Male
27 ID00011637202177653955184 6 3326 85.987590 72 Male
36 ID00012637202177665765362 33 3418 93.726006 65 Male
... ... ... ... ... ... ...
1504 ID00419637202311204720264 6 3020 70.186855 73 Male
1513 ID00421637202311550012437 15 2739 82.045291 68 Male
1523 ID00422637202311677017371 6 1930 76.672493 73 Male
1531 ID00423637202312137826377 17 3294 79.258903 72 Male
1540 ID00426637202313170790466 0 2925 71.824968 73 Male
SmokingStatus
0 Ex-smoker
9 Ex-smoker
18 Ex-smoker
27 Ex-smoker
36 Never smoked
... ...
1504 Ex-smoker
1513 Ex-smoker
1523 Ex-smoker
1531 Ex-smoker
1540 Never smoked
[176 rows x 7 columns]
%% Cell type:markdown id: tags:
## C - Preprocessing : shuffle
%% Cell type:code id: tags:
``` python
from sklearn.model_selection import train_test_split
split = train_test_split(df, images, test_size=0.2, random_state=42)
(trainAttrX, testAttrX, trainImagesX, testImagesX) = split
```
%% Cell type:markdown id: tags:
## D - Preprocessing : Scaling + Encoding
%% Cell type:code id: tags:
``` python
from preprocessing.scale_data import scale_variable
sc, trainAttrX, testAttrX = scale_variable(trainAttrX, testAttrX,'FVC')
trainY = trainAttrX.loc[:,'FVC_scaled']
testY = testAttrX.loc[:,'FVC_scaled']
```
%% Cell type:markdown id: tags:
## E - Processing : Create models
%% Cell type:code id: tags:
``` python
from processing.models import create_transfer_learning
# Use tf.keras consistently: the original imported Adam from standalone
# `keras.optimizers` while using `tensorflow.keras` models — mixing the two
# backends is fragile and can break optimizer/model compatibility.
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model
import efficientnet.tfkeras as efn

# EfficientNetB1 backbone with randomly initialised weights (no transfer
# of ImageNet weights in this variant).
new_model = efn.EfficientNetB1(weights=None, include_top=False)
input_channel = 4
# Rebuild the network from its config with the input layer patched to accept
# 4-channel 240x240 images (the stacked CT representation).
config = new_model.get_config()
config["layers"][0]["config"]["batch_input_shape"] = (None, 240, 240, input_channel)
# Name of the first layer after the input, whose weights must be adapted
# to the extra channel inside create_transfer_learning.
modify_name = config["layers"][1]["config"]["name"]
custom_model = Model.from_config(config)
model = create_transfer_learning(new_model, custom_model, modify_name, input_channel)
# `learning_rate` replaces the deprecated `lr` keyword.
opt = Adam(learning_rate=1e-3, decay=1e-3 / 200)
model.compile(loss="mean_absolute_percentage_error", optimizer=opt)
```
%% Cell type:code id: tags:
``` python
%%time
# Train the image-only CNN (no tabular inputs in this variant).
# NOTE(review): val_loss is flat around ~100 for all 20 epochs (see output
# below) — the model does not appear to be learning; revisit architecture
# or learning rate.
hist = model.fit(x=trainImagesX, y=trainY,
validation_data=(testImagesX, testY),
epochs=20, batch_size=8)
```
%% Output
Epoch 1/20
18/18 [==============================] - 45s 914ms/step - loss: 99.7389 - val_loss: 100.4144
Epoch 2/20
18/18 [==============================] - 13s 757ms/step - loss: 98.7043 - val_loss: 100.6795
Epoch 3/20
18/18 [==============================] - 14s 813ms/step - loss: 99.9509 - val_loss: 100.7490
Epoch 4/20
18/18 [==============================] - 11s 632ms/step - loss: 98.1777 - val_loss: 100.8061
Epoch 5/20
18/18 [==============================] - 9s 498ms/step - loss: 99.5343 - val_loss: 100.7976
Epoch 6/20
18/18 [==============================] - 9s 504ms/step - loss: 99.6414 - val_loss: 100.7194
Epoch 7/20
18/18 [==============================] - 9s 482ms/step - loss: 98.7556 - val_loss: 100.6924
Epoch 8/20
18/18 [==============================] - 8s 469ms/step - loss: 99.1854 - val_loss: 100.7468
Epoch 9/20
18/18 [==============================] - 9s 509ms/step - loss: 98.0266 - val_loss: 100.8150
Epoch 10/20
18/18 [==============================] - 9s 528ms/step - loss: 98.6620 - val_loss: 100.6718
Epoch 11/20
18/18 [==============================] - 8s 474ms/step - loss: 97.1374 - val_loss: 100.6786
Epoch 12/20
18/18 [==============================] - 9s 494ms/step - loss: 99.4677 - val_loss: 100.6250
Epoch 13/20
18/18 [==============================] - 9s 491ms/step - loss: 99.0034 - val_loss: 100.6607
Epoch 14/20
18/18 [==============================] - 9s 477ms/step - loss: 98.6438 - val_loss: 100.6357
Epoch 15/20
18/18 [==============================] - 8s 469ms/step - loss: 99.4131 - val_loss: 100.5815
Epoch 16/20
18/18 [==============================] - 9s 500ms/step - loss: 98.8292 - val_loss: 100.6375
Epoch 17/20
18/18 [==============================] - 8s 448ms/step - loss: 99.5671 - val_loss: 100.6383
Epoch 18/20
18/18 [==============================] - 8s 452ms/step - loss: 99.8388 - val_loss: 100.7089
Epoch 19/20
18/18 [==============================] - 8s 452ms/step - loss: 97.6967 - val_loss: 100.8448
Epoch 20/20
18/18 [==============================] - 8s 448ms/step - loss: 96.8201 - val_loss: 100.8306
CPU times: user 17min 3s, sys: 59.7 s, total: 18min 3s
Wall time: 3min 53s
%% Cell type:code id: tags:
``` python
from postprocessing.plot_history import plot_history
plot_history(hist)
```
%% Output
%% Cell type:markdown id: tags:
# F - Evaluation
%% Cell type:code id: tags:
``` python
from postprocessing.evaluate import evaluate_cnn, compute_score
```
%% Output
---------------------------------------------------------------------------
ModuleNotFoundError Traceback (most recent call last)
<ipython-input-1-097459c72699> in <module>
----> 1 from postprocessing.evaluate import evaluate_cnn, compute_score
ModuleNotFoundError: No module named 'postprocessing'
%% Cell type:code id: tags:
``` python
preds = evaluate_cnn(model, df, trainImagesX, trainY, sc)
conf, score = compute_score(trainY,preds.flatten())
print('competition score :', score)
```
%% Output
INFO:predicting ...
avg. FVC: 2771.744318181818, std FVC 835.5745106360505
mean difference : 26.66%, std: 27.72%
competition score : -4.610712300068822
%% Cell type:code id: tags:
``` python
model.evaluate(trainImagesX, trainY)
```
%% Output
5/5 [==============================] - 8s 1s/step - loss: 98.6242
98.62422180175781
%% Cell type:markdown id: tags:
### Test set
%% Cell type:code id: tags:
``` python
preds = evaluate_cnn(model, df, testImagesX, testY, sc)
conf, score = compute_score(testY,preds.flatten())
print('competition score :', score)
```
%% Output
INFO:predicting ...
avg. FVC: 2771.744318181818, std FVC 835.5745106360505
mean difference : 31.99%, std: 26.69%
competition score : -4.6117394238573475
%% Cell type:code id: tags:
``` python
model.evaluate(testImagesX, testY)
```
%% Output
2/2 [==============================] - 2s 179ms/step - loss: 100.8306
100.83056640625
......
......@@ -115,3 +115,32 @@ def load_images(input_directory,
images.append(outputImage)
return np.array(images)
def create_dataframe(df):
    """Build a training dataframe with one row per (patient, follow-up visit).

    Each output row pairs a patient's first recorded measurement (baseline)
    with one of their later visits, so a model can learn to predict a future
    FVC from the baseline and the elapsed time.

    Args:
        df: dataframe with columns Patient, Weeks, FVC, Percent, Age, Sex,
            SmokingStatus — one row per measurement, visits in order.

    Returns:
        DataFrame with columns PatientID, Age, Sex, SmokingStatus,
        First_FVC, First_Percent, Delta_week, Target_FVC.
    """
    train_data = []
    for patient in df.Patient.unique().tolist():
        # All measurements for this patient, in their original order.
        patient_data = df[df['Patient'] == patient]
        # Baseline: the patient's first recorded visit (all 7 columns).
        first_measure = list(patient_data.iloc[0, :].values)
        # For each later visit, record the baseline plus the target week/FVC.
        for i, week in enumerate(patient_data['Weeks'].iloc[1:]):
            # Access FVC by column name instead of the fragile hard-coded
            # positional index 2 used previously.
            fvc = patient_data['FVC'].iloc[i + 1]
            train_data.append(first_measure + [week, fvc])
    training_df = pd.DataFrame(train_data)
    training_df.columns = ['PatientID', 'First_week', 'First_FVC', 'First_Percent',
                           'Age', 'Sex', 'SmokingStatus', 'target_week', 'Target_FVC']
    # Time elapsed between the baseline visit and the target visit.
    training_df['Delta_week'] = training_df['target_week'] - training_df['First_week']
    # Keep only the features used for training, in a fixed order.
    training_df = training_df[['PatientID', 'Age', 'Sex', 'SmokingStatus',
                               'First_FVC', 'First_Percent', 'Delta_week', 'Target_FVC']]
    return training_df
......@@ -110,7 +110,7 @@ def weightify(model_orig, custom_model, layer_modify,input_channel):
target_layer= Conv2D(32, (3, 3), activation='relu', padding='valid',use_bias=False)
input_shape = TensorShape([None, 240, 240, 4]) # to define h, w, c based on shape of layer input
input_shape = TensorShape([None, 240, 240, input_channel]) # to define h, w, c based on shape of layer input
target_layer.build(input_shape)
target_layer.set_weights([kernels_extra_channel])
#target_layer.set_weights([kernels_extra_channel, biases])
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment