Commit f3122ca1 authored by Bannier Delphine's avatar Bannier Delphine

Merge branch 'DelphineBranch' into 'master'

Delphine branch

See merge request !15
parents 911189dd ef6cea31
%% Cell type:code id: tags:
``` python
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
import os
from pathlib import Path
from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_error
```
%% Cell type:code id: tags:
``` python
input_directory='../osic-pulmonary-fibrosis-progression'
```
%% Cell type:code id: tags:
``` python
# Reading data
test_df = pd.read_csv(f'{input_directory}/test.csv')
sample_sub= pd.read_csv(f'{input_directory}/sample_submission.csv')
train_df= pd.read_csv(f'{input_directory}/train.csv')
train_df.head()
```
%%%% Output: execute_result
%%%% Output: error
Patient Weeks FVC Percent Age Sex SmokingStatus
0 ID00007637202177411956430 -4 2315 58.253649 79 Male Ex-smoker
1 ID00007637202177411956430 5 2214 55.712129 79 Male Ex-smoker
2 ID00007637202177411956430 7 2061 51.862104 79 Male Ex-smoker
3 ID00007637202177411956430 9 2144 53.950679 79 Male Ex-smoker
4 ID00007637202177411956430 11 2069 52.063412 79 Male Ex-smoker
---------------------------------------------------------------------------
FileNotFoundError Traceback (most recent call last)
<ipython-input-3-95e5e9a2d8bf> in <module>
1 # Reading data
----> 2 test_df = pd.read_csv(f'{input_directory}/test.csv')
3 sample_sub= pd.read_csv(f'{input_directory}/sample_submission.csv')
4 train_df= pd.read_csv(f'{input_directory}/train.csv')
5 train_df.head()
~/anaconda3/lib/python3.8/site-packages/pandas/io/parsers.py in parser_f(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, squeeze, prefix, mangle_dupe_cols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, skipfooter, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, dayfirst, cache_dates, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, doublequote, escapechar, comment, encoding, dialect, error_bad_lines, warn_bad_lines, delim_whitespace, low_memory, memory_map, float_precision)
674 )
675
--> 676 return _read(filepath_or_buffer, kwds)
677
678 parser_f.__name__ = name
~/anaconda3/lib/python3.8/site-packages/pandas/io/parsers.py in _read(filepath_or_buffer, kwds)
446
447 # Create the parser.
--> 448 parser = TextFileReader(fp_or_buf, **kwds)
449
450 if chunksize or iterator:
~/anaconda3/lib/python3.8/site-packages/pandas/io/parsers.py in __init__(self, f, engine, **kwds)
878 self.options["has_index_names"] = kwds["has_index_names"]
879
--> 880 self._make_engine(self.engine)
881
882 def close(self):
~/anaconda3/lib/python3.8/site-packages/pandas/io/parsers.py in _make_engine(self, engine)
1112 def _make_engine(self, engine="c"):
1113 if engine == "c":
-> 1114 self._engine = CParserWrapper(self.f, **self.options)
1115 else:
1116 if engine == "python":
~/anaconda3/lib/python3.8/site-packages/pandas/io/parsers.py in __init__(self, src, **kwds)
1889 kwds["usecols"] = self.usecols
1890
-> 1891 self._reader = parsers.TextReader(src, **kwds)
1892 self.unnamed_cols = self._reader.unnamed_cols
1893
pandas/_libs/parsers.pyx in pandas._libs.parsers.TextReader.__cinit__()
pandas/_libs/parsers.pyx in pandas._libs.parsers.TextReader._setup_parser_source()
FileNotFoundError: [Errno 2] File ../osic-pulmonary-fibrosis-progression/test.csv does not exist: '../osic-pulmonary-fibrosis-progression/test.csv'
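The traceback above simply means that `input_directory` does not point at the downloaded competition data. As a minimal sketch (not part of the original notebook), a quick existence check before any `read_csv` call makes the failure explicit:

``` python
from pathlib import Path

data_dir = Path(input_directory)  # '../osic-pulmonary-fibrosis-progression' by default
expected = ['train.csv', 'test.csv', 'sample_submission.csv']
missing = [name for name in expected if not (data_dir / name).exists()]
if missing:
    # fail early with a clear message instead of a deep pandas traceback
    raise FileNotFoundError(f"Missing in {data_dir.resolve()}: {missing}; "
                            "point input_directory at the competition data")
```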
%% Cell type:code id: tags:
``` python
```
%% Cell type:code id: tags:
``` python
patients_train_ids= train_df.Patient.unique()
print(len(patients_train_ids))
```
%%%% Output: stream
176
%% Cell type:code id: tags:
``` python
le = LabelEncoder()
le.fit(train_df['Sex'])
print(list(le.classes_))
train_df['Sex']=le.transform(train_df['Sex'])
test_df['Sex']=le.transform(test_df['Sex'])
```
%%%% Output: stream
['Female', 'Male']
%% Cell type:code id: tags:
``` python
le = LabelEncoder()
le.fit(train_df['SmokingStatus'])
print(list(le.classes_))
train_df['SmokingStatus']=le.transform(train_df['SmokingStatus'])
test_df['SmokingStatus']=le.transform(test_df['SmokingStatus'])
```
%%%% Output: stream
['Currently smokes', 'Ex-smoker', 'Never smoked']
%% Cell type:code id: tags:
``` python
# build a new dataframe with one row per (patient, follow-up visit) for training
train_data = []
for patient in patients_train_ids:
    # select all measurements for this patient
    patientData = train_df[train_df['Patient'] == patient]
    # save the first measurement as the baseline
    firstMeasure = list(patientData.iloc[0, :].values)
    # for each later visit, append the baseline, the visit week and an FVC value (taken from row i)
    for i, week in enumerate(patientData['Weeks'].iloc[1:]):
        fvc, fvc_pctg = patientData.iloc[i, 2], patientData.iloc[i, 3]
        trainDataPoint = firstMeasure + [week, fvc]
        train_data.append(trainDataPoint)
training_df = pd.DataFrame(train_data)
training_df.columns = ['PatientID', 'First_week', 'First_FVC', 'First_Percent', 'Age', 'Sex', 'SmokingStatus'] + ['target_week', 'Target_FVC']
training_df['Delta_week'] = training_df['target_week'] - training_df['First_week']
# rearrange columns
training_df = training_df[['PatientID','Age','Sex','SmokingStatus', 'First_FVC', 'First_Percent','Delta_week','Target_FVC']]
```
%% Cell type:code id: tags:
``` python
training_df
```
%%%% Output: execute_result
PatientID Age Sex SmokingStatus First_FVC \
0 ID00007637202177411956430 79 1 1 2315
1 ID00007637202177411956430 79 1 1 2315
2 ID00007637202177411956430 79 1 1 2315
3 ID00007637202177411956430 79 1 1 2315
4 ID00007637202177411956430 79 1 1 2315
... ... ... ... ... ...
1368 ID00426637202313170790466 73 1 2 2925
1369 ID00426637202313170790466 73 1 2 2925
1370 ID00426637202313170790466 73 1 2 2925
1371 ID00426637202313170790466 73 1 2 2925
1372 ID00426637202313170790466 73 1 2 2925
First_Percent Delta_week Target_FVC
0 58.253649 9 2315
1 58.253649 11 2214
2 58.253649 13 2061
3 58.253649 15 2144
4 58.253649 21 2069
... ... ... ...
1368 71.824968 13 2976
1369 71.824968 19 2712
1370 71.824968 31 2978
1371 71.824968 43 2908
1372 71.824968 59 2975
[1373 rows x 8 columns]
%% Cell type:code id: tags:
``` python
patient_test_list = test_df.Patient.unique()
patients_train_ids = [pat for pat in patients_train_ids if pat not in patient_test_list]
print(len(patients_train_ids))
```
%%%% Output: stream
171
%% Cell type:code id: tags:
``` python
reduced_test = training_df[training_df.PatientID.isin(patient_test_list)]
training_df = training_df[training_df.PatientID.isin(patients_train_ids)]
```
%% Cell type:code id: tags:
``` python
sample_sub.head()  # what the submission must look like
```
%%%% Output: execute_result
Patient_Week FVC Confidence
0 ID00419637202311204720264_-12 2000 100
1 ID00421637202311550012437_-12 2000 100
2 ID00422637202311677017371_-12 2000 100
3 ID00423637202312137826377_-12 2000 100
4 ID00426637202313170790466_-12 2000 100
%% Cell type:code id: tags:
``` python
sample_sub[['Patient','Week']] = sample_sub['Patient_Week'].str.split('_',expand=True)
```
%% Cell type:code id: tags:
``` python
sample_sub.head(3)
```
%%%% Output: execute_result
Patient_Week FVC Confidence Patient \
0 ID00419637202311204720264_-12 2000 100 ID00419637202311204720264
1 ID00421637202311550012437_-12 2000 100 ID00421637202311550012437
2 ID00422637202311677017371_-12 2000 100 ID00422637202311677017371
Week
0 -12
1 -12
2 -12
%% Cell type:code id: tags:
``` python
test_df
```
%%%% Output: execute_result
Patient Weeks FVC Percent Age Sex SmokingStatus
0 ID00419637202311204720264 6 3020 70.186855 73 1 1
1 ID00421637202311550012437 15 2739 82.045291 68 1 1
2 ID00422637202311677017371 6 1930 76.672493 73 1 1
3 ID00423637202312137826377 17 3294 79.258903 72 1 1
4 ID00426637202313170790466 0 2925 71.824968 73 1 2
%% Cell type:markdown id: tags:
We will predict, for each patient in the test set, their FVC for the weeks that appear in the sample submission for them (this is a requirement of the competition):
"For each Patient_Week from the submission file, you must predict the FVC and a confidence"
%% Cell type:code id: tags:
``` python
# create testing data
testData = []
for p in patient_test_list:
    patientData = test_df[test_df['Patient'] == p]
    # baseline measurement for this patient
    firstMeasure = list(patientData.iloc[0, :].values)
    # all Patient_Weeks requested in the sample submission for this patient
    s_patient = sample_sub[sample_sub['Patient'] == p]
    allweeks = s_patient['Week'].tolist()
    for week in allweeks:
        testDataPoint = firstMeasure + [week]
        testData.append(testDataPoint)
testData = pd.DataFrame(testData)
testData.columns = ['PatientID', 'first_week', 'First_FVC', 'First_Percent', 'Age', 'Sex', 'SmokingStatus'] + ['Target_week']
testData['Delta_week'] = testData['Target_week'].map(int) - testData['first_week']
# rearrange columns
testData = testData[['PatientID','Age','Sex','SmokingStatus', 'First_FVC', 'First_Percent','Delta_week','Target_week']]
```
%% Cell type:code id: tags:
``` python
testData.head(3)
```
%%%% Output: execute_result
PatientID Age Sex SmokingStatus First_FVC \
0 ID00419637202311204720264 73 1 1 3020
1 ID00419637202311204720264 73 1 1 3020
2 ID00419637202311204720264 73 1 1 3020
First_Percent Delta_week Target_week
0 70.186855 -18 -12
1 70.186855 -17 -11
2 70.186855 -16 -10
%% Cell type:markdown id: tags:
# Model
%% Cell type:code id: tags:
``` python
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.neighbors import KNeighborsRegressor
model = LinearRegression()
model.fit(training_df.drop(columns = ['PatientID', 'Target_FVC']), training_df['Target_FVC'])
print('Training accuracy:',model.score(training_df.drop(columns = ['PatientID', 'Target_FVC']), training_df['Target_FVC']))
print('Test accuracy:', model.score(reduced_test.drop(columns = ['PatientID', 'Target_FVC']), reduced_test['Target_FVC']))
prediction_test = model.predict(testData.drop(columns = ['PatientID','Target_week']))
prediction_train =model.predict(training_df.drop(columns = ['PatientID', 'Target_FVC']))
prediction_reduced = model.predict(reduced_test.drop(columns = ['PatientID', 'Target_FVC']))
```
%%%% Output: stream
Training accuracy: 0.9436964004184819
Test accuracy: 0.5474056880757807
%% Cell type:code id: tags:
``` python
from sklearn.ensemble import RandomForestRegressor
regr = RandomForestRegressor(max_depth=6, random_state=0)
regr.fit(training_df.drop(columns = ['PatientID', 'Target_FVC']), training_df['Target_FVC'])
print('Training accuracy:',regr.score(training_df.drop(columns = ['PatientID', 'Target_FVC']), training_df['Target_FVC']))
print('Test accuracy:', regr.score(reduced_test.drop(columns = ['PatientID', 'Target_FVC']), reduced_test['Target_FVC']))
```
%%%% Output: stream
Training accuracy: 0.9709061975020706
Test accuracy: 0.42720163750014606
%% Cell type:code id: tags:
``` python
from sklearn.svm import SVR
svr_rbf = SVR(kernel='rbf')
svr_rbf.fit(training_df.drop(columns = ['PatientID', 'Target_FVC']), training_df['Target_FVC'])
print('Training accuracy:',svr_rbf.score(training_df.drop(columns = ['PatientID', 'Target_FVC']), training_df['Target_FVC']))
print('Test accuracy:', svr_rbf.score(reduced_test.drop(columns = ['PatientID', 'Target_FVC']), reduced_test['Target_FVC']))
```
%%%% Output: stream
Training accuracy: 0.2596115855061234
Test accuracy: 0.309766996737311
%% Cell type:markdown id: tags:
# Evaluation
%% Cell type:markdown id: tags:
This competition is evaluated on a modified version of the Laplace Log Likelihood. In medical applications, it is useful to evaluate a model's confidence in its decisions; accordingly, the metric reflects both the accuracy and the certainty of each prediction. For each true FVC measurement, you predict both an FVC and a confidence measure (a standard deviation $\sigma$). The metric is computed as:
$\sigma_{clipped} = \max(\sigma, 70)$,
$\Delta = \min(|FVC_{true} - FVC_{predicted}|, 1000)$,
$metric = -\frac{\sqrt{2}\,\Delta}{\sigma_{clipped}} - \ln(\sqrt{2}\,\sigma_{clipped})$.
The final score is the average of the metric over all test-set Patient_Weeks (three per patient). Metric values are negative, and higher is better.
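As a reference, here is a direct transcription of the formulas above (a sketch; it assumes `fvc_true`, `fvc_pred` and a per-row confidence `sigma` are NumPy arrays, unlike the `compute_score` helper below, which reuses the residual as its confidence):

``` python
import numpy as np

def laplace_log_likelihood(fvc_true, fvc_pred, sigma):
    # clip the confidence and the absolute error as in the competition metric
    sigma_clipped = np.maximum(sigma, 70)
    delta = np.minimum(np.abs(fvc_true - fvc_pred), 1000)
    metric = -np.sqrt(2) * delta / sigma_clipped - np.log(np.sqrt(2) * sigma_clipped)
    return np.mean(metric)
```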
%% Cell type:code id: tags:
``` python
import math

def compute_score(y_true, y_pred):
    # NOTE: the confidence is taken here as the raw residual (y_true - y_pred),
    # not a proper standard deviation; this makes the score depend on the
    # argument order (see the two calls below)
    sigma = y_true - y_pred
    fvc_pred = y_pred
    sigma_clip = np.maximum(sigma, 70)
    delta = np.minimum(abs(y_true - fvc_pred), 1000)
    sq2 = math.sqrt(2)
    metric = -(delta / sigma_clip) * sq2 - np.log(sigma_clip * sq2)
    return (sigma, np.mean(metric))
```
%% Cell type:code id: tags:
``` python
#training set
conf,score = compute_score(prediction_train,training_df['Target_FVC'])
print(score)
```
%%%% Output: stream
-6.712895211485835
%% Cell type:code id: tags:
``` python
conf,score = compute_score(training_df['Target_FVC'],prediction_train)
score
```
%%%% Output: execute_result
-6.839608556542642
%% Cell type:code id: tags:
``` python
#test set
conf,scoretest = compute_score(prediction_reduced,reduced_test['Target_FVC'])
print(scoretest)
conf,scoretest = compute_score(reduced_test['Target_FVC'],prediction_reduced)
print(scoretest)
```
%%%% Output: stream
-6.661457405359661
-7.871138758554059
%% Cell type:code id: tags:
``` python
sub = []
for i in range(testData.shape[0]):
    patient, week, pred = testData.loc[i, 'PatientID'], testData.loc[i, 'Target_week'], prediction_test[i]
    confidence = 0  # TBD
    sub.append([patient + '_' + str(week), pred, confidence])
sub = pd.DataFrame(sub)
sub.columns = ['Patient_Week', 'FVC_pred', 'Confidence']
```
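The `Confidence` column is left at 0 above. One simple option (not what is done in this notebook, just a hedged sketch) is a single constant σ estimated from the held-out residuals, clipped at the metric's floor of 70:

``` python
# hypothetical: derive one constant confidence from the held-out patients
residuals = reduced_test['Target_FVC'].values - prediction_reduced
constant_sigma = max(np.std(residuals), 70)
sub['Confidence'] = constant_sigma
```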
%% Cell type:code id: tags:
``` python
sub
```
%%%% Output: execute_result
Patient_Week FVC_pred Confidence
0 ID00419637202311204720264_-12 3040.606972 0
1 ID00419637202311204720264_-11 3037.872085 0
2 ID00419637202311204720264_-10 3035.137199 0
3 ID00419637202311204720264_-9 3032.402312 0
4 ID00419637202311204720264_-8 3029.667425 0
.. ... ... ...
725 ID00426637202313170790466_129 2532.885281 0
726 ID00426637202313170790466_130 2530.150395 0
727 ID00426637202313170790466_131 2527.415508 0
728 ID00426637202313170790466_132 2524.680621 0
729 ID00426637202313170790466_133 2521.945735 0
[730 rows x 3 columns]
%% Cell type:code id: tags:
``` python
patient_test_list = patient_test_list.tolist()
```
%% Cell type:code id: tags:
``` python
train_df_ini= pd.read_csv(f'{input_directory}/train.csv')
train_df_ini[train_df_ini.Patient.isin(patient_test_list)]
```
%%%% Output: execute_result
Patient Weeks FVC Percent Age Sex \
1504 ID00419637202311204720264 6 3020 70.186855 73 Male
1505 ID00419637202311204720264 7 2859 66.445106 73 Male
1506 ID00419637202311204720264 9 2783 64.678814 73 Male
1507 ID00419637202311204720264 10 2719 63.191410 73 Male
1508 ID00419637202311204720264 13 2738 63.632983 73 Male
1509 ID00419637202311204720264 18 2694 62.610393 73 Male
1510 ID00419637202311204720264 31 2708 62.935763 73 Male
1511 ID00419637202311204720264 43 2793 64.911221 73 Male
1512 ID00419637202311204720264 59 2727 63.377336 73 Male
1513 ID00421637202311550012437 15 2739 82.045291 68 Male
1514 ID00421637202311550012437 17 2756 82.554517 68 Male
1515 ID00421637202311550012437 19 2755 82.524563 68 Male
1516 ID00421637202311550012437 21 2820 84.471603 68 Male
1517 ID00421637202311550012437 23 2853 85.460101 68 Male
1518 ID00421637202311550012437 29 2716 81.356338 68 Male
1519 ID00421637202311550012437 41 2833 84.861011 68 Male
1520 ID00421637202311550012437 54 2771 83.003834 68 Male
1521 ID00421637202311550012437 70 2628 78.720345 68 Male
1522 ID00421637202311550012437 70 2719 81.446202 68 Male
1523 ID00422637202311677017371 6 1930 76.672493 73 Male
1524 ID00422637202311677017371 11 1936 76.910853 73 Male
1525 ID00422637202311677017371 13 1955 77.665660 73 Male
1526 ID00422637202311677017371 15 1848 73.414905 73 Male
1527 ID00422637202311677017371 17 1897 75.361513 73 Male
1528 ID00422637202311677017371 23 1946 77.308120 73 Male
1529 ID00422637202311677017371 35 1862 73.971079 73 Male
1530 ID00422637202311677017371 47 1713 68.051804 73 Male
1531 ID00423637202312137826377 17 3294 79.258903 72 Male
1532 ID00423637202312137826377 18 2777 66.819057 72 Male
1533 ID00423637202312137826377 19 2700 64.966314 72 Male
1534 ID00423637202312137826377 21 3014 72.521655 72 Male
1535 ID00423637202312137826377 23 2661 64.027911 72 Male
1536 ID00423637202312137826377 30 2778 66.843118 72 Male
1537 ID00423637202312137826377 42 2516 60.538980 72 Male
1538 ID00423637202312137826377 53 2432 58.517806 72 Male
1539 ID00423637202312137826377 70 2578 62.030799 72 Male
1540 ID00426637202313170790466 0 2925 71.824968 73 Male
1541 ID00426637202313170790466 7 2903 71.284746 73 Male
1542 ID00426637202313170790466 9 2916 71.603968 73 Male
1543 ID00426637202313170790466 11 2976 73.077301 73 Male
1544 ID00426637202313170790466 13 2712 66.594637 73 Male
1545 ID00426637202313170790466 19 2978 73.126412 73 Male
1546 ID00426637202313170790466 31 2908 71.407524 73 Male
1547 ID00426637202313170790466 43 2975 73.052745 73 Male
1548 ID00426637202313170790466 59 2774 68.117081 73 Male
SmokingStatus
1504 Ex-smoker
1505 Ex-smoker
1506 Ex-smoker
1507 Ex-smoker
1508 Ex-smoker
1509 Ex-smoker
1510 Ex-smoker
1511 Ex-smoker
1512 Ex-smoker
1513 Ex-smoker
1514 Ex-smoker
1515 Ex-smoker
1516 Ex-smoker
1517 Ex-smoker
1518 Ex-smoker
1519 Ex-smoker
1520 Ex-smoker
1521 Ex-smoker
1522 Ex-smoker
1523 Ex-smoker
1524 Ex-smoker
1525 Ex-smoker
1526 Ex-smoker
1527 Ex-smoker
1528 Ex-smoker
1529 Ex-smoker
1530 Ex-smoker
1531 Ex-smoker
1532 Ex-smoker
1533 Ex-smoker
1534 Ex-smoker
1535 Ex-smoker
1536 Ex-smoker
1537 Ex-smoker
1538 Ex-smoker
1539 Ex-smoker
1540 Never smoked
1541 Never smoked
1542 Never smoked
1543 Never smoked
1544 Never smoked
1545 Never smoked
1546 Never smoked
1547 Never smoked
1548 Never smoked
%% Cell type:code id: tags:
``` python
df1 = sub.sort_values(by=['Patient_Week'], ascending=True).reset_index(drop=True)
```
%% Cell type:code id: tags:
``` python
df1
```
%%%% Output: execute_result
Patient_Week FVC_pred Confidence
0 ID00419637202311204720264_-1 3010.523218 0
1 ID00419637202311204720264_-10 3035.137199 0
2 ID00419637202311204720264_-11 3037.872085 0
3 ID00419637202311204720264_-12 3040.606972 0
4 ID00419637202311204720264_-2 3013.258105 0
.. ... ... ...
725 ID00426637202313170790466_95 2625.871428 0
726 ID00426637202313170790466_96 2623.136542 0
727 ID00426637202313170790466_97 2620.401655 0
728 ID00426637202313170790466_98 2617.666768 0
729 ID00426637202313170790466_99 2614.931882 0
[730 rows x 3 columns]
%% Cell type:code id: tags:
``` python
```
......
......@@ -143,4 +143,4 @@ def create_dataframe(df):
#rearrange columns
training_df = training_df[['PatientID','Age','Sex','SmokingStatus', 'First_FVC', 'First_Percent','Delta_week','Target_FVC']]
return training_df
return training_df
\ No newline at end of file
......@@ -10,9 +10,11 @@ from tensorflow.keras.layers import Flatten
from tensorflow.keras.layers import Input
from tensorflow.keras.layers import GlobalAveragePooling2D
from tensorflow.keras.layers import concatenate
from tensorflow.keras.layers import GaussianNoise
from tensorflow import TensorShape
import numpy as np
from tensorflow import TensorShape
from sklearn.model_selection import train_test_split
def create_cnn(width, height, depth, filters=(32, 64, 128), regress=False):
......@@ -64,6 +66,15 @@ def create_mlp(dim, regress=True):
    # return our model
    return model

def create_mlp2(dim, regress=True):
    model = Sequential()
    model.add(GaussianNoise(0.2, input_shape=(dim,)))
    model.add(Dense(8, activation="relu"))
    model.add(Dense(4, activation="relu"))
    # single dense unit for regression output
    model.add(Dense(1))
    return model

def create_hybrid(nb_attributes, shape=(240,240,1)):
    # create cnn and mlp models
    mlp = create_mlp(nb_attributes)
......@@ -76,6 +87,7 @@ def create_hybrid(nb_attributes,shape=(240,240,1)):
def multify_weights(kernel, out_channels):
    # Expand weights dimension to match new input channels
    mean_1d = np.mean(kernel, axis=-2).reshape(kernel[:,:,-1:,:].shape)
......@@ -147,7 +159,21 @@ def create_hybrid_transfer(nb_attributes,new_model, custom_model, modify_name,in
    return model

def fit_and_evaluate(t_x, val_x, t_y, val_y, EPOCHS=30, BATCH_SIZE=8, model=None, es=None, cp=None):
    """
    `es`: EarlyStopping keras callback object
    `cp`: ModelCheckpoint keras callback object
    """
    mod = model
    results = mod.fit(t_x, t_y, epochs=EPOCHS, batch_size=BATCH_SIZE, callbacks=[es, cp],
                      verbose=1, validation_split=0.1)
    print("Val Score: ", mod.evaluate(val_x, val_y))
    return results

def create_regression(training_df_X, training_df_y, test_df_X, test_df_y, model):
    # fit a scikit-learn style regressor and report its score on train and test
    model.fit(training_df_X, training_df_y)
    print('Training accuracy :', model.score(training_df_X, training_df_y))
    print('Test accuracy :', model.score(test_df_X, test_df_y))