Commit 07570917 authored by Billy Amélie
Browse files

replace old regression

parent ac7a5aab
......@@ -8,25 +8,11 @@
"source": [
"import numpy as np\n",
"import pandas as pd\n",
"import math\n",
"from sklearn.preprocessing import LabelEncoder\n",
"import matplotlib.pyplot as plt\n",
"import os\n",
"from pathlib import Path\n",
"from sklearn.model_selection import KFold\n",
"from sklearn.metrics import mean_absolute_error\n",
"from sklearn.model_selection import train_test_split\n",
"import logging\n",
"logging.basicConfig(format='%(levelname)s:%(message)s', level=logging.INFO)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## A - Preprocessing : Reading Data"
]
},
{
"cell_type": "code",
"execution_count": 2,
......@@ -40,26 +26,6 @@
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"from preprocessing.read_load_data import *\n",
"from preprocessing.scale_data import *\n",
"from processing.models import *"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"input_directory='../osic-pulmonary-fibrosis-progression'"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
......@@ -155,213 +121,93 @@
"4 ID00007637202177411956430 11 2069 52.063412 79 Male Ex-smoker"
]
},
"execution_count": 5,
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Reading data\n",
"train_df, test_df, sample_sub = read_data(input_directory)\n",
"from preprocessing.read_load_data import read_data\n",
"\n",
"input_directory='../osic-pulmonary-fibrosis-progression'\n",
"train_df, test_df, sample_df = read_data(input_directory)\n",
"train_df.head()"
]
},
{
"cell_type": "code",
"execution_count": 6,
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Patient</th>\n",
" <th>Weeks</th>\n",
" <th>FVC</th>\n",
" <th>Percent</th>\n",
" <th>Age</th>\n",
" <th>Sex</th>\n",
" <th>SmokingStatus</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>ID00419637202311204720264</td>\n",
" <td>6</td>\n",
" <td>3020</td>\n",
" <td>70.186855</td>\n",
" <td>73</td>\n",
" <td>Male</td>\n",
" <td>Ex-smoker</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>ID00421637202311550012437</td>\n",
" <td>15</td>\n",
" <td>2739</td>\n",
" <td>82.045291</td>\n",
" <td>68</td>\n",
" <td>Male</td>\n",
" <td>Ex-smoker</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>ID00422637202311677017371</td>\n",
" <td>6</td>\n",
" <td>1930</td>\n",
" <td>76.672493</td>\n",
" <td>73</td>\n",
" <td>Male</td>\n",
" <td>Ex-smoker</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>ID00423637202312137826377</td>\n",
" <td>17</td>\n",
" <td>3294</td>\n",
" <td>79.258903</td>\n",
" <td>72</td>\n",
" <td>Male</td>\n",
" <td>Ex-smoker</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>ID00426637202313170790466</td>\n",
" <td>0</td>\n",
" <td>2925</td>\n",
" <td>71.824968</td>\n",
" <td>73</td>\n",
" <td>Male</td>\n",
" <td>Never smoked</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Patient Weeks FVC Percent Age Sex SmokingStatus\n",
"0 ID00419637202311204720264 6 3020 70.186855 73 Male Ex-smoker\n",
"1 ID00421637202311550012437 15 2739 82.045291 68 Male Ex-smoker\n",
"2 ID00422637202311677017371 6 1930 76.672493 73 Male Ex-smoker\n",
"3 ID00423637202312137826377 17 3294 79.258903 72 Male Ex-smoker\n",
"4 ID00426637202313170790466 0 2925 71.824968 73 Male Never smoked"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"outputs": [],
"source": [
"test_df"
"patients_train_ids= train_df.Patient.unique()\n",
"patient_test_list= test_df.Patient.unique()\n",
"patients_train_ids = [pat for pat in patients_train_ids]"
]
},
{
"cell_type": "markdown",
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"## B - Preprocessing : Encoding"
"df = pd.read_csv(f'{input_directory}/train.csv')\n",
"patients_train_ids= df.Patient.unique().tolist()"
]
},
{
"cell_type": "code",
"execution_count": 7,
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
"train_df, test_df = encode_variable(train_df, test_df,'Sex')\n",
"train_df, test_df = encode_variable(train_df, test_df,'SmokingStatus')\n",
"from sklearn.model_selection import train_test_split\n",
"\n",
"train_df.drop(columns = ['Sex','SmokingStatus'], inplace = True)\n",
"test_df.drop(columns = ['Sex','SmokingStatus'], inplace = True)"
"split = train_test_split(patients_train_ids, test_size=0.2, random_state=42)\n",
"(trainPatient, testPatient) = split"
]
},
{
"cell_type": "markdown",
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
"## C - Preprocessing : New variables"
"df_train = df[df.Patient.isin(trainPatient)].copy()\n",
"df_test = df[df.Patient.isin(testPatient)].copy()"
]
},
{
"cell_type": "code",
"execution_count": 8,
"execution_count": 15,
"metadata": {},
"outputs": [
{
"name": "stdout",
"name": "stderr",
"output_type": "stream",
"text": [
"176\n"
"INFO:NumExpr defaulting to 8 threads.\n"
]
}
],
"source": [
"patients_train_ids = train_df.Patient.unique()\n",
"print(len(patients_train_ids))"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
},
{
"name": "stderr",
"name": "stdout",
"output_type": "stream",
"text": [
"INFO:NumExpr defaulting to 8 threads.\n"
"(1093, 8) (280, 8)\n"
]
}
],
"source": [
"# new dataframe with one row per follow-up measurement of each patient, for training\n",
"\n",
"train_data = []\n",
"\n",
"for patient in patients_train_ids : \n",
" \n",
" #select all data related to a patient\n",
" patientData = train_df[train_df['Patient'] == patient]\n",
" # save first measurements\n",
" firstMeasure = list(patientData.iloc[0, :].values)\n",
" \n",
" #for each measurement, add first measurement and duration since first measurement\n",
" for i, week in enumerate(patientData['Weeks'].iloc[1:]):\n",
" fvc, fvc_pctg = patientData.iloc[i+1, 2], patientData.iloc[i+1, 3]\n",
" trainDataPoint = firstMeasure + [week, fvc]\n",
" train_data.append(trainDataPoint)\n",
"\n",
" \n",
"training_df = pd.DataFrame(train_data)\n",
"training_df.columns = ['PatientID', 'First_week', 'First_FVC', 'First_Percent', 'Age', 'Sex', 'SmokingStatus'] + ['target_week', 'Target_FVC']\n",
"training_df['Delta_week'] = training_df['target_week'] - training_df['First_week']\n",
"from preprocessing.read_load_data import create_dataframe\n",
"\n",
"#rearrange columns\n",
"training_df = training_df[['PatientID','Age','Sex','SmokingStatus', 'First_FVC', 'First_Percent','Delta_week','Target_FVC']]"
"trainAttrX = create_dataframe(df_train)\n",
"testAttrX = create_dataframe(df_test)\n",
"print(trainAttrX.shape, testAttrX.shape)"
]
},
{
"cell_type": "code",
"execution_count": 10,
"execution_count": 17,
"metadata": {},
"outputs": [
{
......@@ -400,8 +246,8 @@
" <th>0</th>\n",
" <td>ID00007637202177411956430</td>\n",
" <td>79</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>Male</td>\n",
" <td>Ex-smoker</td>\n",
" <td>2315</td>\n",
" <td>58.253649</td>\n",
" <td>9</td>\n",
......@@ -411,8 +257,8 @@
" <th>1</th>\n",
" <td>ID00007637202177411956430</td>\n",
" <td>79</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>Male</td>\n",
" <td>Ex-smoker</td>\n",
" <td>2315</td>\n",
" <td>58.253649</td>\n",
" <td>11</td>\n",
......@@ -422,96 +268,75 @@
" <th>2</th>\n",
" <td>ID00007637202177411956430</td>\n",
" <td>79</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>Male</td>\n",
" <td>Ex-smoker</td>\n",
" <td>2315</td>\n",
" <td>58.253649</td>\n",
" <td>13</td>\n",
" <td>2144</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>ID00007637202177411956430</td>\n",
" <td>79</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>2315</td>\n",
" <td>58.253649</td>\n",
" <td>15</td>\n",
" <td>2069</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>ID00007637202177411956430</td>\n",
" <td>79</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>2315</td>\n",
" <td>58.253649</td>\n",
" <td>21</td>\n",
" <td>2101</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" PatientID Age Sex SmokingStatus First_FVC \\\n",
"0 ID00007637202177411956430 79 1 1 2315 \n",
"1 ID00007637202177411956430 79 1 1 2315 \n",
"2 ID00007637202177411956430 79 1 1 2315 \n",
"3 ID00007637202177411956430 79 1 1 2315 \n",
"4 ID00007637202177411956430 79 1 1 2315 \n",
" PatientID Age Sex SmokingStatus First_FVC \\\n",
"0 ID00007637202177411956430 79 Male Ex-smoker 2315 \n",
"1 ID00007637202177411956430 79 Male Ex-smoker 2315 \n",
"2 ID00007637202177411956430 79 Male Ex-smoker 2315 \n",
"\n",
" First_Percent Delta_week Target_FVC \n",
"0 58.253649 9 2214 \n",
"1 58.253649 11 2061 \n",
"2 58.253649 13 2144 \n",
"3 58.253649 15 2069 \n",
"4 58.253649 21 2101 "
"2 58.253649 13 2144 "
]
},
"execution_count": 10,
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"training_df.head()"
"trainAttrX.head()"
]
},
{
"cell_type": "code",
"execution_count": 11,
"execution_count": 21,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"171\n"
]
}
],
"outputs": [],
"source": [
"patient_test_list = test_df.Patient.unique()\n",
"patients_train_ids = [pat for pat in patients_train_ids if not pat in patient_test_list]\n",
"print(len(patients_train_ids))"
"from preprocessing.scale_data import scale_variable\n",
"\n",
"sc, trainAttrX, testAttrX = scale_variable(trainAttrX, testAttrX,'Target_FVC')\n",
"sc1, trainAttrX, testAttrX = scale_variable(trainAttrX, testAttrX,'First_FVC')\n",
"sc2, trainAttrX, testAttrX = scale_variable(trainAttrX, testAttrX,'Age')\n",
"\n",
"trainY = trainAttrX.loc[:,'Target_FVC_scaled']\n",
"testY = testAttrX.loc[:,'Target_FVC_scaled']"
]
},
{
"cell_type": "code",
"execution_count": 12,
"execution_count": 22,
"metadata": {},
"outputs": [],
"source": [
"reduced_test = training_df[training_df.PatientID.isin(patient_test_list)]\n",
"training_df = training_df[training_df.PatientID.isin(patients_train_ids)]"
"from preprocessing.scale_data import encode_variable\n",
"\n",
"trainAttrX, testAttrX = encode_variable(trainAttrX, testAttrX,'Sex')\n",
"trainAttrX, testAttrX = encode_variable(trainAttrX, testAttrX,'SmokingStatus')\n",
"\n",
"for dft in [trainAttrX,testAttrX]:\n",
" dft.drop(columns = ['Sex','SmokingStatus','Target_FVC','Target_FVC_scaled',\n",
" 'PatientID','First_FVC','Age'], inplace = True)\n",
" dft.loc[:,'First_Percent'] = dft.loc[:,'First_Percent']/100\n",
" dft.loc[:,'Delta_week'] = dft.loc[:,'Delta_week']/133"
]
},
{
"cell_type": "code",
"execution_count": 13,
"execution_count": 23,
"metadata": {},
"outputs": [
{
......@@ -535,1483 +360,187 @@
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>PatientID</th>\n",
" <th>Age</th>\n",
" <th>Sex</th>\n",
" <th>SmokingStatus</th>\n",
" <th>First_FVC</th>\n",
" <th>First_Percent</th>\n",
" <th>Delta_week</th>\n",
" <th>Target_FVC</th>\n",
" <th>First_FVC_scaled</th>\n",
" <th>Age_scaled</th>\n",
" <th>Sex_le</th>\n",
" <th>SmokingStatus_le</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>ID00007637202177411956430</td>\n",
" <td>79</td>\n",
" <td>0.582536</td>\n",
" <td>0.067669</td>\n",
" <td>-0.631784</td>\n",
" <td>1.684379</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>2315</td>\n",
" <td>58.253649</td>\n",
" <td>9</td>\n",
" <td>2214</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>ID00007637202177411956430</td>\n",
" <td>79</td>\n",
" <td>0.582536</td>\n",
" <td>0.082707</td>\n",
" <td>-0.631784</td>\n",
" <td>1.684379</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>2315</td>\n",
" <td>58.253649</td>\n",
" <td>11</td>\n",
" <td>2061</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>ID00007637202177411956430</td>\n",
" <td>79</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>2315</td>\n",
" <td>58.253649</td>\n",
" <td>13</td>\n",
" <td>2144</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>ID00007637202177411956430</td>\n",
" <td>79</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>2315</td>\n",
" <td>58.253649</td>\n",
" <td>15</td>\n",
" <td>2069</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>ID00007637202177411956430</td>\n",
" <td>79</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>2315</td>\n",
" <td>58.253649</td>\n",
" <td>21</td>\n",
" <td>2101</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1328</th>\n",
" <td>ID00417637202310901214011</td>\n",
" <td>66</td>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" <td>3357</td>\n",
" <td>82.247158</td>\n",
" <td>7</td>\n",
" <td>3305</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1329</th>\n",
" <td>ID00417637202310901214011</td>\n",
" <td>66</td>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" <td>3357</td>\n",
" <td>82.247158</td>\n",
" <td>13</td>\n",
" <td>3265</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1330</th>\n",
" <td>ID00417637202310901214011</td>\n",
" <td>66</td>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" <td>3357</td>\n",
" <td>82.247158</td>\n",
" <td>25</td>\n",
" <td>3364</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1331</th>\n",
" <td>ID00417637202310901214011</td>\n",
" <td>66</td>\n",
" <td>0.582536</td>\n",
" <td>0.097744</td>\n",
" <td>-0.631784</td>\n",
" <td>1.684379</td>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" <td>3357</td>\n",
" <td>82.247158</td>\n",
" <td>37</td>\n",
" <td>3240</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1332</th>\n",
" <td>ID00417637202310901214011</td>\n",
" <td>66</td>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" <td>3357</td>\n",
" <td>82.247158</td>\n",
" <td>53</td>\n",
" <td>3303</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>1333 rows × 8 columns</p>\n",
"</div>"
],
"text/plain": [
" PatientID Age Sex SmokingStatus First_FVC \\\n",
"0 ID00007637202177411956430 79 1 1 2315 \n",
"1 ID00007637202177411956430 79 1 1 2315 \n",
"2 ID00007637202177411956430 79 1 1 2315 \n",
"3 ID00007637202177411956430 79 1 1 2315 \n",
"4 ID00007637202177411956430 79 1 1 2315 \n",
"... ... ... ... ... ... \n",
"1328 ID00417637202310901214011 66 1 2 3357 \n",
"1329 ID00417637202310901214011 66 1 2 3357 \n",
"1330 ID00417637202310901214011 66 1 2 3357 \n",
"1331 ID00417637202310901214011 66 1 2 3357 \n",
"1332 ID00417637202310901214011 66 1 2 3357 \n",
"\n",
" First_Percent Delta_week Target_FVC \n",
"0 58.253649 9 2214 \n",
"1 58.253649 11 2061 \n",
"2 58.253649 13 2144 \n",
"3 58.253649 15 2069 \n",
"4 58.253649 21 2101 \n",
"... ... ... ... \n",
"1328 82.247158 7 3305 \n",
"1329 82.247158 13 3265 \n",
"1330 82.247158 25 3364 \n",
"1331 82.247158 37 3240 \n",
"1332 82.247158 53 3303 \n",
" First_Percent Delta_week First_FVC_scaled Age_scaled Sex_le \\\n",
"0 0.582536 0.067669 -0.631784 1.684379 1 \n",
"1 0.582536 0.082707 -0.631784 1.684379 1 \n",
"2 0.582536 0.097744 -0.631784 1.684379 1 \n",
"\n",
"[1333 rows x 8 columns]"
" SmokingStatus_le \n",
"0 1 \n",
"1 1 \n",
"2 1 "
]
},
"execution_count": 13,
"execution_count": 23,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"train