Commit 07570917 authored by Billy Amélie
Browse files

replace old regression

parent ac7a5aab
This diff is collapsed.
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"import pandas as pd\n",
"import os\n",
"import logging\n",
"logging.basicConfig(format='%(levelname)s:%(message)s', level=logging.INFO)"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"# Later cells import the `preprocessing` package relative to the working\n",
"# directory, so step up one level. The guard makes the cell idempotent:\n",
"# re-running it no longer climbs a second level up the tree.\n",
"if not os.path.isdir('preprocessing'):\n",
"    os.chdir('../')"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Patient</th>\n",
" <th>Weeks</th>\n",
" <th>FVC</th>\n",
" <th>Percent</th>\n",
" <th>Age</th>\n",
" <th>Sex</th>\n",
" <th>SmokingStatus</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>ID00007637202177411956430</td>\n",
" <td>-4</td>\n",
" <td>2315</td>\n",
" <td>58.253649</td>\n",
" <td>79</td>\n",
" <td>Male</td>\n",
" <td>Ex-smoker</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>ID00007637202177411956430</td>\n",
" <td>5</td>\n",
" <td>2214</td>\n",
" <td>55.712129</td>\n",
" <td>79</td>\n",
" <td>Male</td>\n",
" <td>Ex-smoker</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>ID00007637202177411956430</td>\n",
" <td>7</td>\n",
" <td>2061</td>\n",
" <td>51.862104</td>\n",
" <td>79</td>\n",
" <td>Male</td>\n",
" <td>Ex-smoker</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>ID00007637202177411956430</td>\n",
" <td>9</td>\n",
" <td>2144</td>\n",
" <td>53.950679</td>\n",
" <td>79</td>\n",
" <td>Male</td>\n",
" <td>Ex-smoker</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>ID00007637202177411956430</td>\n",
" <td>11</td>\n",
" <td>2069</td>\n",
" <td>52.063412</td>\n",
" <td>79</td>\n",
" <td>Male</td>\n",
" <td>Ex-smoker</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Patient Weeks FVC Percent Age Sex SmokingStatus\n",
"0 ID00007637202177411956430 -4 2315 58.253649 79 Male Ex-smoker\n",
"1 ID00007637202177411956430 5 2214 55.712129 79 Male Ex-smoker\n",
"2 ID00007637202177411956430 7 2061 51.862104 79 Male Ex-smoker\n",
"3 ID00007637202177411956430 9 2144 53.950679 79 Male Ex-smoker\n",
"4 ID00007637202177411956430 11 2069 52.063412 79 Male Ex-smoker"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from preprocessing.read_load_data import read_data\n",
"\n",
"input_directory='../osic-pulmonary-fibrosis-progression'\n",
"train_df, test_df, sample_df = read_data(input_directory)\n",
"train_df.head()"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"# Unique patient IDs of the training frame as a plain list (same form the\n",
"# next cell produces with .tolist()), and of the test frame as returned by\n",
"# Series.unique().\n",
"patients_train_ids = train_df.Patient.unique().tolist()\n",
"patient_test_list = test_df.Patient.unique()"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"df = pd.read_csv(f'{input_directory}/train.csv')\n",
"patients_train_ids= df.Patient.unique().tolist()"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.model_selection import train_test_split\n",
"\n",
"# Split the list of patient IDs (not the individual rows) 80/20 with a\n",
"# fixed seed so the partition is reproducible.\n",
"split = train_test_split(\n",
"    patients_train_ids,\n",
"    test_size=0.2,\n",
"    random_state=42,\n",
")\n",
"trainPatient, testPatient = split"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
"# Rows belonging to each patient group; .copy() gives frames that are\n",
"# independent of `df`.\n",
"df_train = df.loc[df['Patient'].isin(trainPatient)].copy()\n",
"df_test = df.loc[df['Patient'].isin(testPatient)].copy()"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"INFO:NumExpr defaulting to 8 threads.\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"(1093, 8) (280, 8)\n"
]
}
],
"source": [
"from preprocessing.read_load_data import create_dataframe\n",
"\n",
"# Run the same helper over the train rows first, then the test rows.\n",
"trainAttrX, testAttrX = (create_dataframe(part) for part in (df_train, df_test))\n",
"print(trainAttrX.shape, testAttrX.shape)"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>PatientID</th>\n",
" <th>Age</th>\n",
" <th>Sex</th>\n",
" <th>SmokingStatus</th>\n",
" <th>First_FVC</th>\n",
" <th>First_Percent</th>\n",
" <th>Delta_week</th>\n",
" <th>Target_FVC</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>ID00007637202177411956430</td>\n",
" <td>79</td>\n",
" <td>Male</td>\n",
" <td>Ex-smoker</td>\n",
" <td>2315</td>\n",
" <td>58.253649</td>\n",
" <td>9</td>\n",
" <td>2214</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>ID00007637202177411956430</td>\n",
" <td>79</td>\n",
" <td>Male</td>\n",
" <td>Ex-smoker</td>\n",
" <td>2315</td>\n",
" <td>58.253649</td>\n",
" <td>11</td>\n",
" <td>2061</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>ID00007637202177411956430</td>\n",
" <td>79</td>\n",
" <td>Male</td>\n",
" <td>Ex-smoker</td>\n",
" <td>2315</td>\n",
" <td>58.253649</td>\n",
" <td>13</td>\n",
" <td>2144</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" PatientID Age Sex SmokingStatus First_FVC \\\n",
"0 ID00007637202177411956430 79 Male Ex-smoker 2315 \n",
"1 ID00007637202177411956430 79 Male Ex-smoker 2315 \n",
"2 ID00007637202177411956430 79 Male Ex-smoker 2315 \n",
"\n",
" First_Percent Delta_week Target_FVC \n",
"0 58.253649 9 2214 \n",
"1 58.253649 11 2061 \n",
"2 58.253649 13 2144 "
]
},
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"trainAttrX.head()"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [],
"source": [
"from preprocessing.scale_data import scale_variable\n",
"\n",
"# Each call returns a scaler object followed by the updated train/test frames.\n",
"# `sc` (target scaler) is kept because a later cell calls sc.inverse_transform.\n",
"sc, trainAttrX, testAttrX = scale_variable(trainAttrX, testAttrX, 'Target_FVC')\n",
"sc1, trainAttrX, testAttrX = scale_variable(trainAttrX, testAttrX, 'First_FVC')\n",
"sc2, trainAttrX, testAttrX = scale_variable(trainAttrX, testAttrX, 'Age')\n",
"\n",
"# Regression targets: the scaled FVC column of each frame.\n",
"trainY = trainAttrX['Target_FVC_scaled']\n",
"testY = testAttrX['Target_FVC_scaled']"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [],
"source": [
"from preprocessing.scale_data import encode_variable\n",
"\n",
"trainAttrX, testAttrX = encode_variable(trainAttrX, testAttrX, 'Sex')\n",
"trainAttrX, testAttrX = encode_variable(trainAttrX, testAttrX, 'SmokingStatus')\n",
"\n",
"# Columns removed before modelling: the un-encoded / un-scaled originals,\n",
"# the patient identifier and both forms of the target.\n",
"COLUMNS_TO_DROP = ['Sex', 'SmokingStatus', 'Target_FVC', 'Target_FVC_scaled',\n",
"                   'PatientID', 'First_FVC', 'Age']\n",
"# Divisor applied to Delta_week.\n",
"# NOTE(review): presumably the largest week gap in the data - confirm.\n",
"DELTA_WEEK_SCALE = 133\n",
"\n",
"\n",
"def _finalise_features(frame):\n",
"    \"\"\"Return a new frame holding only the model inputs.\n",
"\n",
"    Drops COLUMNS_TO_DROP, divides First_Percent by 100 and Delta_week by\n",
"    DELTA_WEEK_SCALE. The input frame is left untouched, and `assign`\n",
"    creates the rescaled columns as new float columns instead of writing\n",
"    floats into an integer column through `.loc`, which recent pandas\n",
"    versions deprecate.\n",
"    \"\"\"\n",
"    return frame.drop(columns=COLUMNS_TO_DROP).assign(\n",
"        First_Percent=lambda d: d['First_Percent'] / 100,\n",
"        Delta_week=lambda d: d['Delta_week'] / DELTA_WEEK_SCALE,\n",
"    )\n",
"\n",
"\n",
"trainAttrX = _finalise_features(trainAttrX)\n",
"testAttrX = _finalise_features(testAttrX)"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>First_Percent</th>\n",
" <th>Delta_week</th>\n",
" <th>First_FVC_scaled</th>\n",
" <th>Age_scaled</th>\n",
" <th>Sex_le</th>\n",
" <th>SmokingStatus_le</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>0.582536</td>\n",
" <td>0.067669</td>\n",
" <td>-0.631784</td>\n",
" <td>1.684379</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>0.582536</td>\n",
" <td>0.082707</td>\n",
" <td>-0.631784</td>\n",
" <td>1.684379</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>0.582536</td>\n",
" <td>0.097744</td>\n",
" <td>-0.631784</td>\n",
" <td>1.684379</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" First_Percent Delta_week First_FVC_scaled Age_scaled Sex_le \\\n",
"0 0.582536 0.067669 -0.631784 1.684379 1 \n",
"1 0.582536 0.082707 -0.631784 1.684379 1 \n",
"2 0.582536 0.097744 -0.631784 1.684379 1 \n",
"\n",
" SmokingStatus_le \n",
"0 1 \n",
"1 1 \n",
"2 1 "
]
},
"execution_count": 23,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"trainAttrX.head(3)"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.linear_model import LinearRegression\n",
"from sklearn.neighbors import KNeighborsRegressor\n",
"from sklearn.ensemble import RandomForestRegressor"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Training accuracy : 0.9232693163187786\n",
"Test accuracy : 0.9251618956848909\n"
]
}
],
"source": [
"from processing.models import create_regression\n",
"\n",
"model = LinearRegression()\n",
"create_regression(trainAttrX, trainY, testAttrX, testY, model)"
]
},
{
"cell_type": "code",
"execution_count": 33,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Training accuracy : 0.967574654530503\n",
"Test accuracy : 0.8966102524853241\n"
]
}
],
"source": [
"regr = RandomForestRegressor(max_depth=6, random_state=0)\n",
"create_regression(trainAttrX, trainY, testAttrX, testY, regr)"
]
},
{
"cell_type": "code",
"execution_count": 34,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Training accuracy : 0.9738754734561159\n",
"Test accuracy : 0.7923044787788517\n"
]
}
],
"source": [
"neigh = KNeighborsRegressor(n_neighbors=4)\n",
"create_regression(trainAttrX, trainY, testAttrX, testY, neigh)"
]
},
{
"cell_type": "code",
"execution_count": 37,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"MSE : 0.06708833836396759\n",
"RMSE : 0.2590141663383831\n"
]
}
],
"source": [
"import numpy as np\n",
"\n",
"# Error of the linear model measured on the scaled FVC target.\n",
"\n",
"preds = model.predict(testAttrX)\n",
"diff = preds - testY\n",
"MSE = np.mean(np.square(diff))\n",
"RMSE = np.sqrt(MSE)\n",
"print(\"MSE : \", MSE)\n",
"print(\"RMSE : \", RMSE)"
]
},
{
"cell_type": "code",
"execution_count": 39,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"MSE : 46665.02597617758\n",
"RMSE : 216.0208924529699\n"
]
}
],
"source": [
"# unscaled FVC\n",
"\n",
"# Map predictions and targets back through the target scaler `sc`.\n",
"# Current scikit-learn scalers reject 1-D input to inverse_transform, so\n",
"# pass single-column 2-D arrays and flatten the result again.\n",
"# NOTE(review): assumes `sc` is a scikit-learn style scaler fitted on the\n",
"# single Target_FVC column - confirm in scale_variable.\n",
"FVC_preds = sc.inverse_transform(np.asarray(preds).reshape(-1, 1)).ravel()\n",
"FVC_true = sc.inverse_transform(np.asarray(testY).reshape(-1, 1)).ravel()\n",
"diff = FVC_preds - FVC_true\n",
"MSE = np.mean(diff*diff)\n",
"RMSE = np.sqrt(MSE)\n",
"print(\"MSE : \", MSE)\n",
"print(\"RMSE : \", RMSE)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.3"
}
},
"nbformat": 4,
"nbformat_minor": 4
}
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment