In [ ]:
import numpy as np 
import pandas as pd 
In [ ]:
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
import numpy as np 
import pandas as pd 
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, f1_score
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from lightgbm import LGBMClassifier
from sklearn.svm import SVC
sns.set_style('darkgrid')
warnings.filterwarnings('ignore')
In [ ]:
from google.colab import drive
drive.mount('/content/drive')
Mounted at /content/drive
In [ ]:
df=pd.read_csv('/content/drive/MyDrive/data/healthcare-dataset-stroke-data.csv')
In [ ]:
df.shape
Out[ ]:
(5110, 12)
In [ ]:
df.head()
Out[ ]:
id gender age hypertension heart_disease ever_married work_type Residence_type avg_glucose_level bmi smoking_status stroke
0 9046 Male 67.0 0 1 Yes Private Urban 228.69 36.6 formerly smoked 1
1 51676 Female 61.0 0 0 Yes Self-employed Rural 202.21 NaN never smoked 1
2 31112 Male 80.0 0 1 Yes Private Rural 105.92 32.5 never smoked 1
3 60182 Female 49.0 0 0 Yes Private Urban 171.23 34.4 smokes 1
4 1665 Female 79.0 1 0 Yes Self-employed Rural 174.12 24.0 never smoked 1
In [ ]:
df.describe()
Out[ ]:
id age hypertension heart_disease avg_glucose_level bmi stroke
count 5110.000000 5110.000000 5110.000000 5110.000000 5110.000000 4909.000000 5110.000000
mean 36517.829354 43.226614 0.097456 0.054012 106.147677 28.893237 0.048728
std 21161.721625 22.612647 0.296607 0.226063 45.283560 7.854067 0.215320
min 67.000000 0.080000 0.000000 0.000000 55.120000 10.300000 0.000000
25% 17741.250000 25.000000 0.000000 0.000000 77.245000 23.500000 0.000000
50% 36932.000000 45.000000 0.000000 0.000000 91.885000 28.100000 0.000000
75% 54682.000000 61.000000 0.000000 0.000000 114.090000 33.100000 0.000000
max 72940.000000 82.000000 1.000000 1.000000 271.740000 97.600000 1.000000
In [ ]:
df.columns
Out[ ]:
Index(['id', 'gender', 'age', 'hypertension', 'heart_disease', 'ever_married',
       'work_type', 'Residence_type', 'avg_glucose_level', 'bmi',
       'smoking_status', 'stroke'],
      dtype='object')
In [ ]:
df.columns=df.columns.str.lower()

1.3 Null Values Handling¶

In [ ]:
df.isna().sum()
Out[ ]:
id                     0
gender                 0
age                    0
hypertension           0
heart_disease          0
ever_married           0
work_type              0
residence_type         0
avg_glucose_level      0
bmi                  201
smoking_status         0
stroke                 0
dtype: int64
In [ ]:
df=df.fillna(np.mean(df['bmi']))
df.isna().sum()
Out[ ]:
id                   0
gender               0
age                  0
hypertension         0
heart_disease        0
ever_married         0
work_type            0
residence_type       0
avg_glucose_level    0
bmi                  0
smoking_status       0
stroke               0
dtype: int64
In [ ]:
df['smoking_status'].replace('Unknown', df['smoking_status'].mode()[0], inplace=True)
In [ ]:
df.drop('id', axis=1, inplace=True)
In [ ]:
df = df[df['gender'] != 'Other']
In [ ]:
numeric_data=df.loc[:,df.nunique() > 5]
cols = [col for col in df.columns if col not in numeric_data]

categorical_data=df[cols].drop('stroke',axis=1)
numeric_data=pd.DataFrame(numeric_data)
categorical_data=pd.DataFrame(categorical_data)
In [ ]:
print(f'Numerical data: {list(numeric_data)}')
print(f'Categorical data: {list(categorical_data)}')
Numerical data: ['age', 'avg_glucose_level', 'bmi']
Categorical data: ['gender', 'hypertension', 'heart_disease', 'ever_married', 'work_type', 'residence_type', 'smoking_status']
In [ ]:
sns.heatmap(df.corr(), cmap = 'Greens',annot=True);
In [ ]:
plt.figure(figsize=(10,6))
ax=sns.countplot(x='smoking_status',data=df, palette='rainbow',hue='stroke')
plt.title("Count of people in each Smoking Group, Separated by Stroke")
for p in ax.patches:
        ax.annotate('{}'.format(p.get_height()), (p.get_x()+0.15, p.get_height()+50))
In [ ]:
g = sns.catplot(x='work_type',y='stroke', col = 'residence_type', data=df, kind='bar', palette='magma', saturation =2.5)
(g.set_axis_labels("Work Type", "Stroke Rate").set_titles("{col_name}").set(ylim=(0,0.15)))
g.fig.set_figwidth(10)
g.fig.set_figheight(2)
In [ ]:
num_cols=numeric_data.columns.to_list()
sc = StandardScaler()
df[num_cols] = sc.fit_transform(df[num_cols])
In [ ]:
df.head()
Out[ ]:
gender age hypertension heart_disease ever_married work_type residence_type avg_glucose_level bmi smoking_status stroke
0 Male 1.051242 0 1 Yes Private Urban 2.706450 1.001041 formerly smoked 1
1 Female 0.785889 0 0 Yes Self-employed Rural 2.121652 -0.000165 never smoked 1
2 Male 1.626174 0 1 Yes Private Rural -0.004867 0.468399 never smoked 1
3 Female 0.255182 0 0 Yes Private Urban 1.437473 0.715233 smokes 1
4 Female 1.581949 1 0 Yes Self-employed Rural 1.501297 -0.635858 never smoked 1
In [ ]:
le = LabelEncoder()
object_col = [col for col in df.columns if df[col].dtype == 'object']
for col in object_col:
    df[col] = le.fit_transform(df[col])
In [ ]:
df.head()
Out[ ]:
gender age hypertension heart_disease ever_married work_type residence_type avg_glucose_level bmi smoking_status stroke
0 1 1.051242 0 1 1 2 1 2.706450 1.001041 0 1
1 0 0.785889 0 0 1 3 0 2.121652 -0.000165 1 1
2 1 1.626174 0 1 1 2 0 -0.004867 0.468399 1 1
3 0 0.255182 0 0 1 2 1 1.437473 0.715233 2 1
4 0 1.581949 1 0 1 3 0 1.501297 -0.635858 1 1
In [ ]:
training_data=df.copy()
x= training_data.drop(['stroke'],axis=1)
y= df['stroke']
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2,random_state=42)
In [ ]:
sm = SMOTE()
x_train, y_train = sm.fit_resample(x_train,y_train)

4. Modelling

Modelling¶

In [ ]:
lr= LogisticRegression()
lr.fit(x_train, y_train)
lr_pred = lr.predict(x_test)
lr_acc = accuracy_score(lr_pred, y_test)
lr_f1 = f1_score(lr_pred, y_test)
lr_acc
Out[ ]:
0.7553816046966731
In [ ]:
decision_tree = DecisionTreeClassifier()   
decision_tree.fit(x_train,y_train)
dt_pred = decision_tree.predict(x_test)
dt_acc = accuracy_score(dt_pred, y_test)
dt_acc
Out[ ]:
0.8522504892367906
In [ ]:
rf = RandomForestClassifier(n_estimators = 25)
rf.fit(x_train, y_train)
rf_pred = rf.predict(x_test)
rf_acc = accuracy_score(rf_pred, y_test)
rf_acc
Out[ ]:
0.8718199608610567
In [ ]:
xgb = XGBClassifier()
xgb.fit(x_train, y_train)
xgb_pred = xgb.predict(x_test)
xgb_acc = accuracy_score(xgb_pred, y_test)
xgb_acc
Out[ ]:
0.7818003913894325
In [ ]:
knn = KNeighborsClassifier()
knn.fit(x_train, y_train)
knn_pred = knn.predict(x_test)
knn_acc = accuracy_score(knn_pred, y_test)
knn_acc
Out[ ]:
0.7720156555772995
In [ ]:
lgbm = LGBMClassifier(random_state = 42)
lgbm.fit(x_train, y_train)
lgbm_pred = lgbm.predict(x_test)
lgbm_acc = accuracy_score(lgbm_pred, y_test)
lgbm_acc
Out[ ]:
0.9031311154598826
In [ ]:
svm=SVC(random_state=42)
svm.fit(x_train, y_train)
svm_pred = svm.predict(x_test)
svm_acc = accuracy_score(svm_pred, y_test)
svm_acc
Out[ ]:
0.7563600782778865
In [ ]:
models_names = ["LogisticRegression",'DecisionTreeClassifier','RandomForestClassifier','XGBClassifier',
                    'KNeighborsClassifier','LGBMClassifier','SVC']
models_acc=[lr_acc,dt_acc,rf_acc,xgb_acc,knn_acc,lgbm_acc,svm_acc]

plt.rcParams['figure.figsize']=12,6
ax = sns.barplot(x=models_names, y=models_acc, palette = "mako", saturation =1.5)
plt.xlabel('Classifier Models' )
plt.ylabel('Accuracy')
plt.title('Accuracy of different Classifier Models')
plt.xticks(fontsize = 10, horizontalalignment = 'center', rotation = 8)
plt.yticks(fontsize = 10)
for i in ax.patches:
    width, height = i.get_width(), i.get_height()
    x, y = i.get_xy() 
    ax.annotate(f'{round(height,5)}', (x + width/2, y + height*1.02), ha='center', fontsize = 10)
plt.show()
In [ ]:
from sklearn.model_selection import GridSearchCV
estimator = XGBClassifier(
    objective= 'binary:logistic',
    nthread=1,
    seed=42
)
parameters = {
    'max_depth': range (2, 10, 1),
    'n_estimators': range(60, 220, 40),
    'learning_rate': [0.1, 0.01, 0.05]
}
grid_search = GridSearchCV(
    estimator=estimator,
    param_grid=parameters,
    scoring = 'roc_auc',
    n_jobs = 4,
    cv = 10,
    verbose=3
)
grid_search.fit(x_train, y_train)
Fitting 10 folds for each of 96 candidates, totalling 960 fits
Out[ ]:
GridSearchCV(cv=10, estimator=XGBClassifier(nthread=1, seed=42), n_jobs=4,
             param_grid={'learning_rate': [0.1, 0.01, 0.05],
                         'max_depth': range(2, 10),
                         'n_estimators': range(60, 220, 40)},
             scoring='roc_auc', verbose=3)
In [ ]:
grid_search.best_estimator_
Out[ ]:
XGBClassifier(max_depth=9, n_estimators=180, nthread=1, seed=42)
In [ ]:
grid_search.best_score_
Out[ ]:
0.9943300460223539
In [ ]:
# xgb_tuned=XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
#               colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
#               gamma=0, gpu_id=0, importance_type=None,
#               interaction_constraints='', learning_rate=0.1, max_delta_step=0,
#               max_depth=9, min_child_weight=1,
#               monotone_constraints='()', n_estimators=160, n_jobs=1, nthread=1,
#               num_parallel_tree=1, predictor='auto', random_state=42,
#               reg_alpha=0, reg_lambda=1, scale_pos_weight=0.2, seed=42,
#               subsample=1, tree_method='exact', validate_parameters=1,
#               verbosity= 0)

# xgb_tuned.fit(x_train, y_train)
# xgb_tpred = xgb_tuned.predict(x_test)
# xgb_tacc = accuracy_score(xgb_tpred, y_test)
# xgb_tacc
In [ ]:
xgb_tuned = XGBClassifier(max_depth=9, n_estimators=180, nthread=1, seed=42)

xgb_tuned.fit(x_train, y_train)
xgb_tpred = xgb_tuned.predict(x_test)
xgb_tacc = accuracy_score(xgb_tpred, y_test)
xgb_tacc
Out[ ]:
0.9148727984344422
In [ ]:
from sklearn.metrics import accuracy_score, confusion_matrix, roc_auc_score, precision_score, recall_score, f1_score, roc_curve, plot_roc_curve, auc,classification_report
cm = confusion_matrix(y_test, xgb_tpred)
xgb_tprob = xgb_tuned.predict_proba(x_test)[:,1]
print(classification_report(y_test, xgb_tpred))
print('ROC AUC score: ',roc_auc_score(y_test, xgb_tprob))
print('Accuracy Score: ',accuracy_score(y_test, xgb_tpred))
              precision    recall  f1-score   support

           0       0.94      0.97      0.96       960
           1       0.16      0.10      0.12        62

    accuracy                           0.91      1022
   macro avg       0.55      0.53      0.54      1022
weighted avg       0.90      0.91      0.90      1022

ROC AUC score:  0.7774025537634408
Accuracy Score:  0.9148727984344422
In [ ]:
plt.figure(figsize = (10, 6))
plt.title('Confusion Matrix', fontsize=14)
sns.heatmap(cm, cmap = 'Greens', fmt = 'd',annot = True,annot_kws = {'fontsize': 12}, 
            yticklabels = ['No stroke', 'Stroke'], xticklabels = ['Predicted no stroke', 'Predicted stroke'])
plt.yticks(rotation = 0)
plt.show()