import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, f1_score
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from lightgbm import LGBMClassifier
from sklearn.svm import SVC
# Notebook-wide setup: silence library warnings, select the seaborn theme,
# and mount Google Drive so the dataset path used below resolves.
warnings.filterwarnings(action='ignore')
sns.set_style(style='darkgrid')
from google.colab import drive
drive.mount('/content/drive')
Mounted at /content/drive
# Read the stroke dataset CSV from the mounted Drive into `df`,
# then display its (rows, columns) shape.
df = pd.read_csv(
    filepath_or_buffer='/content/drive/MyDrive/data/healthcare-dataset-stroke-data.csv',
)
df.shape
(5110, 12)
# Preview the first rows of the raw dataset.
df.head()
id | gender | age | hypertension | heart_disease | ever_married | work_type | Residence_type | avg_glucose_level | bmi | smoking_status | stroke | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 9046 | Male | 67.0 | 0 | 1 | Yes | Private | Urban | 228.69 | 36.6 | formerly smoked | 1 |
1 | 51676 | Female | 61.0 | 0 | 0 | Yes | Self-employed | Rural | 202.21 | NaN | never smoked | 1 |
2 | 31112 | Male | 80.0 | 0 | 1 | Yes | Private | Rural | 105.92 | 32.5 | never smoked | 1 |
3 | 60182 | Female | 49.0 | 0 | 0 | Yes | Private | Urban | 171.23 | 34.4 | smokes | 1 |
4 | 1665 | Female | 79.0 | 1 | 0 | Yes | Self-employed | Rural | 174.12 | 24.0 | never smoked | 1 |
# Summary statistics (count, mean, std, quartiles) for the numeric columns.
df.describe()
id | age | hypertension | heart_disease | avg_glucose_level | bmi | stroke | |
---|---|---|---|---|---|---|---|
count | 5110.000000 | 5110.000000 | 5110.000000 | 5110.000000 | 5110.000000 | 4909.000000 | 5110.000000 |
mean | 36517.829354 | 43.226614 | 0.097456 | 0.054012 | 106.147677 | 28.893237 | 0.048728 |
std | 21161.721625 | 22.612647 | 0.296607 | 0.226063 | 45.283560 | 7.854067 | 0.215320 |
min | 67.000000 | 0.080000 | 0.000000 | 0.000000 | 55.120000 | 10.300000 | 0.000000 |
25% | 17741.250000 | 25.000000 | 0.000000 | 0.000000 | 77.245000 | 23.500000 | 0.000000 |
50% | 36932.000000 | 45.000000 | 0.000000 | 0.000000 | 91.885000 | 28.100000 | 0.000000 |
75% | 54682.000000 | 61.000000 | 0.000000 | 0.000000 | 114.090000 | 33.100000 | 0.000000 |
max | 72940.000000 | 82.000000 | 1.000000 | 1.000000 | 271.740000 | 97.600000 | 1.000000 |
# List the column names as loaded; note the mixed-case 'Residence_type'.
df.columns
Index(['id', 'gender', 'age', 'hypertension', 'heart_disease', 'ever_married', 'work_type', 'Residence_type', 'avg_glucose_level', 'bmi', 'smoking_status', 'stroke'], dtype='object')
# Normalise every column name to lower case (e.g. 'Residence_type'),
# then count the missing values in each column.
df.columns = [name.lower() for name in df.columns]
df.isnull().sum()
id 0 gender 0 age 0 hypertension 0 heart_disease 0 ever_married 0 work_type 0 residence_type 0 avg_glucose_level 0 bmi 201 smoking_status 0 stroke 0 dtype: int64
# Impute missing BMI values with the column mean.
# Only `bmi` is targeted: a frame-wide fillna would also write the BMI mean
# into any other column that happened to contain NaN.
df['bmi'] = df['bmi'].fillna(df['bmi'].mean())
# Re-check that no missing values remain.
df.isna().sum()
id 0 gender 0 age 0 hypertension 0 heart_disease 0 ever_married 0 work_type 0 residence_type 0 avg_glucose_level 0 bmi 0 smoking_status 0 stroke 0 dtype: int64
# Replace the 'Unknown' smoking status with the most frequent category.
# The result is assigned back explicitly: calling replace(..., inplace=True)
# on a column selection is chained assignment and is not guaranteed to
# modify `df` (it is a no-op under pandas copy-on-write).
df['smoking_status'] = df['smoking_status'].replace(
    'Unknown', df['smoking_status'].mode()[0]
)

# The row identifier carries no predictive information.
df = df.drop(columns='id')

# Drop the 'Other' gender rows; .copy() makes the filtered frame independent
# so later column assignments do not act on a view of the original.
df = df[df['gender'] != 'Other'].copy()

# Columns with more than five distinct values are treated as numeric,
# everything else (minus the target) as categorical.
numeric_data = df.loc[:, df.nunique() > 5]
cols = [col for col in df.columns if col not in numeric_data]
categorical_data = df[cols].drop('stroke', axis=1)

print(f'Numerical data: {list(numeric_data)}')
print(f'Categorical data: {list(categorical_data)}')
Numerical data: ['age', 'avg_glucose_level', 'bmi'] Categorical data: ['gender', 'hypertension', 'heart_disease', 'ever_married', 'work_type', 'residence_type', 'smoking_status']
# Correlation heatmap of the numeric columns.
# At this point `df` still holds string columns (gender, work_type, ...);
# calling DataFrame.corr() on them raises on pandas >= 2.0, so restrict the
# frame to numeric dtypes first (this works on older pandas as well).
sns.heatmap(df.select_dtypes(include='number').corr(), cmap='Greens', annot=True);
# Count of records per smoking status, split by stroke outcome.
plt.figure(figsize=(10, 6))
ax = sns.countplot(x='smoking_status', data=df, palette='rainbow', hue='stroke')
plt.title("Count of people in each Smoking Group, Separated by Stroke")
# Write each bar's height just above it (loop body indentation restored).
for p in ax.patches:
    ax.annotate('{}'.format(p.get_height()), (p.get_x()+0.15, p.get_height()+50))
# Bar plot of the mean of `stroke` per work type, one panel per residence type.
g = sns.catplot(
    data=df,
    kind='bar',
    x='work_type',
    y='stroke',
    col='residence_type',
    palette='magma',
    saturation=2.5,
)
g.set_axis_labels("Work Type", "Stroke Rate")
g.set_titles("{col_name}")
g.set(ylim=(0, 0.15))
# Final figure size: 10 wide by 2 high (inches).
g.fig.set_size_inches(10, 2)
# Standardise the numeric columns of `df` in place with StandardScaler.
# NOTE(review): the scaler is fitted on the full dataset before the
# train/test split further down, so test rows influence the scaling
# statistics — confirm whether that is acceptable.
num_cols = list(numeric_data.columns)
sc = StandardScaler()
sc.fit(df[num_cols])
df[num_cols] = sc.transform(df[num_cols])
df.head()
gender | age | hypertension | heart_disease | ever_married | work_type | residence_type | avg_glucose_level | bmi | smoking_status | stroke | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | Male | 1.051242 | 0 | 1 | Yes | Private | Urban | 2.706450 | 1.001041 | formerly smoked | 1 |
1 | Female | 0.785889 | 0 | 0 | Yes | Self-employed | Rural | 2.121652 | -0.000165 | never smoked | 1 |
2 | Male | 1.626174 | 0 | 1 | Yes | Private | Rural | -0.004867 | 0.468399 | never smoked | 1 |
3 | Female | 0.255182 | 0 | 0 | Yes | Private | Urban | 1.437473 | 0.715233 | smokes | 1 |
4 | Female | 1.581949 | 1 | 0 | Yes | Self-employed | Rural | 1.501297 | -0.635858 | never smoked | 1 |
# Integer-encode every string-typed column.
# The single encoder is re-fitted per column, so after the loop `le` only
# remembers the classes of the last column encoded.
le = LabelEncoder()
object_col = [col for col in df.columns if df[col].dtype == 'object']
for col in object_col:
    # Loop body indentation restored; without it the cell is a syntax error.
    df[col] = le.fit_transform(df[col])
df.head()
gender | age | hypertension | heart_disease | ever_married | work_type | residence_type | avg_glucose_level | bmi | smoking_status | stroke | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 1.051242 | 0 | 1 | 1 | 2 | 1 | 2.706450 | 1.001041 | 0 | 1 |
1 | 0 | 0.785889 | 0 | 0 | 1 | 3 | 0 | 2.121652 | -0.000165 | 1 | 1 |
2 | 1 | 1.626174 | 0 | 1 | 1 | 2 | 0 | -0.004867 | 0.468399 | 1 | 1 |
3 | 0 | 0.255182 | 0 | 0 | 1 | 2 | 1 | 1.437473 | 0.715233 | 2 | 1 |
4 | 0 | 1.581949 | 1 | 0 | 1 | 3 | 0 | 1.501297 | -0.635858 | 1 | 1 |
# Separate features from the target, hold out 20% of the rows for testing,
# then oversample the training split only with SMOTE.
training_data = df.copy()
x = training_data.drop(['stroke'], axis=1)
# Take the target from the same copy the features come from.
y = training_data['stroke']
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)
# Seed SMOTE as well so the synthetic samples are reproducible, matching the
# seeded split above.
sm = SMOTE(random_state=42)
x_train, y_train = sm.fit_resample(x_train, y_train)
4. Modelling
# Logistic regression: fit on the training data, score on the held-out test split.
lr = LogisticRegression()
lr.fit(x_train, y_train)
lr_pred = lr.predict(x_test)
# scikit-learn metrics expect (y_true, y_pred) in that order.
lr_acc = accuracy_score(y_test, lr_pred)
lr_f1 = f1_score(y_test, lr_pred)
lr_acc
0.7553816046966731
# Decision tree: fit on the training data, score on the held-out test split.
# Seeded so repeated runs give the same tree, like the other seeded models here.
decision_tree = DecisionTreeClassifier(random_state=42)
decision_tree.fit(x_train, y_train)
dt_pred = decision_tree.predict(x_test)
# scikit-learn metrics expect (y_true, y_pred) in that order.
dt_acc = accuracy_score(y_test, dt_pred)
dt_acc
0.8522504892367906
# Random forest (25 trees): fit on the training data, score on the test split.
# Seeded so repeated runs give the same forest, like the other seeded models here.
rf = RandomForestClassifier(n_estimators=25, random_state=42)
rf.fit(x_train, y_train)
rf_pred = rf.predict(x_test)
# scikit-learn metrics expect (y_true, y_pred) in that order.
rf_acc = accuracy_score(y_test, rf_pred)
rf_acc
0.8718199608610567
# XGBoost with default hyper-parameters: fit on the training data, score on the test split.
xgb = XGBClassifier()
xgb.fit(x_train, y_train)
xgb_pred = xgb.predict(x_test)
# scikit-learn metrics expect (y_true, y_pred) in that order.
xgb_acc = accuracy_score(y_test, xgb_pred)
xgb_acc
0.7818003913894325
# k-nearest neighbours with default settings: fit on the training data, score on the test split.
knn = KNeighborsClassifier()
knn.fit(x_train, y_train)
knn_pred = knn.predict(x_test)
# scikit-learn metrics expect (y_true, y_pred) in that order.
knn_acc = accuracy_score(y_test, knn_pred)
knn_acc
0.7720156555772995
# LightGBM (seeded): fit on the training data, score on the test split.
lgbm = LGBMClassifier(random_state=42)
lgbm.fit(x_train, y_train)
lgbm_pred = lgbm.predict(x_test)
# scikit-learn metrics expect (y_true, y_pred) in that order.
lgbm_acc = accuracy_score(y_test, lgbm_pred)
lgbm_acc
0.9031311154598826
# Support vector classifier (seeded): fit on the training data, score on the test split.
svm = SVC(random_state=42)
svm.fit(x_train, y_train)
svm_pred = svm.predict(x_test)
# scikit-learn metrics expect (y_true, y_pred) in that order.
svm_acc = accuracy_score(y_test, svm_pred)
svm_acc
0.7563600782778865
# Bar chart comparing the test accuracy of every model trained above.
models_names = ["LogisticRegression",'DecisionTreeClassifier','RandomForestClassifier','XGBClassifier',
                'KNeighborsClassifier','LGBMClassifier','SVC']
models_acc = [lr_acc, dt_acc, rf_acc, xgb_acc, knn_acc, lgbm_acc, svm_acc]

plt.rcParams['figure.figsize'] = 12, 6
ax = sns.barplot(x=models_names, y=models_acc, palette="mako", saturation=1.5)
plt.xlabel('Classifier Models' )
plt.ylabel('Accuracy')
plt.title('Accuracy of different Classifier Models')
plt.xticks(fontsize=10, horizontalalignment='center', rotation=8)
plt.yticks(fontsize=10)

# Label each bar with its accuracy, centred slightly above the bar top.
# Dedicated names are used for the bar origin: unpacking into `x, y` would
# overwrite the feature matrix `x` and target `y` defined earlier.
for bar in ax.patches:
    width, height = bar.get_width(), bar.get_height()
    bar_x, bar_y = bar.get_xy()
    ax.annotate(f'{round(height,5)}', (bar_x + width/2, bar_y + height*1.02), ha='center', fontsize=10)
plt.show()
from sklearn.model_selection import GridSearchCV

# Base estimator for the search. `n_jobs` / `random_state` are the
# scikit-learn-style parameter names; `nthread` / `seed` are deprecated
# aliases in the XGBoost sklearn wrapper.
estimator = XGBClassifier(
    objective='binary:logistic',
    n_jobs=1,
    random_state=42
)

# 8 depths x 4 tree counts x 3 learning rates = 96 candidates.
parameters = {
    'max_depth': range(2, 10, 1),
    'n_estimators': range(60, 220, 40),
    'learning_rate': [0.1, 0.01, 0.05]
}

# 10-fold cross-validated search scored by ROC AUC, run on 4 worker processes.
grid_search = GridSearchCV(
    estimator=estimator,
    param_grid=parameters,
    scoring='roc_auc',
    n_jobs=4,
    cv=10,
    verbose=3
)

# NOTE(review): x_train / y_train were already oversampled with SMOTE before
# this point, so each validation fold contains synthetic samples derived from
# rows in the training folds. The cross-validated score is therefore likely
# optimistic compared with the untouched test split — confirm before relying
# on it.
grid_search.fit(x_train, y_train)
Fitting 10 folds for each of 96 candidates, totalling 960 fits
GridSearchCV(cv=10, estimator=XGBClassifier(nthread=1, seed=42), n_jobs=4, param_grid={'learning_rate': [0.1, 0.01, 0.05], 'max_depth': range(2, 10), 'n_estimators': range(60, 220, 40)}, scoring='roc_auc', verbose=3)
# Show the best estimator found by the grid search.
grid_search.best_estimator_
XGBClassifier(max_depth=9, n_estimators=180, nthread=1, seed=42)
# Cross-validated ROC AUC (the search's scoring metric) of the best candidate.
grid_search.best_score_
0.9943300460223539
# xgb_tuned=XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
# colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
# gamma=0, gpu_id=0, importance_type=None,
# interaction_constraints='', learning_rate=0.1, max_delta_step=0,
# max_depth=9, min_child_weight=1,
# monotone_constraints='()', n_estimators=160, n_jobs=1, nthread=1,
# num_parallel_tree=1, predictor='auto', random_state=42,
# reg_alpha=0, reg_lambda=1, scale_pos_weight=0.2, seed=42,
# subsample=1, tree_method='exact', validate_parameters=1,
# verbosity= 0)
# xgb_tuned.fit(x_train, y_train)
# xgb_tpred = xgb_tuned.predict(x_test)
# xgb_tacc = accuracy_score(xgb_tpred, y_test)
# xgb_tacc
# Refit XGBoost with the hyper-parameters selected by the grid search.
# learning_rate is spelled out: the search picked 0.1 from [0.1, 0.01, 0.05],
# but the estimator repr omits values equal to the library default, and that
# default is not the same in every XGBoost release.
# n_jobs / random_state replace the deprecated nthread / seed aliases.
xgb_tuned = XGBClassifier(
    learning_rate=0.1,
    max_depth=9,
    n_estimators=180,
    n_jobs=1,
    random_state=42,
)
xgb_tuned.fit(x_train, y_train)
xgb_tpred = xgb_tuned.predict(x_test)
# scikit-learn metrics expect (y_true, y_pred) in that order.
xgb_tacc = accuracy_score(y_test, xgb_tpred)
xgb_tacc
0.9148727984344422
from sklearn.metrics import (
    accuracy_score, confusion_matrix, roc_auc_score, precision_score,
    recall_score, f1_score, roc_curve, auc, classification_report,
)
try:
    from sklearn.metrics import plot_roc_curve
except ImportError:
    # plot_roc_curve was removed in scikit-learn 1.2 (RocCurveDisplay is its
    # replacement). Importing it separately keeps the metrics above available
    # on newer releases; the name stays bound so a later reference fails
    # visibly at the point of use instead of here.
    plot_roc_curve = None

# Evaluate the tuned XGBoost model on the held-out test split.
cm = confusion_matrix(y_test, xgb_tpred)
# Probability of the positive class (column 1), needed for ROC AUC.
xgb_tprob = xgb_tuned.predict_proba(x_test)[:, 1]
print(classification_report(y_test, xgb_tpred))
print('ROC AUC score: ',roc_auc_score(y_test, xgb_tprob))
print('Accuracy Score: ',accuracy_score(y_test, xgb_tpred))
precision recall f1-score support 0 0.94 0.97 0.96 960 1 0.16 0.10 0.12 62 accuracy 0.91 1022 macro avg 0.55 0.53 0.54 1022 weighted avg 0.90 0.91 0.90 1022 ROC AUC score: 0.7774025537634408 Accuracy Score: 0.9148727984344422
# Draw the confusion matrix of the tuned model as an annotated heatmap:
# rows are the true classes, columns the predicted classes.
plt.figure(figsize=(10, 6))
plt.title('Confusion Matrix', fontsize=14)
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Greens',
    annot_kws={'fontsize': 12},
    xticklabels=['Predicted no stroke', 'Predicted stroke'],
    yticklabels=['No stroke', 'Stroke'],
)
# Keep the row labels horizontal.
plt.yticks(rotation=0)
plt.show()