library(tidyverse) # Collection of R packages for data science
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.3 v purrr 0.3.4
## v tibble 3.1.0 v dplyr 1.0.5
## v tidyr 1.1.3 v stringr 1.4.0
## v readr 1.4.0 v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(naniar) # Data structures and functions for exploring and plotting missing values
library(caTools) # Utility functions, including sample.split() for train/test splitting
library(ggplot2) # Data visualisation using the Grammar of Graphics (already attached by tidyverse)
library(superheat) # Generating customizable heatmaps
library(scatterplot3d) # Plots a three-dimensional point cloud
library(ROCR) # Creating cutoff-parameterized 2D performance (ROC) curves
According to the World Health Organization (WHO), stroke is the second leading cause of death globally, responsible for approximately 11% of total deaths. This dataset is used to predict whether a patient is likely to have a stroke based on input parameters such as gender, age, various diseases, and smoking status. Each row in the data provides relevant information about one patient.
The dataset can be found in the repository or downloaded from Kaggle.
data = read.csv("stroke_data.csv")
str(data)
## 'data.frame': 5110 obs. of 10 variables:
## $ gender : chr "Male" "Female" "Male" "Female" ...
## $ age : num 67 61 80 49 79 81 74 69 59 78 ...
## $ hypertension : int 0 0 0 0 1 0 1 0 0 0 ...
## $ heart_disease : int 1 0 1 0 0 0 1 0 0 0 ...
## $ ever_married : chr "Yes" "Yes" "Yes" "Yes" ...
## $ Residence_type : chr "Urban" "Rural" "Rural" "Urban" ...
## $ avg_glucose_level: num 229 202 106 171 174 ...
## $ bmi : chr "36.6" "N/A" "32.5" "34.4" ...
## $ smoking_status : chr "formerly smoked" "never smoked" "never smoked" "smokes" ...
## $ stroke : int 1 1 1 1 1 1 1 1 1 1 ...
glimpse(data)
## Rows: 5,110
## Columns: 10
## $ gender <chr> "Male", "Female", "Male", "Female", "Female", "Male"~
## $ age <dbl> 67, 61, 80, 49, 79, 81, 74, 69, 59, 78, 81, 61, 54, ~
## $ hypertension <int> 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1~
## $ heart_disease <int> 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0~
## $ ever_married <chr> "Yes", "Yes", "Yes", "Yes", "Yes", "Yes", "Yes", "No~
## $ Residence_type <chr> "Urban", "Rural", "Rural", "Urban", "Rural", "Urban"~
## $ avg_glucose_level <dbl> 228.69, 202.21, 105.92, 171.23, 174.12, 186.21, 70.0~
## $ bmi <chr> "36.6", "N/A", "32.5", "34.4", "24", "29", "27.4", "~
## $ smoking_status <chr> "formerly smoked", "never smoked", "never smoked", "~
## $ stroke <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1~
Note: “Unknown” in smoking_status means that the information is unavailable for that patient.
unique(data $ gender)
## [1] "Male" "Female" "Other"
unique(data $ ever_married)
## [1] "Yes" "No"
unique(data $ Residence_type)
## [1] "Urban" "Rural"
unique(data $ smoking_status)
## [1] "formerly smoked" "never smoked" "smokes" "Unknown"
Since each of these character columns takes only a small set of values, we can encode them as numeric codes.
clean_data <- data %>%
  mutate(
    gender = if_else(gender == "Female", 0, if_else(gender == "Male", 1, 2)),
    ever_married = if_else(ever_married == "Yes", 1, 0),
    Residence_type = if_else(Residence_type == "Rural", 0, 1),
    smoking_status = if_else(smoking_status == "never smoked", 0,
                             if_else(smoking_status == "formerly smoked", 1,
                                     if_else(smoking_status == "smokes", 2, 3))))
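The nested if_else() calls can get hard to read. As a side note (our addition, not used in the rest of the analysis), the same encoding can be written with dplyr::case_when(), where the final TRUE branch acts as the catch-all:
clean_data_alt <- data %>% # hypothetical name; equivalent to clean_data above
  mutate(
    gender = case_when(gender == "Female" ~ 0, gender == "Male" ~ 1, TRUE ~ 2),
    ever_married = if_else(ever_married == "Yes", 1, 0),
    Residence_type = if_else(Residence_type == "Rural", 0, 1),
    smoking_status = case_when(
      smoking_status == "never smoked"    ~ 0,
      smoking_status == "formerly smoked" ~ 1,
      smoking_status == "smokes"          ~ 2,
      TRUE                                ~ 3))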
summary(clean_data)
## gender age hypertension heart_disease
## Min. :0.0000 Min. : 0.08 Min. :0.00000 Min. :0.00000
## 1st Qu.:0.0000 1st Qu.:25.00 1st Qu.:0.00000 1st Qu.:0.00000
## Median :0.0000 Median :45.00 Median :0.00000 Median :0.00000
## Mean :0.4143 Mean :43.23 Mean :0.09746 Mean :0.05401
## 3rd Qu.:1.0000 3rd Qu.:61.00 3rd Qu.:0.00000 3rd Qu.:0.00000
## Max. :2.0000 Max. :82.00 Max. :1.00000 Max. :1.00000
## ever_married Residence_type avg_glucose_level bmi
## Min. :0.0000 Min. :0.000 Min. : 55.12 Length:5110
## 1st Qu.:0.0000 1st Qu.:0.000 1st Qu.: 77.25 Class :character
## Median :1.0000 Median :1.000 Median : 91.89 Mode :character
## Mean :0.6562 Mean :0.508 Mean :106.15
## 3rd Qu.:1.0000 3rd Qu.:1.000 3rd Qu.:114.09
## Max. :1.0000 Max. :1.000 Max. :271.74
## smoking_status stroke
## Min. :0.000 Min. :0.00000
## 1st Qu.:0.000 1st Qu.:0.00000
## Median :1.000 Median :0.00000
## Mean :1.388 Mean :0.04873
## 3rd Qu.:3.000 3rd Qu.:0.00000
## Max. :3.000 Max. :1.00000
miss_scan_count(data = data, search = list("N/A", "Unknown"))
## # A tibble: 10 x 2
## Variable n
## <chr> <int>
## 1 gender 0
## 2 age 0
## 3 hypertension 0
## 4 heart_disease 0
## 5 ever_married 0
## 6 Residence_type 0
## 7 avg_glucose_level 0
## 8 bmi 201
## 9 smoking_status 1544
## 10 stroke 0
There are 201 “N/A” strings in the bmi column, which is why that column was parsed as character even though it should be numeric. There are also 1544 “Unknown” values in smoking_status, so we are missing a lot of information in a potentially informative predictor and will have to deal with that later. Let’s replace both with actual NAs and convert bmi to numeric.
clean_data <- replace_with_na(data = clean_data,
                              replace = list(bmi = c("N/A"), smoking_status = c(3))) %>%
  mutate(bmi = as.numeric(bmi))
summary(clean_data)
## gender age hypertension heart_disease
## Min. :0.0000 Min. : 0.08 Min. :0.00000 Min. :0.00000
## 1st Qu.:0.0000 1st Qu.:25.00 1st Qu.:0.00000 1st Qu.:0.00000
## Median :0.0000 Median :45.00 Median :0.00000 Median :0.00000
## Mean :0.4143 Mean :43.23 Mean :0.09746 Mean :0.05401
## 3rd Qu.:1.0000 3rd Qu.:61.00 3rd Qu.:0.00000 3rd Qu.:0.00000
## Max. :2.0000 Max. :82.00 Max. :1.00000 Max. :1.00000
##
## ever_married Residence_type avg_glucose_level bmi
## Min. :0.0000 Min. :0.000 Min. : 55.12 Min. :10.30
## 1st Qu.:0.0000 1st Qu.:0.000 1st Qu.: 77.25 1st Qu.:23.50
## Median :1.0000 Median :1.000 Median : 91.89 Median :28.10
## Mean :0.6562 Mean :0.508 Mean :106.15 Mean :28.89
## 3rd Qu.:1.0000 3rd Qu.:1.000 3rd Qu.:114.09 3rd Qu.:33.10
## Max. :1.0000 Max. :1.000 Max. :271.74 Max. :97.60
## NA's :201
## smoking_status stroke
## Min. :0.0000 Min. :0.00000
## 1st Qu.:0.0000 1st Qu.:0.00000
## Median :0.0000 Median :0.00000
## Mean :0.6907 Mean :0.04873
## 3rd Qu.:1.0000 3rd Qu.:0.00000
## Max. :2.0000 Max. :1.00000
## NA's :1544
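As a numeric cross-check on the NA counts in the summary above, naniar also provides per-variable missingness helpers; a minimal sketch (our addition):
miss_var_summary(clean_data) # number and percentage of missing values per column
gg_miss_var(clean_data) # bar chart of missing values per variable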
superheat(subset(clean_data, select = -c(stroke)),
          scale = TRUE,
          bottom.label.size = 0.5,
          bottom.label.text.angle = 90,
          bottom.label.text.size = 3)
The heatmap does not show a clear association between the missing values and the other variables, so we can treat this missingness as MCAR (missing completely at random). bmi is the only numeric variable with missing data, and its distribution, plotted below, is right skewed (a long tail to the right).
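As an extra sanity check on the MCAR assumption (our addition, not part of the original workflow), we can compare the rate of missing bmi between stroke and non-stroke patients; a large imbalance would argue against MCAR:
clean_data %>%
  group_by(stroke) %>%
  summarise(n = n(), prop_bmi_missing = mean(is.na(bmi))) # share of missing bmi per outcome group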
ggplot(clean_data, aes(x = bmi)) + geom_density(color="black", fill="lightblue") + labs(title = "Distribution of BMI")
## Warning: Removed 201 rows containing non-finite values (stat_density).
ggplot(data, aes(x = factor(gender), fill = factor(gender))) + geom_bar() + theme_classic()
ggplot(clean_data, aes(x = age, y = bmi, color = stroke)) + geom_point() + scale_color_gradient(low = "lightblue", high = "red")
## Warning: Removed 201 rows containing missing values (geom_point).
ggplot(clean_data, aes(x = stroke, y = avg_glucose_level, group = stroke, fill = stroke)) + geom_boxplot()
set.seed(99) # Set a seed for reproducible results
split = sample.split(clean_data $ stroke, SplitRatio = 0.7)
train = subset(clean_data, split == TRUE)
test = subset(clean_data, split == FALSE)
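sample.split() preserves the relative ratio of the labels in the vector it is given, so the stroke rate should be roughly the same in both subsets; a quick check (our addition):
mean(train $ stroke) # stroke prevalence in the training set
mean(test $ stroke) # stroke prevalence in the test set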
logistic_regression_1 = glm(stroke~., data = train, family = 'binomial')
summary(logistic_regression_1)
##
## Call:
## glm(formula = stroke ~ ., family = "binomial", data = train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.1462 -0.3419 -0.1978 -0.1186 3.0953
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -7.648670 0.754584 -10.136 < 2e-16 ***
## gender -0.197661 0.201751 -0.980 0.32722
## age 0.066603 0.007607 8.755 < 2e-16 ***
## hypertension 0.691216 0.212789 3.248 0.00116 **
## heart_disease 0.349765 0.273365 1.279 0.20073
## ever_married -0.401483 0.287007 -1.399 0.16185
## Residence_type -0.053097 0.193027 -0.275 0.78326
## avg_glucose_level 0.004067 0.001664 2.443 0.01455 *
## bmi 0.014647 0.014746 0.993 0.32058
## smoking_status 0.254129 0.125064 2.032 0.04215 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 992.58 on 2389 degrees of freedom
## Residual deviance: 815.57 on 2380 degrees of freedom
## (1187 observations deleted due to missingness)
## AIC: 835.57
##
## Number of Fisher Scoring iterations: 7
Several variables are not significant, so we perform backward elimination, removing the least significant variable at each step. The least significant variable here is Residence_type, with a p-value of 0.78326, so we remove Residence_type first.
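As an aside (our addition, not part of the original workflow), base R’s drop1() reports single-term deletion tests for every predictor in one call (likelihood-ratio rather than the Wald tests shown in the summary), which is a handy cross-check at each elimination step:
drop1(logistic_regression_1, test = "Chisq") # LRT p-value for dropping each term from the full model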
logistic_regression_2 = glm(stroke ~ gender + age + hypertension + heart_disease + ever_married + avg_glucose_level + bmi + smoking_status, data = train, family = 'binomial')
summary(logistic_regression_2)
##
## Call:
## glm(formula = stroke ~ gender + age + hypertension + heart_disease +
## ever_married + avg_glucose_level + bmi + smoking_status,
## family = "binomial", data = train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.1348 -0.3413 -0.1978 -0.1186 3.0857
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -7.670767 0.750364 -10.223 < 2e-16 ***
## gender -0.195520 0.201597 -0.970 0.33212
## age 0.066508 0.007594 8.758 < 2e-16 ***
## hypertension 0.692810 0.212713 3.257 0.00113 **
## heart_disease 0.350092 0.273423 1.280 0.20040
## ever_married -0.399230 0.286831 -1.392 0.16396
## avg_glucose_level 0.004052 0.001664 2.436 0.01485 *
## bmi 0.014700 0.014747 0.997 0.31886
## smoking_status 0.251534 0.124711 2.017 0.04370 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 992.58 on 2389 degrees of freedom
## Residual deviance: 815.65 on 2381 degrees of freedom
## (1187 observations deleted due to missingness)
## AIC: 833.65
##
## Number of Fisher Scoring iterations: 7
Now the least significant variable is gender, with a p-value of 0.33212, so we remove gender next.
logistic_regression_2 = glm(stroke ~ age + hypertension + heart_disease + ever_married + avg_glucose_level + bmi + smoking_status, data = train, family = 'binomial')
summary(logistic_regression_2)
##
## Call:
## glm(formula = stroke ~ age + hypertension + heart_disease + ever_married +
## avg_glucose_level + bmi + smoking_status, family = "binomial",
## data = train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.1760 -0.3427 -0.1969 -0.1183 3.1056
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -7.724040 0.750464 -10.292 < 2e-16 ***
## age 0.066570 0.007599 8.761 < 2e-16 ***
## hypertension 0.692631 0.212493 3.260 0.00112 **
## heart_disease 0.317594 0.270992 1.172 0.24121
## ever_married -0.413988 0.286149 -1.447 0.14797
## avg_glucose_level 0.003888 0.001651 2.354 0.01856 *
## bmi 0.015353 0.014791 1.038 0.29928
## smoking_status 0.234714 0.123694 1.898 0.05776 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 992.58 on 2389 degrees of freedom
## Residual deviance: 816.60 on 2382 degrees of freedom
## (1187 observations deleted due to missingness)
## AIC: 832.6
##
## Number of Fisher Scoring iterations: 7
The least significant variable is now bmi, with a p-value of 0.29928, so we remove bmi.
logistic_regression_2 = glm(stroke ~ age + hypertension + heart_disease + ever_married + avg_glucose_level + smoking_status, data = train, family = 'binomial')
summary(logistic_regression_2)
##
## Call:
## glm(formula = stroke ~ age + hypertension + heart_disease + ever_married +
## avg_glucose_level + smoking_status, family = "binomial",
## data = train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.1298 -0.3561 -0.2082 -0.1257 3.0848
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -7.166651 0.513513 -13.956 < 2e-16 ***
## age 0.066847 0.007042 9.493 < 2e-16 ***
## hypertension 0.575626 0.203317 2.831 0.00464 **
## heart_disease 0.198980 0.261232 0.762 0.44624
## ever_married -0.380014 0.276856 -1.373 0.16987
## avg_glucose_level 0.003918 0.001539 2.545 0.01092 *
## smoking_status 0.179974 0.117392 1.533 0.12525
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1077.58 on 2486 degrees of freedom
## Residual deviance: 893.96 on 2480 degrees of freedom
## (1090 observations deleted due to missingness)
## AIC: 907.96
##
## Number of Fisher Scoring iterations: 7
Note that dropping bmi also brought back the rows that had been excluded only because bmi was missing, so this fit uses more observations (1090 deleted for missingness instead of 1187) and its AIC is not directly comparable to the previous one. The least significant variable is now heart_disease, with a p-value of 0.44624, so we remove heart_disease.
logistic_regression_2 = glm(stroke ~ age + hypertension + ever_married + avg_glucose_level + smoking_status, data = train, family = 'binomial')
summary(logistic_regression_2)
##
## Call:
## glm(formula = stroke ~ age + hypertension + ever_married + avg_glucose_level +
## smoking_status, family = "binomial", data = train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.0774 -0.3597 -0.2084 -0.1251 3.0873
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -7.213448 0.510203 -14.138 < 2e-16 ***
## age 0.067823 0.006918 9.803 < 2e-16 ***
## hypertension 0.579361 0.203039 2.853 0.00432 **
## ever_married -0.398364 0.275464 -1.446 0.14813
## avg_glucose_level 0.004081 0.001524 2.678 0.00740 **
## smoking_status 0.186886 0.116936 1.598 0.11000
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1077.58 on 2486 degrees of freedom
## Residual deviance: 894.53 on 2481 degrees of freedom
## (1090 observations deleted due to missingness)
## AIC: 906.53
##
## Number of Fisher Scoring iterations: 7
The least significant variable is now ever_married, with a p-value of 0.14813, so we remove ever_married.
logistic_regression_2 = glm(stroke ~ age + hypertension + avg_glucose_level + smoking_status, data = train, family = 'binomial')
summary(logistic_regression_2)
##
## Call:
## glm(formula = stroke ~ age + hypertension + avg_glucose_level +
## smoking_status, family = "binomial", data = train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.0285 -0.3647 -0.2113 -0.1189 3.0586
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -7.448500 0.497525 -14.971 < 2e-16 ***
## age 0.066389 0.006947 9.557 < 2e-16 ***
## hypertension 0.587793 0.202469 2.903 0.00369 **
## avg_glucose_level 0.003947 0.001516 2.604 0.00921 **
## smoking_status 0.177741 0.116688 1.523 0.12771
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1077.58 on 2486 degrees of freedom
## Residual deviance: 896.49 on 2482 degrees of freedom
## (1090 observations deleted due to missingness)
## AIC: 906.49
##
## Number of Fisher Scoring iterations: 7
The least significant variable is now smoking_status, with a p-value of 0.12771, so we remove smoking_status.
logistic_regression_2 = glm(stroke ~ age + hypertension + avg_glucose_level, data = train, family = 'binomial')
summary(logistic_regression_2)
##
## Call:
## glm(formula = stroke ~ age + hypertension + avg_glucose_level,
## family = "binomial", data = train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.9796 -0.3283 -0.1788 -0.0880 3.7420
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -7.398490 0.412644 -17.929 < 2e-16 ***
## age 0.067642 0.005925 11.416 < 2e-16 ***
## hypertension 0.389762 0.191117 2.039 0.04141 *
## avg_glucose_level 0.004391 0.001377 3.188 0.00143 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1391.5 on 3576 degrees of freedom
## Residual deviance: 1125.8 on 3573 degrees of freedom
## AIC: 1133.8
##
## Number of Fisher Scoring iterations: 7
We are left with the three most significant predictors, age, hypertension and avg_glucose_level, all with p-values below 0.05; since none of them contain missing values, this final model uses all 3,577 training rows. Because only about 5% of patients in the data had a stroke, a default 0.5 probability cutoff would classify almost everyone as “no stroke”, so we use a lower cutoff of 0.2 when converting the predicted probabilities into classes.
predict_train = predict(logistic_regression_2, type = 'response')
table(train $ stroke, predict_train>0.2)
##
## FALSE TRUE
## 0 3274 129
## 1 136 38
(3274 + 38) / nrow(train)
## [1] 0.9259156
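To see how sensitive the results are to the choice of cutoff, a small sketch (our addition) compares a few candidate cutoffs on the training predictions, trading overall accuracy against the share of true strokes that are caught:
sapply(c(0.1, 0.2, 0.3, 0.5), function(cutoff) {
  pred <- predict_train > cutoff
  c(cutoff = cutoff,
    accuracy = mean(pred == (train $ stroke == 1)), # overall accuracy at this cutoff
    sensitivity = mean(pred[train $ stroke == 1])) # share of actual strokes flagged
})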
predict_test = predict(logistic_regression_2, newdata = test, type = 'response')
table(test $ stroke, predict_test>0.2)
##
## FALSE TRUE
## 0 1414 44
## 1 57 18
(1414 + 18) / nrow(test)
## [1] 0.9341161
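Given the class imbalance, accuracy alone is optimistic: always predicting “no stroke” would already score about 95% on the test set. A minimal sketch (our addition) of sensitivity and specificity from the same confusion table:
conf <- table(test $ stroke, predict_test > 0.2)
sensitivity <- conf["1", "TRUE"] / sum(conf["1", ]) # strokes correctly flagged
specificity <- conf["0", "FALSE"] / sum(conf["0", ]) # non-strokes correctly cleared
c(sensitivity = sensitivity, specificity = specificity)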
rocr_prediction = prediction(predict_test, test $ stroke)
auc = as.numeric(performance(rocr_prediction, 'auc') @ y.values)
auc
## [1] 0.8552629
rocr_performance = performance(rocr_prediction, 'tpr','fpr')
plot(rocr_performance, colorize = TRUE, main = 'ROCR Curve')
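The ROCR performance object also stores the cutoff behind every point on the curve, so we can read off, for example, the cutoff that maximises TPR minus FPR (Youden’s J); a short sketch (our addition):
fpr <- rocr_performance @ x.values[[1]]
tpr <- rocr_performance @ y.values[[1]]
cutoffs <- rocr_performance @ alpha.values[[1]]
cutoffs[which.max(tpr - fpr)] # cutoff with the largest TPR - FPR gap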
with(clean_data, {scatterplot3d(x = age, y = hypertension, z = avg_glucose_level, main = "Stroke Prediction Scatterplot", xlab = "Age", ylab = "Hypertension", zlab = "Average Glucose Level")})
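The point cloud above does not distinguish stroke cases. scatterplot3d() accepts a vector of colours, so a variant like the following (our addition, with arbitrary colour choices) highlights them:
with(clean_data, {scatterplot3d(
  x = age, y = hypertension, z = avg_glucose_level,
  color = ifelse(stroke == 1, "red", "grey80"), # stroke cases in red, all others grey
  pch = 16,
  main = "Stroke Prediction Scatterplot",
  xlab = "Age", ylab = "Hypertension", zlab = "Average Glucose Level")})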