---
title: "Medical Cost Analysis"
description: "A statistical approach to predicting medical cost based on demographics and health conditions"
author: "Karl Marquez"
date: 2025-07-29
date-modified: 2025-08-01
image: thumbnail.JPG
categories:
- medical
- code
- clinical
format:
html:
code-fold: true
code-summary: "Show me the code"
code-overflow: wrap
code-tools: true
toc: true
toc-depth: 3
toc-expand: false
toc-title: Contents
toc-location: right
number-sections: true
css: "style.css"
smooth-scroll: true
---
```{r setup, include=FALSE}
knitr:: opts_chunk$ set (echo = TRUE , warning = FALSE , fig.align = "center" )
```
# Introduction
| The goal of this notebook is to characterize the dataset and ultimately help predict medical charges based on factors like
age, sex, bmi, number of children, smoking status, and region of living.
| Let us start with loading the dataset for this analysis.
```{r include=FALSE}
library (ggplot2)
library (dplyr)
library (tidyr)
library (lubridate) #to automatically guess the format of the date column
library (writexl)
library (corrplot)
library (Hmisc)
library (randomForest)
library (tidyverse)
library (readxl)
library (plotly)
library (ggforce)
library (ggstatsplot)
library (forcats)
library (ggpubr)
library (rstatix)
library (DataExplorer)
library (dlookr)
library (flextable)
library (prettydoc)
library (DT)
library (networkD3)
library (ggrepel)
library (leaflet)
library (maps)
library (ggcorrplot)
library (lares)
library (lmtest)
library (ggsignif)
```
```{r include=FALSE}
karl_theme <- theme_minimal () +
theme (plot.title = element_text (size= 20 , face= "bold" , family = "serif" ),
axis.title = element_text (size = 15 , face= "bold" , family = "serif" ),
axis.text = element_text (size = 10 , face= "bold" , family = "serif" ),
legend.title = element_text (size= 12 ),
legend.text = element_text (size= 12 , family = "serif" ),
panel.border = element_rect (colour = "gray" , fill = NA , linetype = 2 ),
panel.grid.major.x = element_line (colour = "darkgray" , linetype = 2 , linewidth = 0.5 ),
panel.grid.minor.x = element_blank (),
panel.grid.major.y = element_line (colour = "darkgray" , linetype = 2 , linewidth = 0.5 ),
panel.grid.minor.y = element_blank (),
panel.spacing = unit (5 , "lines" ))
```
```{r warning=FALSE}
medicalcost <- read.csv ("medical_cost.csv" )
medicalcost <- medicalcost %>%
mutate (BMI.status = case_when (
bmi < 18.5 ~ "Underweight" ,
bmi >= 18.5 & bmi < 24.9 ~ "Normal" ,
bmi >= 25 & bmi < 29.9 ~ "Overweight" ,
bmi >= 30 ~ "Obese" ,
TRUE ~ "Unknown" )) %>%
mutate (BMI.status = factor (BMI.status, ordered = TRUE ,
levels = c ("Underweight" , "Normal" , "Overweight" , "Obese" , "Unknown" ))) %>%
mutate (children = factor (children,
ordered = TRUE ,
levels = c ("0" , "1" , "2" , "3" , "4" , "5" ))) %>%
mutate (sex = factor (sex, ordered = TRUE , levels = c ("male" , "female" ))) %>%
mutate (smoker = factor (smoker, ordered = TRUE , levels = c ("no" , "yes" ))) %>%
mutate (region = factor (region, ordered = TRUE , levels = c ("northeast" , "northwest" , "southeast" , "southwest" )))
```
I added a variable to stratify the BMI into 4 different categories: Underweight, Normal, Overweight, and Obese. I also transformed the sex, bmi status, children, smoking status, region into categorical factors.
< br > < br >
# Exploratory Analysis
## Distribution and Univariate analysis {.tabset .tabset-pills .tabset-fade}
### Univariate Analysis1
```{r fig.cap= "BMI is the only normally distributed variable. Medical charges are skewed to the right.", fig.align="center"}
age.median <- median (medicalcost$ age, na.rm = TRUE )
age.mean <- mean (medicalcost$ age, na.rm = TRUE )
age.normality <- shapiro.test (medicalcost$ age)
age.distribution <- ggplotly (ggplot (data = medicalcost, aes (x= age)) +
geom_histogram (aes (y = ..density..), fill= "navy" , color= "black" , bins = 10 , alpha = 0.7 ) +
geom_density (color = "#B31312" , size = 1 , linetype = 2 ) +
geom_vline (xintercept = age.median, color= "#054B71" , linetype= "dashed" , linewidth= 1 ) +
annotate (geom= "text" , x= 30 , y= 0.03 , color= "#054B71" , size = 3 ,
label= paste ("Median \n " , round (age.median, 1 ))) +
geom_vline (xintercept = age.mean, color= "#0086BD" , linetype = "dashed" , linewidth= 1 ) +
annotate (geom= "text" , x= 50 , y= 0.03 , color= "#0086BD" , size= 3 ,
label= paste ("Mean \n " , round (age.mean, 1 ))) +
annotate (geom = "text" , x= 30 , y= 0.035 , size= 3 ,
label= paste ("Shapiro-Wilk \n Normality Test =" , round (age.normality$ p.value, 10 ))) +
labs (x= "Age" , y= "Count" , title= "Age Distribution" ) +
karl_theme)
### BMI distribution
bmi.median <- median (medicalcost$ bmi, na.rm = TRUE )
bmi.mean <- mean (medicalcost$ bmi, na.rm = TRUE )
bmi.normality <- shapiro.test (medicalcost$ bmi)
bmi.distribution <- ggplotly (ggplot (data = medicalcost, aes (x= bmi)) +
geom_histogram (aes (y = ..density..), fill= "navy" , color= "black" , bins = 10 , alpha = 0.7 ) +
geom_density (color = "#B31312" , size = 1 , linetype = 2 ) +
geom_vline (xintercept = bmi.median, color= "#054B71" , linetype= "dashed" , linewidth= 1 ) +
annotate (geom= "text" , x= 45 , y= 0.03 , color= "#054B71" , size = 3 ,
label= paste ("Median \n " , round (bmi.median, 1 ))) +
geom_vline (xintercept = bmi.mean, color= "#0086BD" , linetype = "dashed" , linewidth= 1 ) +
annotate (geom= "text" , x= 45 , y= 0.04 , color= "#0086BD" , size= 3 ,
label= paste ("Mean \n " , round (bmi.mean, 1 ))) +
annotate (geom = "text" , x= 45 , y= 0.06 , size= 3 ,
label= paste ("Shapiro-Wilk \n Normality Test =" , round (bmi.normality$ p.value, 10 ))) +
labs (x= "BMI" , y= "Count" , title= "BMI Distribution" ) +
karl_theme)
### charges distribution
charges.median <- median (medicalcost$ charges, na.rm = TRUE )
charges.mean <- mean (medicalcost$ charges, na.rm = TRUE )
charges.normality <- shapiro.test (medicalcost$ charges)
charges.distribution <- ggplotly (ggplot (data = medicalcost, aes (x= charges)) +
geom_histogram (aes (y = ..density..), fill= "navy" , color= "black" , bins = 15 , alpha = 0.7 ) +
geom_density (color = "#B31312" , size = 1 , linetype = 2 ) +
geom_vline (xintercept = charges.median, color= "#054B71" , linetype= "dashed" , linewidth= 1 ) +
annotate (geom= "text" , x= 30000 , y= 0.00002 , color= "#054B71" , size = 3 ,
label= paste ("Median \n " , round (charges.median, 1 ))) +
geom_vline (xintercept = charges.mean, color= "#0086BD" , linetype = "dashed" , linewidth= 1 ) +
annotate (geom= "text" , x= 30000 , y= 0.00003 , color= "#0086BD" , size= 3 ,
label= paste ("Mean \n " , round (charges.mean, 1 ))) +
annotate (geom = "text" , x= 30000 , y= 0.00004 , size= 3 ,
label= paste ("Shapiro-Wilk \n Normality Test =" , round (charges.normality$ p.value, 10 ))) +
labs (x= "Charges ($)" , y= "Count" , title= "Charges ($) Distribution" ) +
scale_x_continuous (breaks = c (10000 , 20000 , 30000 , 40000 , 50000 , 60000 ),
labels = c ("$10k" , "$20k" , "$30k" , "$40k" , "$50k" , "$60k" )) +
karl_theme)
BMIstatus <- medicalcost %>%
filter (BMI.status != "Unknown" )
BMIstatus <- ggplotly (
ggplot (data = BMIstatus, aes (x= BMI.status)) +
geom_bar (fill = "navy" , color = "black" , alpha = 0.7 ) +
labs (x = "BMI status" ,
y = "Count" ) +
scale_x_discrete (breaks = c ("Underweight" , "Normal" , "Overweight" , "Obese" ),
labels = c ("Under \n Weight" , "Normal" , "Under \n Weight" , "Obese" )) +
karl_theme)
distribution <- subplot (
age.distribution, bmi.distribution, charges.distribution, BMIstatus, nrows = 2 ,
shareX = FALSE , shareY = FALSE , titleX = TRUE , titleY = TRUE , margin = 0.12
)
distribution <- layout (distribution,
title = list (
text = "Age, BMI, and Medical Costs Distribution" ,
font = list (size = 25 , family= "serif" , face = "bold" , weight = "bold" )),
margin = list (t = 100 ))
distribution
```
### Univariate Analysis2
```{r warning=FALSE}
children <- ggplotly (
ggplot (data = medicalcost, aes (x = children)) +
geom_bar (fill= "navy" , alpha = 0.7 , color = "black" ) +
labs (x= "Number of Children" , y= "Count" ,
title= " Number of Children" ) +
karl_theme)
### Sex
sex <- ggplotly (
ggplot (data = medicalcost, aes (x = sex)) +
geom_bar (fill= "navy" , alpha = 0.7 , color = "black" ) +
labs (x= "Sex" , y= "Count" ,
title= "Sex" ) +
karl_theme)
### region
region <- ggplotly (
ggplot (data = medicalcost, aes (x = region)) +
geom_bar (fill= "navy" , alpha = 0.7 , color = "black" ) +
labs (x= "Region" , y= "Count" ,
title= "Region" ) +
scale_x_discrete (breaks = c ("northeast" , "northwest" , "southeast" , "southwest" ),
labels = c ("North \n East" , "North \n West" , "South \n East" , "South \n West" )) +
karl_theme)
### smoking status
smoker <- ggplotly (
ggplot (data = medicalcost, aes (x = smoker)) +
geom_bar (fill= "navy" , alpha = 0.7 , color = "black" ) +
labs (x= "Smoker" , y= "Count" ,
title= "Smoker" ) +
karl_theme)
categories <- subplot (
children, sex, region, smoker, nrows = 2 ,
shareX = FALSE , shareY = FALSE , titleX = TRUE , titleY = TRUE , margin = 0.1
)
categories <- layout (categories,
title = list (
text = "Sex, Smoking Status, Number of Children, and Region" ,
font = list (size = 25 , family= "serif" , face = "bold" , weight = "bold" )))
categories
```
< br >
## Continuous vs Categorical Variables {.tabset .tabset-pills .tabset-fade}
### Age distribution
```{r warning=FALSE}
age.sex.comparison <- list (c ("male" , "female" ))
age.sex.plot <- ggplotly (ggplot (data = medicalcost, aes (x = reorder (sex, - age, FUN = median), y= age)) +
geom_violin (aes (fill = sex), alpha = 0.5 , color = "black" ) +
geom_boxplot (aes (fill = sex), alpha = 0.75 , color = "black" ) +
geom_sina (aes (fill = sex), alpha = 0.5 , size = 3 , color = "white" ) +
labs (x= "Sex" , y= "Age" , title = "Age Distribution by Sex" ) +
scale_fill_manual (values = c ("#1D24CA" , "#B31312" )) +
scale_color_manual (values = c ("#1D24CA" , "#B31312" )) +
#stat_summary(fun.data = mean_cl_boot, geom = "point", color = "black", size = 3) +
stat_summary (fun.data = mean_cl_boot, geom = "errorbar" , color = "black" , size = 1 , width = 0.1 ) +
stat_compare_means (comparisons = age.sex.comparison, method = "t.test" ) +
stat_compare_means (method = "t.test" , label.y = 65 , label.x = 1.5 ) +
karl_theme +
theme (legend.position = "none" ))
age.smoker.comparison <- list (c ("no" , "yes" ))
age.smoker.plot <- ggplotly (ggplot (data = medicalcost, aes (x = reorder (smoker, - age, FUN = median), y= age)) +
geom_violin (aes (fill = smoker), alpha = 0.5 , color = "black" ) +
geom_boxplot (aes (fill = smoker), alpha = 0.75 , color = "black" ) +
geom_sina (aes (fill = smoker), alpha = 0.5 , size = 3 , color = "white" ) +
labs (x= "Smoker" , y= "Age" , title = "Age Distribution by Smoking Status" ) +
scale_fill_manual (values = c ("#1D24CA" , "#B31312" )) +
scale_color_manual (values = c ("#1D24CA" , "#B31312" )) +
stat_summary (fun.data = mean_cl_boot, geom = "point" , color = "black" , size = 3 ) +
stat_summary (fun.data = mean_cl_boot, geom = "errorbar" , color = "black" , size = 1 , width = 0.1 ) +
#stat_compare_means(comparisons = age.smoker.comparison, method = "t.test") +
stat_compare_means (method = "t.test" , label.y = 62.5 , label.x = 1.5 ) +
karl_theme +
theme (legend.position = "none" ))
age.region.comparison <- list (c ("northeast" , "northwest" ), c ("northeast" , "southeast" ), c ("northeast" , "southwest" ),
c ("northwest" , "southeast" ), c ("northwest" , "southwest" ),
c ("southeast" , "southwest" ))
age.region.plot <- ggplotly (ggplot (data = medicalcost, aes (x = reorder (region, - age, FUN = median), y= age)) +
geom_violin (aes (fill = region), alpha = 0.5 , color = "black" ) +
geom_boxplot (aes (fill = region), alpha = 0.75 , color = "black" ) +
geom_sina (aes (fill = region), alpha = 0.5 , size = 3 , color = "white" ) +
labs (x= "Region" , y= "Age" , title = "Age Distribution by Region" ) +
scale_fill_manual (values = c ("#1D24CA" , "#B31312" , "#F7C04A" , "#539165" )) +
scale_color_manual (values = c ("#1D24CA" , "#B31312" , "#F7C04A" , "#539165" )) +
stat_summary (fun.data = mean_cl_boot, geom = "point" , color = "black" , size = 3 ) +
stat_summary (fun.data = mean_cl_boot, geom = "errorbar" , color = "black" , size = 1 , width = 0.1 ) +
#stat_compare_means(comparisons = age.region.comparison, method = "t.test") +
stat_compare_means (method = "anova" , label.y = 60 , label.x = 2 ) +
karl_theme +
theme (legend.position = "none" ))
age.children.comparison <- list (c ("0" , "1" ), c ("0" , "2" ), c ("0" , "3" ), c ("0" , "4" ), c ("0" , "5" ),
c ("1" , "2" ), c ("1" , "3" ), c ("1" , "4" ), c ("1" , "5" ),
c ("2" , "3" ), c ("2" , "4" ), c ("2" , "5" ),
c ("3" , "4" ), c ("3" , "5" ), c ("4" , "5" ))
age.children.plot <- ggplotly (ggplot (data = medicalcost, aes (x = reorder (children, - age, FUN = median), y= age)) +
geom_violin (aes (fill = children), alpha = 0.5 , color = "black" ) +
geom_boxplot (aes (fill = children), alpha = 0.75 , color = "black" ) +
geom_sina (aes (fill = children), alpha = 0.5 , size = 3 , color = "white" ) +
labs (x= "Children" , y= "Age" , title = "Age Distribution by Number of Children" ) +
scale_fill_manual (values = c ("#1D24CA" , "#005EF9" , "#0083FF" , "#00A0F6" , "#00B9D0" , "#00D0A2" )) +
scale_color_manual (values = c ("#1D24CA" , "#005EF9" , "#0083FF" , "#00A0F6" , "#00B9D0" , "#00D0A2" )) +
stat_summary (fun.data = mean_cl_boot, geom = "point" , color = "black" , size = 3 ) +
stat_summary (fun.data = mean_cl_boot, geom = "errorbar" , color = "black" , size = 1 , width = 0.1 ) +
#stat_compare_means(comparisons = age.children.comparison, method = "t.test") +
stat_compare_means (method = "anova" , label.y = 65 , label.x = 2 ) +
karl_theme +
theme (legend.position = "none" ))
age.categories <- subplot (
age.sex.plot, age.smoker.plot, age.region.plot, age.children.plot, nrows = 2 ,
shareX = FALSE , shareY = FALSE , titleX = TRUE , titleY = TRUE , margin = 0.1
)
age.categories <- layout (age.categories,
title = list (
text = "Age Distribution among Categories" ,
font = list (size = 25 , family= "serif" , face = "bold" , weight = "bold" )))
age.categories
```
### BMI distribution
```{r warning=FALSE}
bmi.sex.comparison <- list (c ("male" , "female" ))
bmi.sex.plot <- ggplotly (ggplot (data = medicalcost, aes (x = reorder (sex, - bmi, FUN = median), y= bmi)) +
geom_violin (aes (fill = sex), alpha = 0.5 , color = "black" ) +
geom_boxplot (aes (fill = sex), alpha = 0.75 , color = "black" ) +
geom_sina (aes (fill = sex), alpha = 0.5 , size = 3 , color = "white" ) +
labs (x= "Sex" , y= "BMI" , title = "BMI Distribution by Sex" ) +
scale_fill_manual (values = c ("#1D24CA" , "#B31312" )) +
scale_color_manual (values = c ("#1D24CA" , "#B31312" )) +
stat_summary (fun.data = mean_cl_boot, geom = "point" , color = "black" , size = 3 ) +
stat_summary (fun.data = mean_cl_boot, geom = "errorbar" , color = "black" , size = 1 , width = 0.1 ) +
#stat_compare_means(comparisons = age.sex.comparison, method = "t.test") +
stat_compare_means (method = "t.test" , label.y = 50 , label.x = 1.5 ) +
karl_theme +
theme (legend.position = "none" ))
bmi.smoker.comparison <- list (c ("no" , "yes" ))
bmi.smoker.plot <- ggplotly (ggplot (data = medicalcost, aes (x = reorder (smoker, - bmi, FUN = median), y= bmi)) +
geom_violin (aes (fill = smoker), alpha = 0.5 , color = "black" ) +
geom_boxplot (aes (fill = smoker), alpha = 0.75 , color = "black" ) +
geom_sina (aes (fill = smoker), alpha = 0.5 , size = 3 , color = "white" ) +
labs (x= "Smoker" , y= "BMI" , title = "BMI Distribution by Smoking Status" ) +
scale_fill_manual (values = c ("#1D24CA" , "#B31312" )) +
scale_color_manual (values = c ("#1D24CA" , "#B31312" )) +
stat_summary (fun.data = mean_cl_boot, geom = "point" , color = "black" , size = 3 ) +
stat_summary (fun.data = mean_cl_boot, geom = "errorbar" , color = "black" , size = 1 , width = 0.1 ) +
#stat_compare_means(comparisons = age.smoker.comparison, method = "t.test") +
stat_compare_means (method = "t.test" , label.y = 50 , label.x = 1.5 ) +
karl_theme +
theme (legend.position = "none" ))
bmi.region.comparison <- list (c ("northeast" , "northwest" ), c ("northeast" , "southeast" ), c ("northeast" , "southwest" ),
c ("northwest" , "southeast" ), c ("northwest" , "southwest" ),
c ("southeast" , "southwest" ))
bmi.region.plot <- ggplotly (ggplot (data = medicalcost, aes (x = reorder (region, - bmi, FUN = median), y= bmi)) +
geom_violin (aes (fill = region), alpha = 0.5 , color = "black" ) +
geom_boxplot (aes (fill = region), alpha = 0.75 , color = "black" ) +
geom_sina (aes (fill = region), alpha = 0.5 , size = 3 , color = "white" ) +
labs (x= "Region" , y= "BMI" , title = "BMI Distribution by Region" ) +
scale_fill_manual (values = c ("#1D24CA" , "#B31312" , "#F7C04A" , "#539165" )) +
scale_color_manual (values = c ("#1D24CA" , "#B31312" , "#F7C04A" , "#539165" )) +
stat_summary (fun.data = mean_cl_boot, geom = "point" , color = "black" , size = 3 ) +
stat_summary (fun.data = mean_cl_boot, geom = "errorbar" , color = "black" , size = 1 , width = 0.1 ) +
#stat_compare_means(comparisons = age.region.comparison, method = "t.test") +
stat_compare_means (method = "anova" , label.y = 50 , label.x = 2 ) +
karl_theme +
theme (legend.position = "none" ))
bmi.children.comparison <- list (c ("0" , "1" ), c ("0" , "2" ), c ("0" , "3" ), c ("0" , "4" ), c ("0" , "5" ),
c ("1" , "2" ), c ("1" , "3" ), c ("1" , "4" ), c ("1" , "5" ),
c ("2" , "3" ), c ("2" , "4" ), c ("2" , "5" ),
c ("3" , "4" ), c ("3" , "5" ), c ("4" , "5" ))
bmi.children.plot <- ggplotly (ggplot (data = medicalcost, aes (x = reorder (children, - bmi, FUN = median), y= bmi)) +
geom_violin (aes (fill = children), alpha = 0.5 , color = "black" ) +
geom_boxplot (aes (fill = children), alpha = 0.75 , color = "black" ) +
geom_sina (aes (fill = children), alpha = 0.5 , size = 3 , color = "white" ) +
labs (x= "Children" , y= "BMI" , title = "BMI Distribution by Number of Children" ) +
scale_fill_manual (values = c ("#1D24CA" , "#005EF9" , "#0083FF" , "#00A0F6" , "#00B9D0" , "#00D0A2" )) +
scale_color_manual (values = c ("#1D24CA" , "#005EF9" , "#0083FF" , "#00A0F6" , "#00B9D0" , "#00D0A2" )) +
stat_summary (fun.data = mean_cl_boot, geom = "point" , color = "black" , size = 3 ) +
stat_summary (fun.data = mean_cl_boot, geom = "errorbar" , color = "black" , size = 1 , width = 0.1 ) +
#stat_compare_means(comparisons = age.children.comparison, method = "t.test") +
stat_compare_means (method = "anova" , label.y = 50 , label.x = 3 ) +
karl_theme +
theme (legend.position = "none" ))
bmi.categories <- subplot (
bmi.sex.plot, bmi.smoker.plot, bmi.region.plot, bmi.children.plot, nrows = 2 ,
shareX = FALSE , shareY = FALSE , titleX = TRUE , titleY = TRUE , margin = 0.1
)
bmi.categories <- layout (bmi.categories,
title = list (
text = "BMI Distribution among Categories" ,
font = list (size = 25 , family= "serif" , face = "bold" , weight = "bold" )))
bmi.categories
```
### Medical Cost distribution
```{r warning=FALSE}
charges.sex.comparison <- list (c ("male" , "female" ))
charges.sex.plot <- ggplotly (ggplot (data = medicalcost, aes (x = reorder (sex, - charges, FUN = median), y= charges)) +
geom_violin (aes (fill = sex), alpha = 0.5 , color = "black" ) +
geom_boxplot (aes (fill = sex), alpha = 0.75 , color = "black" ) +
geom_sina (aes (fill = sex), alpha = 0.5 , size = 3 , color = "white" ) +
labs (x= "Sex" , y= "Charges ($)" , title = "Charges Distribution by Sex" ) +
scale_fill_manual (values = c ("#1D24CA" , "#B31312" )) +
scale_color_manual (values = c ("#1D24CA" , "#B31312" )) +
stat_summary (fun.data = mean_cl_boot, geom = "point" , color = "black" , size = 3 ) +
stat_summary (fun.data = mean_cl_boot, geom = "errorbar" , color = "black" , size = 1 , width = 0.1 ) +
#stat_compare_means(comparisons = age.sex.comparison, method = "t.test") +
stat_compare_means (method = "t.test" , label.y = 60000 , label.x = 1.5 ) +
karl_theme +
theme (legend.position = "none" ) +
scale_y_continuous (breaks = c (20000 , 40000 , 60000 ),
labels = c ("$20k" , "$40k" , "$60k" )))
charges.smoker.comparison <- list (c ("no" , "yes" ))
charges.smoker.plot <- ggplotly (ggplot (data = medicalcost, aes (x = reorder (smoker, - charges, FUN = median), y= charges)) +
geom_violin (aes (fill = smoker), alpha = 0.5 , color = "black" ) +
geom_boxplot (aes (fill = smoker), alpha = 0.75 , color = "black" ) +
geom_sina (aes (fill = smoker), alpha = 0.5 , size = 3 , color = "white" ) +
labs (x= "Smoker" , y= "Charges ($)" , title = "Charges Distribution by Smoking Status" ) +
scale_fill_manual (values = c ("#1D24CA" , "#B31312" )) +
scale_color_manual (values = c ("#1D24CA" , "#B31312" )) +
stat_summary (fun.data = mean_cl_boot, geom = "point" , color = "black" , size = 3 ) +
stat_summary (fun.data = mean_cl_boot, geom = "errorbar" , color = "black" , size = 1 , width = 0.1 ) +
#stat_compare_means(comparisons = age.smoker.comparison, method = "t.test") +
stat_compare_means (method = "t.test" , label.y = 60000 , label.x = 1.5 ) +
karl_theme +
theme (legend.position = "none" ) +
scale_y_continuous (breaks = c (20000 , 40000 , 60000 ),
labels = c ("$20k" , "$40k" , "$60k" )))
charges.region.comparison <- list (c ("northeast" , "northwest" ), c ("northeast" , "southeast" ), c ("northeast" , "southwest" ),
c ("northwest" , "southeast" ), c ("northwest" , "southwest" ),
c ("southeast" , "southwest" ))
charges.region.plot <- ggplotly (ggplot (data = medicalcost, aes (x = reorder (region, - charges, FUN = median), y= charges)) +
geom_violin (aes (fill = region), alpha = 0.5 , color = "black" ) +
geom_boxplot (aes (fill = region), alpha = 0.75 , color = "black" ) +
geom_sina (aes (fill = region), alpha = 0.5 , size = 3 , color = "white" ) +
labs (x= "Region" , y= "Charges ($)" , title = "Charges Distribution by Region" ) +
scale_fill_manual (values = c ("#1D24CA" , "#B31312" , "#F7C04A" , "#539165" )) +
scale_color_manual (values = c ("#1D24CA" , "#B31312" , "#F7C04A" , "#539165" )) +
stat_summary (fun.data = mean_cl_boot, geom = "point" , color = "black" , size = 3 ) +
stat_summary (fun.data = mean_cl_boot, geom = "errorbar" , color = "black" , size = 1 , width = 0.1 ) +
#stat_compare_means(comparisons = age.region.comparison, method = "t.test") +
stat_compare_means (method = "anova" , label.y = 60000 , label.x = 2.5 ) +
karl_theme +
theme (legend.position = "none" ) +
scale_y_continuous (breaks = c (20000 , 40000 , 60000 ),
labels = c ("$20k" , "$40k" , "$60k" )))
charges.children.comparison <- list (c ("0" , "1" ), c ("0" , "2" ), c ("0" , "3" ), c ("0" , "4" ), c ("0" , "5" ),
c ("1" , "2" ), c ("1" , "3" ), c ("1" , "4" ), c ("1" , "5" ),
c ("2" , "3" ), c ("2" , "4" ), c ("2" , "5" ),
c ("3" , "4" ), c ("3" , "5" ), c ("4" , "5" ))
charges.children.plot <- ggplotly (ggplot (data = medicalcost, aes (x = reorder (children, - charges, FUN = median), y= charges)) +
geom_violin (aes (fill = children), alpha = 0.5 , color = "black" ) +
geom_boxplot (aes (fill = children), alpha = 0.75 , color = "black" ) +
geom_sina (aes (fill = children), alpha = 0.5 , size = 3 , color = "white" ) +
labs (x= "Children" , y= "Charges ($)" , title = "Charges Distribution by Number of Children" ) +
scale_fill_manual (values = c ("#1D24CA" , "#005EF9" , "#0083FF" , "#00A0F6" , "#00B9D0" , "#00D0A2" )) +
scale_color_manual (values = c ("#1D24CA" , "#005EF9" , "#0083FF" , "#00A0F6" , "#00B9D0" , "#00D0A2" )) +
stat_summary (fun.data = mean_cl_boot, geom = "point" , color = "black" , size = 3 ) +
stat_summary (fun.data = mean_cl_boot, geom = "errorbar" , color = "black" , size = 1 , width = 0.1 ) +
#stat_compare_means(comparisons = age.children.comparison, method = "t.test") +
stat_compare_means (method = "anova" , label.y = 60000 , label.x = 2.5 ) +
karl_theme +
theme (legend.position = "none" ) +
scale_y_continuous (breaks = c (20000 , 40000 , 60000 ),
labels = c ("$20k" , "$40k" , "$60k" )))
charges.categories <- subplot (
charges.sex.plot, charges.smoker.plot, charges.region.plot, charges.children.plot, nrows = 2 ,
shareX = FALSE , shareY = FALSE , titleX = TRUE , titleY = TRUE , margin = 0.1
)
charges.categories <- layout (charges.categories,
title = list (
text = "Charges ($) Distribution among Categories" ,
font = list (size = 25 , family= "serif" , face = "bold" , weight = "bold" )))
charges.categories
```
### Age and Medical Charge
```{r warning=FALSE}
age.BMIstatus.comparison <- list (c ("Underweight" , "Normal" ),
c ("Underweight" , "Overweight" ),
c ("Underweight" , "Obese" ),
c ("Normal" , "Overweight" ),
c ("Normal" , "Obese" ),
c ("Overweight" , "Obese" ))
age.bmi.status <- age.BMIstatus.plot <- ggplotly (ggplot (data = BMIstatus, aes (x = reorder (BMI.status, - age, FUN = median), y= age)) +
geom_violin (aes (fill = BMI.status), alpha = 0.5 , color = "black" ) +
geom_boxplot (aes (fill = BMI.status), alpha = 0.75 , color = "black" ) +
geom_sina (aes (fill = BMI.status), alpha = 0.5 , size = 3 , color = "white" ) +
labs (x= "BMI status" , y= "Age" , title = "Age Distribution by BMI status" ) +
scale_fill_manual (values = c ("#1D24CA" , "#539165" , "#F7C04A" ,"#B31312" )) +
scale_color_manual (values = c ("#1D24CA" , "#539165" , "#F7C04A" ,"#B31312" )) +
stat_summary (fun.data = mean_cl_boot, geom = "point" , color = "black" , size = 3 ) +
stat_summary (fun.data = mean_cl_boot, geom = "errorbar" , color = "black" , size = 1 , width = 0.1 ) +
#stat_compare_means(comparisons = age.BMIstatus.comparison) +
stat_compare_means (method = "anova" , label.y = 62 , label.x = 2 ) +
karl_theme +
theme (legend.position = "none" ))
charge.bmistatus <- charges.BMIstatus.plot <- ggplotly (ggplot (data = BMIstatus, aes (x = reorder (BMI.status, - charges, FUN = median), y= charges)) +
geom_violin (aes (fill = BMI.status), alpha = 0.5 , color = "black" ) +
geom_boxplot (aes (fill = BMI.status), alpha = 0.75 , color = "black" ) +
geom_sina (aes (fill = BMI.status), alpha = 0.5 , size = 3 , color = "white" ) +
labs (x= "BMI status" , y= "Charges ($)" , title = "Charge Distribution by BMI status" ) +
scale_fill_manual (values = c ("#1D24CA" , "#539165" , "#F7C04A" ,"#B31312" )) +
scale_color_manual (values = c ("#1D24CA" , "#539165" , "#F7C04A" ,"#B31312" )) +
stat_summary (fun.data = mean_cl_boot, geom = "point" , color = "black" , size = 3 ) +
stat_summary (fun.data = mean_cl_boot, geom = "errorbar" , color = "black" , size = 1 , width = 0.1 ) +
#stat_compare_means(comparisons = age.BMIstatus.comparison) +
stat_compare_means (method = "anova" , label.y = 60000 , label.x = 1.5 ) +
karl_theme +
theme (legend.position = "none" ) +
scale_y_continuous (breaks = c (20000 , 40000 , 60000 ),
labels = c ("$20k" , "$40k" , "$60k" )))
BMIstatus <- subplot (
age.bmi.status, charge.bmistatus, nrows = 1 ,
shareX = FALSE , shareY = FALSE , titleX = TRUE , titleY = TRUE , margin = 0.075
)
BMIstatus <- layout (BMIstatus,
title = list (
text = "Age and Medical Charges among different BMI classification" ,
font = list (size = 25 , family= "serif" , face = "bold" , weight = "bold" )))
BMIstatus
```
< br >
## Continuous vs Continuous Variables {.tabset .tabset-pills .tabset-fade}
### Medical Cost vs Age by smoking status
```{r}
age.vs.charge.by.smoker <- ggplotly (ggplot (data = medicalcost, aes (x = age, y = charges)) +
geom_point (alpha = 0.7 , size = 5 , color = "white" , aes (fill = smoker)) +
labs (title = "Smoker among the highest charged medically" ,
x = "Age" , y = "Medical Cost" ) +
scale_fill_manual (values = c ("no" = "#1D24CA" ,"yes" = "#B31312" )) +
scale_y_continuous (breaks = c (20000 , 40000 , 60000 ),
labels = c ("$20k" , "$40k" , "$60k" )) +
karl_theme +
theme (legend.position.inside = c (0 , 0 )))
age.vs.charge.by.smoker
```
### Medical Cost vs Age by BMI status
```{r}
BMIstatus <- medicalcost %>%
filter (BMI.status != "Unknown" )
ggplotly (ggplot (data = BMIstatus, aes (x = age, y = charges)) +
geom_point (alpha = 0.75 , size = 3 , color = "white" , aes (fill = BMI.status)) +
labs (title = "Obese among the highest charged medically" ) +
scale_fill_manual (values = c ("Underweight" = "#1D24CA" ,
"Normal" = "#539165" ,
"Overweight" = "#F7C04A" ,
"Obese" = "#B31312" )) +
scale_y_continuous (breaks = c (20000 , 40000 , 60000 ),
labels = c ("$20k" , "$40k" , "$60k" )) +
facet_wrap (~ BMI.status) +
karl_theme)
```
### Medical Cost vs BMI by smoking status
```{r}
bmi.vs.charge.by.smoker <- ggplotly (ggplot (data = medicalcost, aes (x = bmi, y = charges)) +
geom_point (alpha = 0.75 , size = 5 , color = "white" , aes (fill = smoker)) +
labs (title = "Smoker among the highest medically charged" ,
x = "BMI" , y = "Charges ($)" ) +
scale_fill_manual (values = c ("no" = "#1D24CA" ,"yes" = "#B31312" )) +
scale_y_continuous (breaks = c (20000 , 40000 , 60000 ),
labels = c ("$20k" , "$40k" , "$60k" )) +
karl_theme)
bmi.vs.charge.by.smoker
```
# Medical Cost can be explained by Smoking Status and BMI
## 3D Plots
```{r fig.cap= "Double-positive for Smoker and Obese among the highest medically charged."}
BMIstatus.naremoved <- medicalcost %>%
filter (BMI.status != "Unknown" ) %>%
mutate (BMI.status = factor (BMI.status,
levels = c ("Underweight" , "Normal" ,
"Overweight" , "Obese" )))
age.bmi.charge2 <- plot_ly (
data = BMIstatus.naremoved, type = "scatter3d" , mode = "markers" ,
x = ~ age, y = ~ bmi, z = ~ charges,
color = ~ smoker, colors = c ("no" = "#1D24CA" , "yes" = "#B31312" ),
symbol = ~ BMI.status, symbols = c ("Underweight" = "circle" ,
"Normal" = "square" ,
"Overweight" = "diamond" ,
"Obese" = "cross" ),
marker = list (opacity = 0.5 ), width = 900 , height = 800 ) %>%
layout (
title = list (
text = "3D ScatterPlot of Age, BMI, and Charges" ,
font = list (size = 20 )),
scene = list (
xaxis = list (title = "Age" , titlefont = list (size = 10 )),
yaxis = list (title = "BMI" , titlefont = list (size = 10 )),
zaxis = list (title = "Charges ($)" , titlefont = list (size = 10 ))),
legend = list (title = list (text = "Smoker Status" , font = list (size = 10 ))),
margin = list (l = 10 , r = 10 , b = 10 , t = 40 ),
autosize = TRUE )
age.bmi.charge2
```