Medical Cost Analysis

medical

code

clinical

A statistical approach to predicting medical cost based on demographics and health conditions

Author

Karl Marquez

Published

July 29, 2025

Modified

August 1, 2025

1 Introduction

The goal of this notebook is to characterize the dataset and ultimately help predict medical charges based on factors like

age, sex, bmi, number of children, smoking status, and region of living.

Let us start with loading the dataset for this analysis.

Show me the code

medicalcost <- read.csv("medical_cost.csv")
medicalcost <- medicalcost %>% 
  mutate(BMI.status = case_when(
    bmi < 18.5 ~ "Underweight",
    bmi >= 18.5 & bmi < 24.9 ~ "Normal",
    bmi >= 25 & bmi < 29.9 ~ "Overweight",
    bmi >= 30 ~ "Obese",
    TRUE ~ "Unknown")) %>%
  mutate(BMI.status = factor(BMI.status, ordered = TRUE,
                             levels = c("Underweight", "Normal", "Overweight", "Obese", "Unknown"))) %>% 
  mutate(children = factor(children, 
                           ordered = TRUE,
                           levels = c("0", "1", "2", "3", "4", "5"))) %>% 
  mutate(sex = factor(sex, ordered = TRUE, levels = c("male", "female"))) %>% 
  mutate(smoker = factor(smoker, ordered = TRUE, levels = c("no", "yes"))) %>% 
  mutate(region = factor(region, ordered = TRUE, levels = c("northeast", "northwest", "southeast", "southwest")))

I added a variable to stratify the BMI into 4 different categories: Underweight, Normal, Overweight, and Obese. I also transformed the sex, bmi status, children, smoking status, region into categorical factors.

2 Exploratory Analysis

2.1 Distribution and Univariate analysis

2.1.1 Univariate Analysis1

Show me the code

age.median <- median(medicalcost$age, na.rm = TRUE)
age.mean <- mean(medicalcost$age, na.rm = TRUE)
age.normality <- shapiro.test(medicalcost$age)
age.distribution <- ggplotly(ggplot(data = medicalcost, aes(x=age)) +
                               geom_histogram(aes(y = ..density..), fill="navy", color="black", bins = 10, alpha = 0.7) +
                               geom_density(color = "#B31312", size = 1, linetype = 2) +
                               geom_vline(xintercept = age.median, color="#054B71", linetype="dashed", linewidth=1) +
                               annotate(geom="text", x=30, y=0.03, color="#054B71", size = 3,
                                        label=paste("Median\n", round(age.median, 1))) +
                               geom_vline(xintercept = age.mean, color="#0086BD", linetype = "dashed", linewidth=1) +
                               annotate(geom="text", x=50, y=0.03, color="#0086BD", size=3,
                                        label=paste("Mean\n", round(age.mean, 1))) +
                               annotate(geom = "text", x=30, y=0.035, size=3,
                                        label=paste("Shapiro-Wilk\nNormality Test =", round(age.normality$p.value, 10))) +
                               labs(x="Age", y="Count", title="Age Distribution") +
                               karl_theme)
### BMI distribution
bmi.median <- median(medicalcost$bmi, na.rm = TRUE)
bmi.mean <- mean(medicalcost$bmi, na.rm = TRUE)
bmi.normality <- shapiro.test(medicalcost$bmi)
bmi.distribution <- ggplotly(ggplot(data = medicalcost, aes(x=bmi)) +
                               geom_histogram(aes(y = ..density..), fill="navy", color="black", bins = 10, alpha = 0.7) +
                               geom_density(color = "#B31312", size = 1, linetype = 2) +
                               geom_vline(xintercept = bmi.median, color="#054B71", linetype="dashed", linewidth=1) +
                               annotate(geom="text", x=45, y=0.03, color="#054B71", size = 3,
                                        label=paste("Median\n", round(bmi.median, 1))) +
                               geom_vline(xintercept = bmi.mean, color="#0086BD", linetype = "dashed", linewidth=1) +
                               annotate(geom="text", x=45, y=0.04, color="#0086BD", size=3,
                                        label=paste("Mean\n", round(bmi.mean, 1))) +
                               annotate(geom = "text", x=45, y=0.06, size=3,
                                        label=paste("Shapiro-Wilk\nNormality Test =", round(bmi.normality$p.value, 10))) +
                               labs(x="BMI", y="Count", title="BMI Distribution") +
                               karl_theme)
### charges distribution
charges.median <- median(medicalcost$charges, na.rm = TRUE)
charges.mean <- mean(medicalcost$charges, na.rm = TRUE)
charges.normality <- shapiro.test(medicalcost$charges)
charges.distribution <- ggplotly(ggplot(data = medicalcost, aes(x=charges)) +
                                   geom_histogram(aes(y = ..density..), fill="navy", color="black", bins = 15, alpha = 0.7) +
                                   geom_density(color = "#B31312", size = 1, linetype = 2) +
                                   geom_vline(xintercept = charges.median, color="#054B71", linetype="dashed", linewidth=1) +
                                   annotate(geom="text", x=30000, y=0.00002, color="#054B71", size = 3,
                                            label=paste("Median\n", round(charges.median, 1))) +
                                   geom_vline(xintercept = charges.mean, color="#0086BD", linetype = "dashed", linewidth=1) +
                                   annotate(geom="text", x=30000, y=0.00003, color="#0086BD", size=3,
                                            label=paste("Mean\n", round(charges.mean, 1))) +
                                   annotate(geom = "text", x=30000, y=0.00004, size=3,
                                            label=paste("Shapiro-Wilk\nNormality Test =", round(charges.normality$p.value, 10))) +
                                   labs(x="Charges ($)", y="Count", title="Charges ($) Distribution") +
                                   scale_x_continuous(breaks = c(10000, 20000, 30000, 40000, 50000, 60000),
                                                      labels = c("$10k", "$20k", "$30k", "$40k", "$50k", "$60k")) +
                                   karl_theme)

BMIstatus <- medicalcost %>% 
  filter(BMI.status != "Unknown")
BMIstatus <- ggplotly(
  ggplot(data = BMIstatus, aes(x=BMI.status)) +
    geom_bar(fill = "navy", color = "black", alpha = 0.7) +
    labs(x = "BMI status",
         y = "Count") +
    scale_x_discrete(breaks = c("Underweight", "Normal", "Overweight", "Obese"),
                     labels = c("Under\nWeight", "Normal", "Under\nWeight", "Obese")) +
    karl_theme)



distribution <- subplot(
  age.distribution, bmi.distribution, charges.distribution, BMIstatus, nrows = 2,
  shareX = FALSE, shareY = FALSE, titleX = TRUE, titleY = TRUE, margin = 0.12
)
distribution <- layout(distribution, 
                     title = list(
                       text = "Age, BMI, and Medical Costs Distribution",
                       font = list(size = 25, family= "serif", face = "bold", weight = "bold")),
                     margin = list(t = 100))
distribution

BMI is the only normally distributed variable. Medical charges are skewed to the right.

2.1.2 Univariate Analysis2

Show me the code

children <- ggplotly(
  ggplot(data = medicalcost, aes(x = children)) +
    geom_bar(fill="navy", alpha = 0.7, color = "black") +
    labs(x="Number of Children", y="Count", 
         title=" Number of Children") +
    karl_theme)

### Sex
sex <- ggplotly(
  ggplot(data = medicalcost, aes(x = sex)) +
    geom_bar(fill="navy", alpha = 0.7, color = "black") +
    labs(x="Sex", y="Count", 
         title="Sex") +
    karl_theme)


### region

region <- ggplotly(
  ggplot(data = medicalcost, aes(x = region)) +
    geom_bar(fill="navy", alpha = 0.7, color = "black") +
    labs(x="Region", y="Count", 
         title="Region") +
    scale_x_discrete(breaks = c("northeast", "northwest", "southeast", "southwest"),
                       labels = c("North\nEast", "North\nWest", "South\nEast", "South\nWest")) +
    karl_theme)

### smoking status
smoker <- ggplotly(
  ggplot(data = medicalcost, aes(x = smoker)) +
    geom_bar(fill="navy", alpha = 0.7, color = "black") +
    labs(x="Smoker", y="Count", 
         title="Smoker") +
    karl_theme)


categories <- subplot(
  children, sex, region, smoker, nrows = 2,
  shareX = FALSE, shareY = FALSE, titleX = TRUE, titleY = TRUE, margin = 0.1
)
categories <- layout(categories, 
                       title = list(
                         text = "Sex, Smoking Status, Number of Children, and Region",
                         font = list(size = 25, family= "serif", face = "bold", weight = "bold")))
categories

2.2 Continuous vs Categorical Variables

2.2.1 Age distribution

Show me the code

age.sex.comparison <- list(c("male", "female")) 
age.sex.plot <- ggplotly(ggplot(data = medicalcost, aes(x = reorder(sex, -age, FUN = median), y=age)) +
                           geom_violin(aes(fill = sex), alpha = 0.5, color = "black") +
                           geom_boxplot(aes(fill = sex), alpha = 0.75, color = "black") +
                           geom_sina(aes(fill = sex), alpha = 0.5, size = 3, color = "white") +
                           labs(x="Sex", y="Age", title = "Age Distribution by Sex") +
                           scale_fill_manual(values = c("#1D24CA", "#B31312")) +
                           scale_color_manual(values = c("#1D24CA", "#B31312")) +
                           #stat_summary(fun.data = mean_cl_boot, geom = "point", color = "black", size = 3) +
                           stat_summary(fun.data = mean_cl_boot, geom = "errorbar", color = "black", size = 1, width = 0.1) +
                           stat_compare_means(comparisons = age.sex.comparison, method = "t.test") +
                           stat_compare_means(method = "t.test", label.y = 65, label.x = 1.5) +
                           karl_theme +
                           theme(legend.position = "none"))


age.smoker.comparison <- list(c("no", "yes"))
age.smoker.plot <- ggplotly(ggplot(data = medicalcost, aes(x = reorder(smoker, -age, FUN = median), y=age)) +
                              geom_violin(aes(fill = smoker), alpha = 0.5, color = "black") +
                              geom_boxplot(aes(fill = smoker), alpha = 0.75, color = "black") +
                              geom_sina(aes(fill = smoker), alpha = 0.5, size = 3, color = "white") +
                              labs(x="Smoker", y="Age", title = "Age Distribution by Smoking Status") +
                              scale_fill_manual(values = c("#1D24CA", "#B31312")) +
                              scale_color_manual(values = c("#1D24CA", "#B31312")) +
                              stat_summary(fun.data = mean_cl_boot, geom = "point", color = "black", size = 3) +
                              stat_summary(fun.data = mean_cl_boot, geom = "errorbar", color = "black", size = 1, width = 0.1) +
                              #stat_compare_means(comparisons = age.smoker.comparison, method = "t.test") +
                              stat_compare_means(method = "t.test", label.y = 62.5, label.x = 1.5) +
                              karl_theme +
                              theme(legend.position = "none"))

age.region.comparison <- list(c("northeast", "northwest"), c("northeast", "southeast"), c("northeast", "southwest"),
                              c("northwest", "southeast"), c("northwest", "southwest"),
                              c("southeast", "southwest"))
age.region.plot <- ggplotly(ggplot(data = medicalcost, aes(x = reorder(region, -age, FUN = median), y=age)) +
                              geom_violin(aes(fill = region), alpha = 0.5, color = "black") +
                              geom_boxplot(aes(fill = region), alpha = 0.75, color = "black") +
                              geom_sina(aes(fill = region), alpha = 0.5, size = 3, color = "white") +
                              labs(x="Region", y="Age", title = "Age Distribution by Region") +
                              scale_fill_manual(values = c("#1D24CA", "#B31312", "#F7C04A", "#539165")) +
                              scale_color_manual(values = c("#1D24CA", "#B31312", "#F7C04A", "#539165")) +
                              stat_summary(fun.data = mean_cl_boot, geom = "point", color = "black", size = 3) +
                              stat_summary(fun.data = mean_cl_boot, geom = "errorbar", color = "black", size = 1, width = 0.1) +
                              #stat_compare_means(comparisons = age.region.comparison, method = "t.test") +
                              stat_compare_means(method = "anova", label.y = 60, label.x = 2) +
                              karl_theme +
                              theme(legend.position = "none"))


age.children.comparison <- list(c("0", "1"), c("0", "2"), c("0", "3"), c("0", "4"), c("0", "5"),
                                c("1", "2"), c("1", "3"), c("1", "4"), c("1", "5"),
                                c("2", "3"), c("2", "4"), c("2", "5"),
                                c("3", "4"), c("3", "5"), c("4", "5"))
age.children.plot <- ggplotly(ggplot(data = medicalcost, aes(x = reorder(children, -age, FUN = median), y=age)) +
                                geom_violin(aes(fill = children), alpha = 0.5, color = "black") +
                                geom_boxplot(aes(fill = children), alpha = 0.75, color = "black") +
                                geom_sina(aes(fill = children), alpha = 0.5, size = 3, color = "white") +
                                labs(x="Children", y="Age", title = "Age Distribution by Number of Children") +
                                scale_fill_manual(values = c("#1D24CA", "#005EF9", "#0083FF", "#00A0F6", "#00B9D0", "#00D0A2")) +
                                scale_color_manual(values = c("#1D24CA", "#005EF9", "#0083FF", "#00A0F6", "#00B9D0", "#00D0A2")) +
                                stat_summary(fun.data = mean_cl_boot, geom = "point", color = "black", size = 3) +
                                stat_summary(fun.data = mean_cl_boot, geom = "errorbar", color = "black", size = 1, width = 0.1) +
                                #stat_compare_means(comparisons = age.children.comparison, method = "t.test") +
                                stat_compare_means(method = "anova", label.y = 65, label.x = 2) +
                                karl_theme +
                                theme(legend.position = "none"))

age.categories <- subplot(
  age.sex.plot, age.smoker.plot, age.region.plot, age.children.plot, nrows = 2,
  shareX = FALSE, shareY = FALSE, titleX = TRUE, titleY = TRUE, margin = 0.1
)
age.categories <- layout(age.categories, 
                     title = list(
                       text = "Age Distribution among Categories",
                       font = list(size = 25, family= "serif", face = "bold", weight = "bold")))
age.categories

2.2.2 BMI distribution

Show me the code

bmi.sex.comparison <- list(c("male", "female")) 
bmi.sex.plot <- ggplotly(ggplot(data = medicalcost, aes(x = reorder(sex, -bmi, FUN = median), y=bmi)) +
                           geom_violin(aes(fill = sex), alpha = 0.5, color = "black") +
                           geom_boxplot(aes(fill = sex), alpha = 0.75, color = "black") +
                           geom_sina(aes(fill = sex), alpha = 0.5, size = 3, color = "white") +
                           labs(x="Sex", y="BMI", title = "BMI Distribution by Sex") +
                           scale_fill_manual(values = c("#1D24CA", "#B31312")) +
                           scale_color_manual(values = c("#1D24CA", "#B31312")) +
                           stat_summary(fun.data = mean_cl_boot, geom = "point", color = "black", size = 3) +
                           stat_summary(fun.data = mean_cl_boot, geom = "errorbar", color = "black", size = 1, width = 0.1) +
                           #stat_compare_means(comparisons = age.sex.comparison, method = "t.test") +
                           stat_compare_means(method = "t.test", label.y = 50, label.x = 1.5) +
                           karl_theme +
                           theme(legend.position = "none"))

bmi.smoker.comparison <- list(c("no", "yes"))
bmi.smoker.plot <- ggplotly(ggplot(data = medicalcost, aes(x = reorder(smoker, -bmi, FUN = median), y=bmi)) +
                              geom_violin(aes(fill = smoker), alpha = 0.5, color = "black") +
                              geom_boxplot(aes(fill = smoker), alpha = 0.75, color = "black") +
                              geom_sina(aes(fill = smoker), alpha = 0.5, size = 3, color = "white") +
                              labs(x="Smoker", y="BMI", title = "BMI Distribution by Smoking Status") +
                              scale_fill_manual(values = c("#1D24CA", "#B31312")) +
                              scale_color_manual(values = c("#1D24CA", "#B31312")) +
                              stat_summary(fun.data = mean_cl_boot, geom = "point", color = "black", size = 3) +
                              stat_summary(fun.data = mean_cl_boot, geom = "errorbar", color = "black", size = 1, width = 0.1) +
                              #stat_compare_means(comparisons = age.smoker.comparison, method = "t.test") +
                              stat_compare_means(method = "t.test", label.y = 50, label.x = 1.5) +
                              karl_theme +
                              theme(legend.position = "none"))

bmi.region.comparison <- list(c("northeast", "northwest"), c("northeast", "southeast"), c("northeast", "southwest"),
                              c("northwest", "southeast"), c("northwest", "southwest"),
                              c("southeast", "southwest"))
bmi.region.plot <- ggplotly(ggplot(data = medicalcost, aes(x = reorder(region, -bmi, FUN = median), y=bmi)) +
                              geom_violin(aes(fill = region), alpha = 0.5, color = "black") +
                              geom_boxplot(aes(fill = region), alpha = 0.75, color = "black") +
                              geom_sina(aes(fill = region), alpha = 0.5, size = 3, color = "white") +
                              labs(x="Region", y="BMI", title = "BMI Distribution by Region") +
                              scale_fill_manual(values = c("#1D24CA", "#B31312", "#F7C04A", "#539165")) +
                              scale_color_manual(values = c("#1D24CA", "#B31312", "#F7C04A", "#539165")) +
                              stat_summary(fun.data = mean_cl_boot, geom = "point", color = "black", size = 3) +
                              stat_summary(fun.data = mean_cl_boot, geom = "errorbar", color = "black", size = 1, width = 0.1) +
                              #stat_compare_means(comparisons = age.region.comparison, method = "t.test") +
                              stat_compare_means(method = "anova", label.y = 50, label.x = 2) +
                              karl_theme +
                              theme(legend.position = "none"))

bmi.children.comparison <- list(c("0", "1"), c("0", "2"), c("0", "3"), c("0", "4"), c("0", "5"),
                                c("1", "2"), c("1", "3"), c("1", "4"), c("1", "5"),
                                c("2", "3"), c("2", "4"), c("2", "5"),
                                c("3", "4"), c("3", "5"), c("4", "5"))
bmi.children.plot <- ggplotly(ggplot(data = medicalcost, aes(x = reorder(children, -bmi, FUN = median), y=bmi)) +
                                geom_violin(aes(fill = children), alpha = 0.5, color = "black") +
                                geom_boxplot(aes(fill = children), alpha = 0.75, color = "black") +
                                geom_sina(aes(fill = children), alpha = 0.5, size = 3, color = "white") +
                                labs(x="Children", y="BMI", title = "BMI Distribution by Number of Children") +
                                scale_fill_manual(values = c("#1D24CA", "#005EF9", "#0083FF", "#00A0F6", "#00B9D0", "#00D0A2")) +
                                scale_color_manual(values = c("#1D24CA", "#005EF9", "#0083FF", "#00A0F6", "#00B9D0", "#00D0A2")) +
                                stat_summary(fun.data = mean_cl_boot, geom = "point", color = "black", size = 3) +
                                stat_summary(fun.data = mean_cl_boot, geom = "errorbar", color = "black", size = 1, width = 0.1) +
                                #stat_compare_means(comparisons = age.children.comparison, method = "t.test") +
                                stat_compare_means(method = "anova", label.y = 50, label.x = 3) +
                                karl_theme +
                                theme(legend.position = "none"))

bmi.categories <- subplot(
  bmi.sex.plot, bmi.smoker.plot, bmi.region.plot, bmi.children.plot, nrows = 2,
  shareX = FALSE, shareY = FALSE, titleX = TRUE, titleY = TRUE, margin = 0.1
)
bmi.categories <- layout(bmi.categories,
                         title = list(
                           text = "BMI Distribution among Categories",
                           font = list(size = 25, family= "serif", face = "bold", weight = "bold")))
bmi.categories

2.2.3 Medical Cost distribution

Show me the code

charges.sex.comparison <- list(c("male", "female")) 
charges.sex.plot <- ggplotly(ggplot(data = medicalcost, aes(x = reorder(sex, -charges, FUN = median), y=charges)) +
                           geom_violin(aes(fill = sex), alpha = 0.5, color = "black") +
                           geom_boxplot(aes(fill = sex), alpha = 0.75, color = "black") +
                           geom_sina(aes(fill = sex), alpha = 0.5, size = 3, color = "white") +
                           labs(x="Sex", y="Charges ($)", title = "Charges Distribution by Sex") +
                           scale_fill_manual(values = c("#1D24CA", "#B31312")) +
                           scale_color_manual(values = c("#1D24CA", "#B31312")) +
                           stat_summary(fun.data = mean_cl_boot, geom = "point", color = "black", size = 3) +
                           stat_summary(fun.data = mean_cl_boot, geom = "errorbar", color = "black", size = 1, width = 0.1) +
                           #stat_compare_means(comparisons = age.sex.comparison, method = "t.test") +
                           stat_compare_means(method = "t.test", label.y = 60000, label.x = 1.5) +
                           karl_theme +
                           theme(legend.position = "none") +
                             scale_y_continuous(breaks = c(20000, 40000, 60000),
                                                labels = c("$20k", "$40k", "$60k")))

charges.smoker.comparison <- list(c("no", "yes"))
charges.smoker.plot <- ggplotly(ggplot(data = medicalcost, aes(x = reorder(smoker, -charges, FUN = median), y=charges)) +
                              geom_violin(aes(fill = smoker), alpha = 0.5, color = "black") +
                              geom_boxplot(aes(fill = smoker), alpha = 0.75, color = "black") +
                              geom_sina(aes(fill = smoker), alpha = 0.5, size = 3, color = "white") +
                              labs(x="Smoker", y="Charges ($)", title = "Charges Distribution by Smoking Status") +
                              scale_fill_manual(values = c("#1D24CA", "#B31312")) +
                              scale_color_manual(values = c("#1D24CA", "#B31312")) +
                              stat_summary(fun.data = mean_cl_boot, geom = "point", color = "black", size = 3) +
                              stat_summary(fun.data = mean_cl_boot, geom = "errorbar", color = "black", size = 1, width = 0.1) +
                              #stat_compare_means(comparisons = age.smoker.comparison, method = "t.test") +
                              stat_compare_means(method = "t.test", label.y = 60000, label.x = 1.5) +
                              karl_theme +
                              theme(legend.position = "none") +
                                scale_y_continuous(breaks = c(20000, 40000, 60000),
                                                   labels = c("$20k", "$40k", "$60k")))

charges.region.comparison <- list(c("northeast", "northwest"), c("northeast", "southeast"), c("northeast", "southwest"),
                              c("northwest", "southeast"), c("northwest", "southwest"),
                              c("southeast", "southwest"))
charges.region.plot <- ggplotly(ggplot(data = medicalcost, aes(x = reorder(region, -charges, FUN = median), y=charges)) +
                              geom_violin(aes(fill = region), alpha = 0.5, color = "black") +
                              geom_boxplot(aes(fill = region), alpha = 0.75, color = "black") +
                              geom_sina(aes(fill = region), alpha = 0.5, size = 3, color = "white") +
                              labs(x="Region", y="Charges ($)", title = "Charges Distribution by Region") +
                              scale_fill_manual(values = c("#1D24CA", "#B31312", "#F7C04A", "#539165")) +
                              scale_color_manual(values = c("#1D24CA", "#B31312", "#F7C04A", "#539165")) +
                              stat_summary(fun.data = mean_cl_boot, geom = "point", color = "black", size = 3) +
                              stat_summary(fun.data = mean_cl_boot, geom = "errorbar", color = "black", size = 1, width = 0.1) +
                              #stat_compare_means(comparisons = age.region.comparison, method = "t.test") +
                              stat_compare_means(method = "anova", label.y = 60000, label.x = 2.5) +
                              karl_theme +
                              theme(legend.position = "none") +
                                scale_y_continuous(breaks = c(20000, 40000, 60000),
                                                   labels = c("$20k", "$40k", "$60k")))

charges.children.comparison <- list(c("0", "1"), c("0", "2"), c("0", "3"), c("0", "4"), c("0", "5"),
                                c("1", "2"), c("1", "3"), c("1", "4"), c("1", "5"),
                                c("2", "3"), c("2", "4"), c("2", "5"),
                                c("3", "4"), c("3", "5"), c("4", "5"))
charges.children.plot <- ggplotly(ggplot(data = medicalcost, aes(x = reorder(children, -charges, FUN = median), y=charges)) +
                                geom_violin(aes(fill = children), alpha = 0.5, color = "black") +
                                geom_boxplot(aes(fill = children), alpha = 0.75, color = "black") +
                                geom_sina(aes(fill = children), alpha = 0.5, size = 3, color = "white") +
                                labs(x="Children", y="Charges ($)", title = "Charges Distribution by Number of Children") +
                                scale_fill_manual(values = c("#1D24CA", "#005EF9", "#0083FF", "#00A0F6", "#00B9D0", "#00D0A2")) +
                                scale_color_manual(values = c("#1D24CA", "#005EF9", "#0083FF", "#00A0F6", "#00B9D0", "#00D0A2")) +
                                stat_summary(fun.data = mean_cl_boot, geom = "point", color = "black", size = 3) +
                                stat_summary(fun.data = mean_cl_boot, geom = "errorbar", color = "black", size = 1, width = 0.1) +
                                #stat_compare_means(comparisons = age.children.comparison, method = "t.test") +
                                stat_compare_means(method = "anova", label.y = 60000, label.x = 2.5) +
                                karl_theme +
                                theme(legend.position = "none") +
                                  scale_y_continuous(breaks = c(20000, 40000, 60000),
                                                     labels = c("$20k", "$40k", "$60k")))
charges.categories <- subplot(
  charges.sex.plot, charges.smoker.plot, charges.region.plot, charges.children.plot, nrows = 2,
  shareX = FALSE, shareY = FALSE, titleX = TRUE, titleY = TRUE, margin = 0.1
)
charges.categories <- layout(charges.categories,
                         title = list(
                           text = "Charges ($) Distribution among Categories",
                           font = list(size = 25, family= "serif", face = "bold", weight = "bold")))
charges.categories

2.2.4 Age and Medical Charge

Show me the code

age.BMIstatus.comparison <- list(c("Underweight", "Normal"),
                                 c("Underweight", "Overweight"),
                                 c("Underweight", "Obese"),
                                 c("Normal", "Overweight"),
                                 c("Normal", "Obese"),
                                 c("Overweight", "Obese"))
age.bmi.status <- age.BMIstatus.plot <- ggplotly(ggplot(data = BMIstatus, aes(x = reorder(BMI.status, -age, FUN = median), y=age)) +
                                 geom_violin(aes(fill = BMI.status), alpha = 0.5, color = "black") +
                                 geom_boxplot(aes(fill = BMI.status), alpha = 0.75, color = "black") +
                                 geom_sina(aes(fill = BMI.status), alpha = 0.5, size = 3, color = "white") +
                                 labs(x="BMI status", y="Age", title = "Age Distribution by BMI status") +
                                 scale_fill_manual(values = c("#1D24CA", "#539165", "#F7C04A","#B31312")) +
                                 scale_color_manual(values = c("#1D24CA", "#539165", "#F7C04A","#B31312")) +
                                 stat_summary(fun.data = mean_cl_boot, geom = "point", color = "black", size = 3) +
                                 stat_summary(fun.data = mean_cl_boot, geom = "errorbar", color = "black", size = 1, width = 0.1) +
                                 #stat_compare_means(comparisons = age.BMIstatus.comparison) +
                                 stat_compare_means(method = "anova", label.y = 62, label.x = 2) +
                                 karl_theme +
                                 theme(legend.position = "none"))

charge.bmistatus <- charges.BMIstatus.plot <- ggplotly(ggplot(data = BMIstatus, aes(x = reorder(BMI.status, -charges, FUN = median), y=charges)) +
                                 geom_violin(aes(fill = BMI.status), alpha = 0.5, color = "black") +
                                 geom_boxplot(aes(fill = BMI.status), alpha = 0.75, color = "black") +
                                 geom_sina(aes(fill = BMI.status), alpha = 0.5, size = 3, color = "white") +
                                 labs(x="BMI status", y="Charges ($)", title = "Charge Distribution by BMI status") +
                                 scale_fill_manual(values = c("#1D24CA", "#539165", "#F7C04A","#B31312")) +
                                 scale_color_manual(values = c("#1D24CA", "#539165", "#F7C04A","#B31312")) +
                                 stat_summary(fun.data = mean_cl_boot, geom = "point", color = "black", size = 3) +
                                 stat_summary(fun.data = mean_cl_boot, geom = "errorbar", color = "black", size = 1, width = 0.1) +
                                 #stat_compare_means(comparisons = age.BMIstatus.comparison) +
                                 stat_compare_means(method = "anova", label.y = 60000, label.x = 1.5) +
                                 karl_theme +
                                 theme(legend.position = "none") +
                                   scale_y_continuous(breaks = c(20000, 40000, 60000),
                                                      labels = c("$20k", "$40k", "$60k")))
BMIstatus <- subplot(
  age.bmi.status, charge.bmistatus, nrows = 1,
  shareX = FALSE, shareY = FALSE, titleX = TRUE, titleY = TRUE, margin = 0.075
)
BMIstatus <- layout(BMIstatus,
                         title = list(
                           text = "Age and Medical Charges among different BMI classification",
                           font = list(size = 25, family= "serif", face = "bold", weight = "bold")))
BMIstatus

2.3 Continuous vs Continuous Variables

2.3.1 Medical Cost vs Age by smoking status

Show me the code

age.vs.charge.by.smoker <- ggplotly(ggplot(data = medicalcost, aes(x = age, y = charges)) +
                                      geom_point(alpha = 0.7, size = 5, color = "white", aes(fill = smoker)) +
                                      labs(title = "Smoker among the highest charged medically",
                                           x = "Age", y = "Medical Cost") +
                                      scale_fill_manual(values = c("no" = "#1D24CA","yes" = "#B31312")) +
                                      scale_y_continuous(breaks = c(20000, 40000, 60000),
                                                         labels = c("$20k", "$40k", "$60k")) +
                                      karl_theme +
                                      theme(legend.position.inside = c(0, 0)))
age.vs.charge.by.smoker

2.3.2 Medical Cost vs Age by BMI status

Show me the code

BMIstatus <- medicalcost %>% 
  filter(BMI.status != "Unknown")
ggplotly(ggplot(data = BMIstatus, aes(x = age, y = charges)) +
           geom_point(alpha = 0.75, size = 3, color = "white", aes(fill = BMI.status)) +
           labs(title = "Obese among the highest charged medically") +
           scale_fill_manual(values = c("Underweight" = "#1D24CA", 
                                         "Normal" = "#539165", 
                                         "Overweight" = "#F7C04A",
                                         "Obese" = "#B31312")) +
           scale_y_continuous(breaks = c(20000, 40000, 60000),
                                    labels = c("$20k", "$40k", "$60k")) +
           facet_wrap(~ BMI.status) +
           karl_theme)

2.3.3 Medical Cost vs BMI by smoking status

Show me the code

bmi.vs.charge.by.smoker <- ggplotly(ggplot(data = medicalcost, aes(x = bmi, y = charges)) +
                            geom_point(alpha = 0.75, size = 5, color = "white", aes(fill = smoker)) +
                            labs(title = "Smoker among the highest medically charged",
                                 x = "BMI", y = "Charges ($)") +
                            scale_fill_manual(values = c("no" = "#1D24CA","yes" = "#B31312")) +
                            scale_y_continuous(breaks = c(20000, 40000, 60000),
                                               labels = c("$20k", "$40k", "$60k")) +
                            karl_theme)
bmi.vs.charge.by.smoker

3 Medical Cost can be explained by Smoking Status and BMI

3.1 3D Plots

Show me the code

BMIstatus.naremoved <- medicalcost %>% 
  filter(BMI.status != "Unknown") %>% 
  mutate(BMI.status = factor(BMI.status,
                                levels = c("Underweight", "Normal",
                                           "Overweight", "Obese")))

age.bmi.charge2 <- plot_ly(
  data = BMIstatus.naremoved, type = "scatter3d", mode = "markers",
  x = ~age, y = ~bmi, z = ~charges, 
  color = ~smoker, colors = c("no" = "#1D24CA", "yes" = "#B31312"),
  symbol = ~BMI.status, symbols = c("Underweight" = "circle",
                                    "Normal" = "square",
                                    "Overweight" = "diamond",
                                    "Obese" = "cross"),
  marker = list(opacity = 0.5), width = 900, height = 800) %>% 
  layout(
    title = list(
      text = "3D ScatterPlot of Age, BMI, and Charges", 
      font = list(size = 20)),
    scene = list(
      xaxis = list(title = "Age", titlefont = list(size = 10)),
      yaxis = list(title = "BMI", titlefont = list(size = 10)),
      zaxis = list(title = "Charges ($)", titlefont = list(size = 10))),
    legend = list(title = list(text = "Smoker Status", font = list(size = 10))),
    margin = list(l = 10, r = 10, b = 10, t = 40), 
    autosize = TRUE)
age.bmi.charge2

Double-positive for Smoker and Obese among the highest medically charged.

--- title: "Medical Cost Analysis" description: "A statistical approach to predicting medical cost based on demographics and health conditions" author: "Karl Marquez" date: 2025-07-29 date-modified: 2025-08-01 image: thumbnail.JPG categories: - medical - code - clinical format: html: code-fold: true code-summary: "Show me the code" code-overflow: wrap code-tools: true toc: true toc-depth: 3 toc-expand: false toc-title: Contents toc-location: right number-sections: true css: "style.css" smooth-scroll: true --- ```{r setup, include=FALSE} knitr::opts_chunk$set(echo = TRUE, warning = FALSE, fig.align = "center") ``` # Introduction | The goal of this notebook is to characterize the dataset and ultimately help predict medical charges based on factors like age, sex, bmi, number of children, smoking status, and region of living. | Let us start with loading the dataset for this analysis. ```{r include=FALSE} library(ggplot2) library(dplyr) library(tidyr) library(lubridate) #to automatically guess the format of the date column library(writexl) library(corrplot) library(Hmisc) library(randomForest) library(tidyverse) library(readxl) library(plotly) library(ggforce) library(ggstatsplot) library(forcats) library(ggpubr) library(rstatix) library(DataExplorer) library(dlookr) library(flextable) library(prettydoc) library(DT) library(networkD3) library(ggrepel) library(leaflet) library(maps) library(ggcorrplot) library(lares) library(lmtest) library(ggsignif) ``` ```{r include=FALSE} karl_theme <- theme_minimal() + theme(plot.title = element_text(size=20, face="bold", family = "serif"), axis.title = element_text(size = 15, face="bold", family = "serif"), axis.text = element_text(size = 10, face="bold", family = "serif"), legend.title = element_text(size=12), legend.text = element_text(size=12, family = "serif"), panel.border = element_rect(colour = "gray", fill = NA, linetype = 2), panel.grid.major.x = element_line(colour = "darkgray", linetype = 2, linewidth = 0.5), panel.grid.minor.x = element_blank(), panel.grid.major.y = element_line(colour = "darkgray", linetype = 2, linewidth = 0.5), panel.grid.minor.y = element_blank(), panel.spacing = unit(5, "lines")) ``` ```{r warning=FALSE} medicalcost <- read.csv("medical_cost.csv") medicalcost <- medicalcost %>% mutate(BMI.status = case_when( bmi < 18.5 ~ "Underweight", bmi >= 18.5 & bmi < 24.9 ~ "Normal", bmi >= 25 & bmi < 29.9 ~ "Overweight", bmi >= 30 ~ "Obese", TRUE ~ "Unknown")) %>% mutate(BMI.status = factor(BMI.status, ordered = TRUE, levels = c("Underweight", "Normal", "Overweight", "Obese", "Unknown"))) %>% mutate(children = factor(children, ordered = TRUE, levels = c("0", "1", "2", "3", "4", "5"))) %>% mutate(sex = factor(sex, ordered = TRUE, levels = c("male", "female"))) %>% mutate(smoker = factor(smoker, ordered = TRUE, levels = c("no", "yes"))) %>% mutate(region = factor(region, ordered = TRUE, levels = c("northeast", "northwest", "southeast", "southwest"))) ``` I added a variable to stratify the BMI into 4 different categories: Underweight, Normal, Overweight, and Obese. I also transformed the sex, bmi status, children, smoking status, region into categorical factors. <br> <br> # Exploratory Analysis ## Distribution and Univariate analysis {.tabset .tabset-pills .tabset-fade} ### Univariate Analysis1 ```{r fig.cap= "BMI is the only normally distributed variable. Medical charges are skewed to the right.", fig.align="center"} age.median <- median(medicalcost$age, na.rm = TRUE) age.mean <- mean(medicalcost$age, na.rm = TRUE) age.normality <- shapiro.test(medicalcost$age) age.distribution <- ggplotly(ggplot(data = medicalcost, aes(x=age)) + geom_histogram(aes(y = ..density..), fill="navy", color="black", bins = 10, alpha = 0.7) + geom_density(color = "#B31312", size = 1, linetype = 2) + geom_vline(xintercept = age.median, color="#054B71", linetype="dashed", linewidth=1) + annotate(geom="text", x=30, y=0.03, color="#054B71", size = 3, label=paste("Median\n", round(age.median, 1))) + geom_vline(xintercept = age.mean, color="#0086BD", linetype = "dashed", linewidth=1) + annotate(geom="text", x=50, y=0.03, color="#0086BD", size=3, label=paste("Mean\n", round(age.mean, 1))) + annotate(geom = "text", x=30, y=0.035, size=3, label=paste("Shapiro-Wilk\nNormality Test =", round(age.normality$p.value, 10))) + labs(x="Age", y="Count", title="Age Distribution") + karl_theme) ### BMI distribution bmi.median <- median(medicalcost$bmi, na.rm = TRUE) bmi.mean <- mean(medicalcost$bmi, na.rm = TRUE) bmi.normality <- shapiro.test(medicalcost$bmi) bmi.distribution <- ggplotly(ggplot(data = medicalcost, aes(x=bmi)) + geom_histogram(aes(y = ..density..), fill="navy", color="black", bins = 10, alpha = 0.7) + geom_density(color = "#B31312", size = 1, linetype = 2) + geom_vline(xintercept = bmi.median, color="#054B71", linetype="dashed", linewidth=1) + annotate(geom="text", x=45, y=0.03, color="#054B71", size = 3, label=paste("Median\n", round(bmi.median, 1))) + geom_vline(xintercept = bmi.mean, color="#0086BD", linetype = "dashed", linewidth=1) + annotate(geom="text", x=45, y=0.04, color="#0086BD", size=3, label=paste("Mean\n", round(bmi.mean, 1))) + annotate(geom = "text", x=45, y=0.06, size=3, label=paste("Shapiro-Wilk\nNormality Test =", round(bmi.normality$p.value, 10))) + labs(x="BMI", y="Count", title="BMI Distribution") + karl_theme) ### charges distribution charges.median <- median(medicalcost$charges, na.rm = TRUE) charges.mean <- mean(medicalcost$charges, na.rm = TRUE) charges.normality <- shapiro.test(medicalcost$charges) charges.distribution <- ggplotly(ggplot(data = medicalcost, aes(x=charges)) + geom_histogram(aes(y = ..density..), fill="navy", color="black", bins = 15, alpha = 0.7) + geom_density(color = "#B31312", size = 1, linetype = 2) + geom_vline(xintercept = charges.median, color="#054B71", linetype="dashed", linewidth=1) + annotate(geom="text", x=30000, y=0.00002, color="#054B71", size = 3, label=paste("Median\n", round(charges.median, 1))) + geom_vline(xintercept = charges.mean, color="#0086BD", linetype = "dashed", linewidth=1) + annotate(geom="text", x=30000, y=0.00003, color="#0086BD", size=3, label=paste("Mean\n", round(charges.mean, 1))) + annotate(geom = "text", x=30000, y=0.00004, size=3, label=paste("Shapiro-Wilk\nNormality Test =", round(charges.normality$p.value, 10))) + labs(x="Charges ($)", y="Count", title="Charges ($) Distribution") + scale_x_continuous(breaks = c(10000, 20000, 30000, 40000, 50000, 60000), labels = c("$10k", "$20k", "$30k", "$40k", "$50k", "$60k")) + karl_theme) BMIstatus <- medicalcost %>% filter(BMI.status != "Unknown") BMIstatus <- ggplotly( ggplot(data = BMIstatus, aes(x=BMI.status)) + geom_bar(fill = "navy", color = "black", alpha = 0.7) + labs(x = "BMI status", y = "Count") + scale_x_discrete(breaks = c("Underweight", "Normal", "Overweight", "Obese"), labels = c("Under\nWeight", "Normal", "Under\nWeight", "Obese")) + karl_theme) distribution <- subplot( age.distribution, bmi.distribution, charges.distribution, BMIstatus, nrows = 2, shareX = FALSE, shareY = FALSE, titleX = TRUE, titleY = TRUE, margin = 0.12 ) distribution <- layout(distribution, title = list( text = "Age, BMI, and Medical Costs Distribution", font = list(size = 25, family= "serif", face = "bold", weight = "bold")), margin = list(t = 100)) distribution ``` ### Univariate Analysis2 ```{r warning=FALSE} children <- ggplotly( ggplot(data = medicalcost, aes(x = children)) + geom_bar(fill="navy", alpha = 0.7, color = "black") + labs(x="Number of Children", y="Count", title=" Number of Children") + karl_theme) ### Sex sex <- ggplotly( ggplot(data = medicalcost, aes(x = sex)) + geom_bar(fill="navy", alpha = 0.7, color = "black") + labs(x="Sex", y="Count", title="Sex") + karl_theme) ### region region <- ggplotly( ggplot(data = medicalcost, aes(x = region)) + geom_bar(fill="navy", alpha = 0.7, color = "black") + labs(x="Region", y="Count", title="Region") + scale_x_discrete(breaks = c("northeast", "northwest", "southeast", "southwest"), labels = c("North\nEast", "North\nWest", "South\nEast", "South\nWest")) + karl_theme) ### smoking status smoker <- ggplotly( ggplot(data = medicalcost, aes(x = smoker)) + geom_bar(fill="navy", alpha = 0.7, color = "black") + labs(x="Smoker", y="Count", title="Smoker") + karl_theme) categories <- subplot( children, sex, region, smoker, nrows = 2, shareX = FALSE, shareY = FALSE, titleX = TRUE, titleY = TRUE, margin = 0.1 ) categories <- layout(categories, title = list( text = "Sex, Smoking Status, Number of Children, and Region", font = list(size = 25, family= "serif", face = "bold", weight = "bold"))) categories ``` <br> ## Continuous vs Categorical Variables {.tabset .tabset-pills .tabset-fade} ### Age distribution ```{r warning=FALSE} age.sex.comparison <- list(c("male", "female")) age.sex.plot <- ggplotly(ggplot(data = medicalcost, aes(x = reorder(sex, -age, FUN = median), y=age)) + geom_violin(aes(fill = sex), alpha = 0.5, color = "black") + geom_boxplot(aes(fill = sex), alpha = 0.75, color = "black") + geom_sina(aes(fill = sex), alpha = 0.5, size = 3, color = "white") + labs(x="Sex", y="Age", title = "Age Distribution by Sex") + scale_fill_manual(values = c("#1D24CA", "#B31312")) + scale_color_manual(values = c("#1D24CA", "#B31312")) + #stat_summary(fun.data = mean_cl_boot, geom = "point", color = "black", size = 3) + stat_summary(fun.data = mean_cl_boot, geom = "errorbar", color = "black", size = 1, width = 0.1) + stat_compare_means(comparisons = age.sex.comparison, method = "t.test") + stat_compare_means(method = "t.test", label.y = 65, label.x = 1.5) + karl_theme + theme(legend.position = "none")) age.smoker.comparison <- list(c("no", "yes")) age.smoker.plot <- ggplotly(ggplot(data = medicalcost, aes(x = reorder(smoker, -age, FUN = median), y=age)) + geom_violin(aes(fill = smoker), alpha = 0.5, color = "black") + geom_boxplot(aes(fill = smoker), alpha = 0.75, color = "black") + geom_sina(aes(fill = smoker), alpha = 0.5, size = 3, color = "white") + labs(x="Smoker", y="Age", title = "Age Distribution by Smoking Status") + scale_fill_manual(values = c("#1D24CA", "#B31312")) + scale_color_manual(values = c("#1D24CA", "#B31312")) + stat_summary(fun.data = mean_cl_boot, geom = "point", color = "black", size = 3) + stat_summary(fun.data = mean_cl_boot, geom = "errorbar", color = "black", size = 1, width = 0.1) + #stat_compare_means(comparisons = age.smoker.comparison, method = "t.test") + stat_compare_means(method = "t.test", label.y = 62.5, label.x = 1.5) + karl_theme + theme(legend.position = "none")) age.region.comparison <- list(c("northeast", "northwest"), c("northeast", "southeast"), c("northeast", "southwest"), c("northwest", "southeast"), c("northwest", "southwest"), c("southeast", "southwest")) age.region.plot <- ggplotly(ggplot(data = medicalcost, aes(x = reorder(region, -age, FUN = median), y=age)) + geom_violin(aes(fill = region), alpha = 0.5, color = "black") + geom_boxplot(aes(fill = region), alpha = 0.75, color = "black") + geom_sina(aes(fill = region), alpha = 0.5, size = 3, color = "white") + labs(x="Region", y="Age", title = "Age Distribution by Region") + scale_fill_manual(values = c("#1D24CA", "#B31312", "#F7C04A", "#539165")) + scale_color_manual(values = c("#1D24CA", "#B31312", "#F7C04A", "#539165")) + stat_summary(fun.data = mean_cl_boot, geom = "point", color = "black", size = 3) + stat_summary(fun.data = mean_cl_boot, geom = "errorbar", color = "black", size = 1, width = 0.1) + #stat_compare_means(comparisons = age.region.comparison, method = "t.test") + stat_compare_means(method = "anova", label.y = 60, label.x = 2) + karl_theme + theme(legend.position = "none")) age.children.comparison <- list(c("0", "1"), c("0", "2"), c("0", "3"), c("0", "4"), c("0", "5"), c("1", "2"), c("1", "3"), c("1", "4"), c("1", "5"), c("2", "3"), c("2", "4"), c("2", "5"), c("3", "4"), c("3", "5"), c("4", "5")) age.children.plot <- ggplotly(ggplot(data = medicalcost, aes(x = reorder(children, -age, FUN = median), y=age)) + geom_violin(aes(fill = children), alpha = 0.5, color = "black") + geom_boxplot(aes(fill = children), alpha = 0.75, color = "black") + geom_sina(aes(fill = children), alpha = 0.5, size = 3, color = "white") + labs(x="Children", y="Age", title = "Age Distribution by Number of Children") + scale_fill_manual(values = c("#1D24CA", "#005EF9", "#0083FF", "#00A0F6", "#00B9D0", "#00D0A2")) + scale_color_manual(values = c("#1D24CA", "#005EF9", "#0083FF", "#00A0F6", "#00B9D0", "#00D0A2")) + stat_summary(fun.data = mean_cl_boot, geom = "point", color = "black", size = 3) + stat_summary(fun.data = mean_cl_boot, geom = "errorbar", color = "black", size = 1, width = 0.1) + #stat_compare_means(comparisons = age.children.comparison, method = "t.test") + stat_compare_means(method = "anova", label.y = 65, label.x = 2) + karl_theme + theme(legend.position = "none")) age.categories <- subplot( age.sex.plot, age.smoker.plot, age.region.plot, age.children.plot, nrows = 2, shareX = FALSE, shareY = FALSE, titleX = TRUE, titleY = TRUE, margin = 0.1 ) age.categories <- layout(age.categories, title = list( text = "Age Distribution among Categories", font = list(size = 25, family= "serif", face = "bold", weight = "bold"))) age.categories ``` ### BMI distribution ```{r warning=FALSE} bmi.sex.comparison <- list(c("male", "female")) bmi.sex.plot <- ggplotly(ggplot(data = medicalcost, aes(x = reorder(sex, -bmi, FUN = median), y=bmi)) + geom_violin(aes(fill = sex), alpha = 0.5, color = "black") + geom_boxplot(aes(fill = sex), alpha = 0.75, color = "black") + geom_sina(aes(fill = sex), alpha = 0.5, size = 3, color = "white") + labs(x="Sex", y="BMI", title = "BMI Distribution by Sex") + scale_fill_manual(values = c("#1D24CA", "#B31312")) + scale_color_manual(values = c("#1D24CA", "#B31312")) + stat_summary(fun.data = mean_cl_boot, geom = "point", color = "black", size = 3) + stat_summary(fun.data = mean_cl_boot, geom = "errorbar", color = "black", size = 1, width = 0.1) + #stat_compare_means(comparisons = age.sex.comparison, method = "t.test") + stat_compare_means(method = "t.test", label.y = 50, label.x = 1.5) + karl_theme + theme(legend.position = "none")) bmi.smoker.comparison <- list(c("no", "yes")) bmi.smoker.plot <- ggplotly(ggplot(data = medicalcost, aes(x = reorder(smoker, -bmi, FUN = median), y=bmi)) + geom_violin(aes(fill = smoker), alpha = 0.5, color = "black") + geom_boxplot(aes(fill = smoker), alpha = 0.75, color = "black") + geom_sina(aes(fill = smoker), alpha = 0.5, size = 3, color = "white") + labs(x="Smoker", y="BMI", title = "BMI Distribution by Smoking Status") + scale_fill_manual(values = c("#1D24CA", "#B31312")) + scale_color_manual(values = c("#1D24CA", "#B31312")) + stat_summary(fun.data = mean_cl_boot, geom = "point", color = "black", size = 3) + stat_summary(fun.data = mean_cl_boot, geom = "errorbar", color = "black", size = 1, width = 0.1) + #stat_compare_means(comparisons = age.smoker.comparison, method = "t.test") + stat_compare_means(method = "t.test", label.y = 50, label.x = 1.5) + karl_theme + theme(legend.position = "none")) bmi.region.comparison <- list(c("northeast", "northwest"), c("northeast", "southeast"), c("northeast", "southwest"), c("northwest", "southeast"), c("northwest", "southwest"), c("southeast", "southwest")) bmi.region.plot <- ggplotly(ggplot(data = medicalcost, aes(x = reorder(region, -bmi, FUN = median), y=bmi)) + geom_violin(aes(fill = region), alpha = 0.5, color = "black") + geom_boxplot(aes(fill = region), alpha = 0.75, color = "black") + geom_sina(aes(fill = region), alpha = 0.5, size = 3, color = "white") + labs(x="Region", y="BMI", title = "BMI Distribution by Region") + scale_fill_manual(values = c("#1D24CA", "#B31312", "#F7C04A", "#539165")) + scale_color_manual(values = c("#1D24CA", "#B31312", "#F7C04A", "#539165")) + stat_summary(fun.data = mean_cl_boot, geom = "point", color = "black", size = 3) + stat_summary(fun.data = mean_cl_boot, geom = "errorbar", color = "black", size = 1, width = 0.1) + #stat_compare_means(comparisons = age.region.comparison, method = "t.test") + stat_compare_means(method = "anova", label.y = 50, label.x = 2) + karl_theme + theme(legend.position = "none")) bmi.children.comparison <- list(c("0", "1"), c("0", "2"), c("0", "3"), c("0", "4"), c("0", "5"), c("1", "2"), c("1", "3"), c("1", "4"), c("1", "5"), c("2", "3"), c("2", "4"), c("2", "5"), c("3", "4"), c("3", "5"), c("4", "5")) bmi.children.plot <- ggplotly(ggplot(data = medicalcost, aes(x = reorder(children, -bmi, FUN = median), y=bmi)) + geom_violin(aes(fill = children), alpha = 0.5, color = "black") + geom_boxplot(aes(fill = children), alpha = 0.75, color = "black") + geom_sina(aes(fill = children), alpha = 0.5, size = 3, color = "white") + labs(x="Children", y="BMI", title = "BMI Distribution by Number of Children") + scale_fill_manual(values = c("#1D24CA", "#005EF9", "#0083FF", "#00A0F6", "#00B9D0", "#00D0A2")) + scale_color_manual(values = c("#1D24CA", "#005EF9", "#0083FF", "#00A0F6", "#00B9D0", "#00D0A2")) + stat_summary(fun.data = mean_cl_boot, geom = "point", color = "black", size = 3) + stat_summary(fun.data = mean_cl_boot, geom = "errorbar", color = "black", size = 1, width = 0.1) + #stat_compare_means(comparisons = age.children.comparison, method = "t.test") + stat_compare_means(method = "anova", label.y = 50, label.x = 3) + karl_theme + theme(legend.position = "none")) bmi.categories <- subplot( bmi.sex.plot, bmi.smoker.plot, bmi.region.plot, bmi.children.plot, nrows = 2, shareX = FALSE, shareY = FALSE, titleX = TRUE, titleY = TRUE, margin = 0.1 ) bmi.categories <- layout(bmi.categories, title = list( text = "BMI Distribution among Categories", font = list(size = 25, family= "serif", face = "bold", weight = "bold"))) bmi.categories ``` ### Medical Cost distribution ```{r warning=FALSE} charges.sex.comparison <- list(c("male", "female")) charges.sex.plot <- ggplotly(ggplot(data = medicalcost, aes(x = reorder(sex, -charges, FUN = median), y=charges)) + geom_violin(aes(fill = sex), alpha = 0.5, color = "black") + geom_boxplot(aes(fill = sex), alpha = 0.75, color = "black") + geom_sina(aes(fill = sex), alpha = 0.5, size = 3, color = "white") + labs(x="Sex", y="Charges ($)", title = "Charges Distribution by Sex") + scale_fill_manual(values = c("#1D24CA", "#B31312")) + scale_color_manual(values = c("#1D24CA", "#B31312")) + stat_summary(fun.data = mean_cl_boot, geom = "point", color = "black", size = 3) + stat_summary(fun.data = mean_cl_boot, geom = "errorbar", color = "black", size = 1, width = 0.1) + #stat_compare_means(comparisons = age.sex.comparison, method = "t.test") + stat_compare_means(method = "t.test", label.y = 60000, label.x = 1.5) + karl_theme + theme(legend.position = "none") + scale_y_continuous(breaks = c(20000, 40000, 60000), labels = c("$20k", "$40k", "$60k"))) charges.smoker.comparison <- list(c("no", "yes")) charges.smoker.plot <- ggplotly(ggplot(data = medicalcost, aes(x = reorder(smoker, -charges, FUN = median), y=charges)) + geom_violin(aes(fill = smoker), alpha = 0.5, color = "black") + geom_boxplot(aes(fill = smoker), alpha = 0.75, color = "black") + geom_sina(aes(fill = smoker), alpha = 0.5, size = 3, color = "white") + labs(x="Smoker", y="Charges ($)", title = "Charges Distribution by Smoking Status") + scale_fill_manual(values = c("#1D24CA", "#B31312")) + scale_color_manual(values = c("#1D24CA", "#B31312")) + stat_summary(fun.data = mean_cl_boot, geom = "point", color = "black", size = 3) + stat_summary(fun.data = mean_cl_boot, geom = "errorbar", color = "black", size = 1, width = 0.1) + #stat_compare_means(comparisons = age.smoker.comparison, method = "t.test") + stat_compare_means(method = "t.test", label.y = 60000, label.x = 1.5) + karl_theme + theme(legend.position = "none") + scale_y_continuous(breaks = c(20000, 40000, 60000), labels = c("$20k", "$40k", "$60k"))) charges.region.comparison <- list(c("northeast", "northwest"), c("northeast", "southeast"), c("northeast", "southwest"), c("northwest", "southeast"), c("northwest", "southwest"), c("southeast", "southwest")) charges.region.plot <- ggplotly(ggplot(data = medicalcost, aes(x = reorder(region, -charges, FUN = median), y=charges)) + geom_violin(aes(fill = region), alpha = 0.5, color = "black") + geom_boxplot(aes(fill = region), alpha = 0.75, color = "black") + geom_sina(aes(fill = region), alpha = 0.5, size = 3, color = "white") + labs(x="Region", y="Charges ($)", title = "Charges Distribution by Region") + scale_fill_manual(values = c("#1D24CA", "#B31312", "#F7C04A", "#539165")) + scale_color_manual(values = c("#1D24CA", "#B31312", "#F7C04A", "#539165")) + stat_summary(fun.data = mean_cl_boot, geom = "point", color = "black", size = 3) + stat_summary(fun.data = mean_cl_boot, geom = "errorbar", color = "black", size = 1, width = 0.1) + #stat_compare_means(comparisons = age.region.comparison, method = "t.test") + stat_compare_means(method = "anova", label.y = 60000, label.x = 2.5) + karl_theme + theme(legend.position = "none") + scale_y_continuous(breaks = c(20000, 40000, 60000), labels = c("$20k", "$40k", "$60k"))) charges.children.comparison <- list(c("0", "1"), c("0", "2"), c("0", "3"), c("0", "4"), c("0", "5"), c("1", "2"), c("1", "3"), c("1", "4"), c("1", "5"), c("2", "3"), c("2", "4"), c("2", "5"), c("3", "4"), c("3", "5"), c("4", "5")) charges.children.plot <- ggplotly(ggplot(data = medicalcost, aes(x = reorder(children, -charges, FUN = median), y=charges)) + geom_violin(aes(fill = children), alpha = 0.5, color = "black") + geom_boxplot(aes(fill = children), alpha = 0.75, color = "black") + geom_sina(aes(fill = children), alpha = 0.5, size = 3, color = "white") + labs(x="Children", y="Charges ($)", title = "Charges Distribution by Number of Children") + scale_fill_manual(values = c("#1D24CA", "#005EF9", "#0083FF", "#00A0F6", "#00B9D0", "#00D0A2")) + scale_color_manual(values = c("#1D24CA", "#005EF9", "#0083FF", "#00A0F6", "#00B9D0", "#00D0A2")) + stat_summary(fun.data = mean_cl_boot, geom = "point", color = "black", size = 3) + stat_summary(fun.data = mean_cl_boot, geom = "errorbar", color = "black", size = 1, width = 0.1) + #stat_compare_means(comparisons = age.children.comparison, method = "t.test") + stat_compare_means(method = "anova", label.y = 60000, label.x = 2.5) + karl_theme + theme(legend.position = "none") + scale_y_continuous(breaks = c(20000, 40000, 60000), labels = c("$20k", "$40k", "$60k"))) charges.categories <- subplot( charges.sex.plot, charges.smoker.plot, charges.region.plot, charges.children.plot, nrows = 2, shareX = FALSE, shareY = FALSE, titleX = TRUE, titleY = TRUE, margin = 0.1 ) charges.categories <- layout(charges.categories, title = list( text = "Charges ($) Distribution among Categories", font = list(size = 25, family= "serif", face = "bold", weight = "bold"))) charges.categories ``` ### Age and Medical Charge ```{r warning=FALSE} age.BMIstatus.comparison <- list(c("Underweight", "Normal"), c("Underweight", "Overweight"), c("Underweight", "Obese"), c("Normal", "Overweight"), c("Normal", "Obese"), c("Overweight", "Obese")) age.bmi.status <- age.BMIstatus.plot <- ggplotly(ggplot(data = BMIstatus, aes(x = reorder(BMI.status, -age, FUN = median), y=age)) + geom_violin(aes(fill = BMI.status), alpha = 0.5, color = "black") + geom_boxplot(aes(fill = BMI.status), alpha = 0.75, color = "black") + geom_sina(aes(fill = BMI.status), alpha = 0.5, size = 3, color = "white") + labs(x="BMI status", y="Age", title = "Age Distribution by BMI status") + scale_fill_manual(values = c("#1D24CA", "#539165", "#F7C04A","#B31312")) + scale_color_manual(values = c("#1D24CA", "#539165", "#F7C04A","#B31312")) + stat_summary(fun.data = mean_cl_boot, geom = "point", color = "black", size = 3) + stat_summary(fun.data = mean_cl_boot, geom = "errorbar", color = "black", size = 1, width = 0.1) + #stat_compare_means(comparisons = age.BMIstatus.comparison) + stat_compare_means(method = "anova", label.y = 62, label.x = 2) + karl_theme + theme(legend.position = "none")) charge.bmistatus <- charges.BMIstatus.plot <- ggplotly(ggplot(data = BMIstatus, aes(x = reorder(BMI.status, -charges, FUN = median), y=charges)) + geom_violin(aes(fill = BMI.status), alpha = 0.5, color = "black") + geom_boxplot(aes(fill = BMI.status), alpha = 0.75, color = "black") + geom_sina(aes(fill = BMI.status), alpha = 0.5, size = 3, color = "white") + labs(x="BMI status", y="Charges ($)", title = "Charge Distribution by BMI status") + scale_fill_manual(values = c("#1D24CA", "#539165", "#F7C04A","#B31312")) + scale_color_manual(values = c("#1D24CA", "#539165", "#F7C04A","#B31312")) + stat_summary(fun.data = mean_cl_boot, geom = "point", color = "black", size = 3) + stat_summary(fun.data = mean_cl_boot, geom = "errorbar", color = "black", size = 1, width = 0.1) + #stat_compare_means(comparisons = age.BMIstatus.comparison) + stat_compare_means(method = "anova", label.y = 60000, label.x = 1.5) + karl_theme + theme(legend.position = "none") + scale_y_continuous(breaks = c(20000, 40000, 60000), labels = c("$20k", "$40k", "$60k"))) BMIstatus <- subplot( age.bmi.status, charge.bmistatus, nrows = 1, shareX = FALSE, shareY = FALSE, titleX = TRUE, titleY = TRUE, margin = 0.075 ) BMIstatus <- layout(BMIstatus, title = list( text = "Age and Medical Charges among different BMI classification", font = list(size = 25, family= "serif", face = "bold", weight = "bold"))) BMIstatus ``` <br> ## Continuous vs Continuous Variables {.tabset .tabset-pills .tabset-fade} ### Medical Cost vs Age by smoking status ```{r} age.vs.charge.by.smoker <- ggplotly(ggplot(data = medicalcost, aes(x = age, y = charges)) + geom_point(alpha = 0.7, size = 5, color = "white", aes(fill = smoker)) + labs(title = "Smoker among the highest charged medically", x = "Age", y = "Medical Cost") + scale_fill_manual(values = c("no" = "#1D24CA","yes" = "#B31312")) + scale_y_continuous(breaks = c(20000, 40000, 60000), labels = c("$20k", "$40k", "$60k")) + karl_theme + theme(legend.position.inside = c(0, 0))) age.vs.charge.by.smoker ``` ### Medical Cost vs Age by BMI status ```{r} BMIstatus <- medicalcost %>% filter(BMI.status != "Unknown") ggplotly(ggplot(data = BMIstatus, aes(x = age, y = charges)) + geom_point(alpha = 0.75, size = 3, color = "white", aes(fill = BMI.status)) + labs(title = "Obese among the highest charged medically") + scale_fill_manual(values = c("Underweight" = "#1D24CA", "Normal" = "#539165", "Overweight" = "#F7C04A", "Obese" = "#B31312")) + scale_y_continuous(breaks = c(20000, 40000, 60000), labels = c("$20k", "$40k", "$60k")) + facet_wrap(~ BMI.status) + karl_theme) ``` ### Medical Cost vs BMI by smoking status ```{r} bmi.vs.charge.by.smoker <- ggplotly(ggplot(data = medicalcost, aes(x = bmi, y = charges)) + geom_point(alpha = 0.75, size = 5, color = "white", aes(fill = smoker)) + labs(title = "Smoker among the highest medically charged", x = "BMI", y = "Charges ($)") + scale_fill_manual(values = c("no" = "#1D24CA","yes" = "#B31312")) + scale_y_continuous(breaks = c(20000, 40000, 60000), labels = c("$20k", "$40k", "$60k")) + karl_theme) bmi.vs.charge.by.smoker ``` # Medical Cost can be explained by Smoking Status and BMI ## 3D Plots ```{r fig.cap= "Double-positive for Smoker and Obese among the highest medically charged."} BMIstatus.naremoved <- medicalcost %>% filter(BMI.status != "Unknown") %>% mutate(BMI.status = factor(BMI.status, levels = c("Underweight", "Normal", "Overweight", "Obese"))) age.bmi.charge2 <- plot_ly( data = BMIstatus.naremoved, type = "scatter3d", mode = "markers", x = ~age, y = ~bmi, z = ~charges, color = ~smoker, colors = c("no" = "#1D24CA", "yes" = "#B31312"), symbol = ~BMI.status, symbols = c("Underweight" = "circle", "Normal" = "square", "Overweight" = "diamond", "Obese" = "cross"), marker = list(opacity = 0.5), width = 900, height = 800) %>% layout( title = list( text = "3D ScatterPlot of Age, BMI, and Charges", font = list(size = 20)), scene = list( xaxis = list(title = "Age", titlefont = list(size = 10)), yaxis = list(title = "BMI", titlefont = list(size = 10)), zaxis = list(title = "Charges ($)", titlefont = list(size = 10))), legend = list(title = list(text = "Smoker Status", font = list(size = 10))), margin = list(l = 10, r = 10, b = 10, t = 40), autosize = TRUE) age.bmi.charge2 ```