Medical Cost Analysis

medical
code
clinical
A statistical approach to predicting medical cost based on demographics and health conditions
Author

Karl Marquez

Published

July 29, 2025

Modified

August 1, 2025

1 Introduction

  The goal of this notebook is to characterize the dataset and ultimately help predict medical charges based on factors like

age, sex, bmi, number of children, smoking status, and region of living.

  Let us start with loading the dataset for this analysis.
Show me the code
medicalcost <- read.csv("medical_cost.csv")
medicalcost <- medicalcost %>% 
  mutate(BMI.status = case_when(
    bmi < 18.5 ~ "Underweight",
    bmi >= 18.5 & bmi < 24.9 ~ "Normal",
    bmi >= 25 & bmi < 29.9 ~ "Overweight",
    bmi >= 30 ~ "Obese",
    TRUE ~ "Unknown")) %>%
  mutate(BMI.status = factor(BMI.status, ordered = TRUE,
                             levels = c("Underweight", "Normal", "Overweight", "Obese", "Unknown"))) %>% 
  mutate(children = factor(children, 
                           ordered = TRUE,
                           levels = c("0", "1", "2", "3", "4", "5"))) %>% 
  mutate(sex = factor(sex, ordered = TRUE, levels = c("male", "female"))) %>% 
  mutate(smoker = factor(smoker, ordered = TRUE, levels = c("no", "yes"))) %>% 
  mutate(region = factor(region, ordered = TRUE, levels = c("northeast", "northwest", "southeast", "southwest")))

I added a variable to stratify the BMI into 4 different categories: Underweight, Normal, Overweight, and Obese. I also transformed the sex, bmi status, children, smoking status, region into categorical factors.



2 Exploratory Analysis

2.1 Distribution and Univariate analysis

2.1.1 Univariate Analysis1

Show me the code
age.median <- median(medicalcost$age, na.rm = TRUE)
age.mean <- mean(medicalcost$age, na.rm = TRUE)
age.normality <- shapiro.test(medicalcost$age)
age.distribution <- ggplotly(ggplot(data = medicalcost, aes(x=age)) +
                               geom_histogram(aes(y = ..density..), fill="navy", color="black", bins = 10, alpha = 0.7) +
                               geom_density(color = "#B31312", size = 1, linetype = 2) +
                               geom_vline(xintercept = age.median, color="#054B71", linetype="dashed", linewidth=1) +
                               annotate(geom="text", x=30, y=0.03, color="#054B71", size = 3,
                                        label=paste("Median\n", round(age.median, 1))) +
                               geom_vline(xintercept = age.mean, color="#0086BD", linetype = "dashed", linewidth=1) +
                               annotate(geom="text", x=50, y=0.03, color="#0086BD", size=3,
                                        label=paste("Mean\n", round(age.mean, 1))) +
                               annotate(geom = "text", x=30, y=0.035, size=3,
                                        label=paste("Shapiro-Wilk\nNormality Test =", round(age.normality$p.value, 10))) +
                               labs(x="Age", y="Count", title="Age Distribution") +
                               karl_theme)
### BMI distribution
bmi.median <- median(medicalcost$bmi, na.rm = TRUE)
bmi.mean <- mean(medicalcost$bmi, na.rm = TRUE)
bmi.normality <- shapiro.test(medicalcost$bmi)
bmi.distribution <- ggplotly(ggplot(data = medicalcost, aes(x=bmi)) +
                               geom_histogram(aes(y = ..density..), fill="navy", color="black", bins = 10, alpha = 0.7) +
                               geom_density(color = "#B31312", size = 1, linetype = 2) +
                               geom_vline(xintercept = bmi.median, color="#054B71", linetype="dashed", linewidth=1) +
                               annotate(geom="text", x=45, y=0.03, color="#054B71", size = 3,
                                        label=paste("Median\n", round(bmi.median, 1))) +
                               geom_vline(xintercept = bmi.mean, color="#0086BD", linetype = "dashed", linewidth=1) +
                               annotate(geom="text", x=45, y=0.04, color="#0086BD", size=3,
                                        label=paste("Mean\n", round(bmi.mean, 1))) +
                               annotate(geom = "text", x=45, y=0.06, size=3,
                                        label=paste("Shapiro-Wilk\nNormality Test =", round(bmi.normality$p.value, 10))) +
                               labs(x="BMI", y="Count", title="BMI Distribution") +
                               karl_theme)
### charges distribution
charges.median <- median(medicalcost$charges, na.rm = TRUE)
charges.mean <- mean(medicalcost$charges, na.rm = TRUE)
charges.normality <- shapiro.test(medicalcost$charges)
charges.distribution <- ggplotly(ggplot(data = medicalcost, aes(x=charges)) +
                                   geom_histogram(aes(y = ..density..), fill="navy", color="black", bins = 15, alpha = 0.7) +
                                   geom_density(color = "#B31312", size = 1, linetype = 2) +
                                   geom_vline(xintercept = charges.median, color="#054B71", linetype="dashed", linewidth=1) +
                                   annotate(geom="text", x=30000, y=0.00002, color="#054B71", size = 3,
                                            label=paste("Median\n", round(charges.median, 1))) +
                                   geom_vline(xintercept = charges.mean, color="#0086BD", linetype = "dashed", linewidth=1) +
                                   annotate(geom="text", x=30000, y=0.00003, color="#0086BD", size=3,
                                            label=paste("Mean\n", round(charges.mean, 1))) +
                                   annotate(geom = "text", x=30000, y=0.00004, size=3,
                                            label=paste("Shapiro-Wilk\nNormality Test =", round(charges.normality$p.value, 10))) +
                                   labs(x="Charges ($)", y="Count", title="Charges ($) Distribution") +
                                   scale_x_continuous(breaks = c(10000, 20000, 30000, 40000, 50000, 60000),
                                                      labels = c("$10k", "$20k", "$30k", "$40k", "$50k", "$60k")) +
                                   karl_theme)

BMIstatus <- medicalcost %>% 
  filter(BMI.status != "Unknown")
BMIstatus <- ggplotly(
  ggplot(data = BMIstatus, aes(x=BMI.status)) +
    geom_bar(fill = "navy", color = "black", alpha = 0.7) +
    labs(x = "BMI status",
         y = "Count") +
    scale_x_discrete(breaks = c("Underweight", "Normal", "Overweight", "Obese"),
                     labels = c("Under\nWeight", "Normal", "Under\nWeight", "Obese")) +
    karl_theme)



distribution <- subplot(
  age.distribution, bmi.distribution, charges.distribution, BMIstatus, nrows = 2,
  shareX = FALSE, shareY = FALSE, titleX = TRUE, titleY = TRUE, margin = 0.12
)
distribution <- layout(distribution, 
                     title = list(
                       text = "Age, BMI, and Medical Costs Distribution",
                       font = list(size = 25, family= "serif", face = "bold", weight = "bold")),
                     margin = list(t = 100))
distribution

BMI is the only normally distributed variable. Medical charges are skewed to the right.

2.1.2 Univariate Analysis2

Show me the code
children <- ggplotly(
  ggplot(data = medicalcost, aes(x = children)) +
    geom_bar(fill="navy", alpha = 0.7, color = "black") +
    labs(x="Number of Children", y="Count", 
         title=" Number of Children") +
    karl_theme)

### Sex
sex <- ggplotly(
  ggplot(data = medicalcost, aes(x = sex)) +
    geom_bar(fill="navy", alpha = 0.7, color = "black") +
    labs(x="Sex", y="Count", 
         title="Sex") +
    karl_theme)


### region

region <- ggplotly(
  ggplot(data = medicalcost, aes(x = region)) +
    geom_bar(fill="navy", alpha = 0.7, color = "black") +
    labs(x="Region", y="Count", 
         title="Region") +
    scale_x_discrete(breaks = c("northeast", "northwest", "southeast", "southwest"),
                       labels = c("North\nEast", "North\nWest", "South\nEast", "South\nWest")) +
    karl_theme)

### smoking status
smoker <- ggplotly(
  ggplot(data = medicalcost, aes(x = smoker)) +
    geom_bar(fill="navy", alpha = 0.7, color = "black") +
    labs(x="Smoker", y="Count", 
         title="Smoker") +
    karl_theme)


categories <- subplot(
  children, sex, region, smoker, nrows = 2,
  shareX = FALSE, shareY = FALSE, titleX = TRUE, titleY = TRUE, margin = 0.1
)
categories <- layout(categories, 
                       title = list(
                         text = "Sex, Smoking Status, Number of Children, and Region",
                         font = list(size = 25, family= "serif", face = "bold", weight = "bold")))
categories


2.2 Continuous vs Categorical Variables

2.2.1 Age distribution

Show me the code
age.sex.comparison <- list(c("male", "female")) 
age.sex.plot <- ggplotly(ggplot(data = medicalcost, aes(x = reorder(sex, -age, FUN = median), y=age)) +
                           geom_violin(aes(fill = sex), alpha = 0.5, color = "black") +
                           geom_boxplot(aes(fill = sex), alpha = 0.75, color = "black") +
                           geom_sina(aes(fill = sex), alpha = 0.5, size = 3, color = "white") +
                           labs(x="Sex", y="Age", title = "Age Distribution by Sex") +
                           scale_fill_manual(values = c("#1D24CA", "#B31312")) +
                           scale_color_manual(values = c("#1D24CA", "#B31312")) +
                           #stat_summary(fun.data = mean_cl_boot, geom = "point", color = "black", size = 3) +
                           stat_summary(fun.data = mean_cl_boot, geom = "errorbar", color = "black", size = 1, width = 0.1) +
                           stat_compare_means(comparisons = age.sex.comparison, method = "t.test") +
                           stat_compare_means(method = "t.test", label.y = 65, label.x = 1.5) +
                           karl_theme +
                           theme(legend.position = "none"))


age.smoker.comparison <- list(c("no", "yes"))
age.smoker.plot <- ggplotly(ggplot(data = medicalcost, aes(x = reorder(smoker, -age, FUN = median), y=age)) +
                              geom_violin(aes(fill = smoker), alpha = 0.5, color = "black") +
                              geom_boxplot(aes(fill = smoker), alpha = 0.75, color = "black") +
                              geom_sina(aes(fill = smoker), alpha = 0.5, size = 3, color = "white") +
                              labs(x="Smoker", y="Age", title = "Age Distribution by Smoking Status") +
                              scale_fill_manual(values = c("#1D24CA", "#B31312")) +
                              scale_color_manual(values = c("#1D24CA", "#B31312")) +
                              stat_summary(fun.data = mean_cl_boot, geom = "point", color = "black", size = 3) +
                              stat_summary(fun.data = mean_cl_boot, geom = "errorbar", color = "black", size = 1, width = 0.1) +
                              #stat_compare_means(comparisons = age.smoker.comparison, method = "t.test") +
                              stat_compare_means(method = "t.test", label.y = 62.5, label.x = 1.5) +
                              karl_theme +
                              theme(legend.position = "none"))

age.region.comparison <- list(c("northeast", "northwest"), c("northeast", "southeast"), c("northeast", "southwest"),
                              c("northwest", "southeast"), c("northwest", "southwest"),
                              c("southeast", "southwest"))
age.region.plot <- ggplotly(ggplot(data = medicalcost, aes(x = reorder(region, -age, FUN = median), y=age)) +
                              geom_violin(aes(fill = region), alpha = 0.5, color = "black") +
                              geom_boxplot(aes(fill = region), alpha = 0.75, color = "black") +
                              geom_sina(aes(fill = region), alpha = 0.5, size = 3, color = "white") +
                              labs(x="Region", y="Age", title = "Age Distribution by Region") +
                              scale_fill_manual(values = c("#1D24CA", "#B31312", "#F7C04A", "#539165")) +
                              scale_color_manual(values = c("#1D24CA", "#B31312", "#F7C04A", "#539165")) +
                              stat_summary(fun.data = mean_cl_boot, geom = "point", color = "black", size = 3) +
                              stat_summary(fun.data = mean_cl_boot, geom = "errorbar", color = "black", size = 1, width = 0.1) +
                              #stat_compare_means(comparisons = age.region.comparison, method = "t.test") +
                              stat_compare_means(method = "anova", label.y = 60, label.x = 2) +
                              karl_theme +
                              theme(legend.position = "none"))


age.children.comparison <- list(c("0", "1"), c("0", "2"), c("0", "3"), c("0", "4"), c("0", "5"),
                                c("1", "2"), c("1", "3"), c("1", "4"), c("1", "5"),
                                c("2", "3"), c("2", "4"), c("2", "5"),
                                c("3", "4"), c("3", "5"), c("4", "5"))
age.children.plot <- ggplotly(ggplot(data = medicalcost, aes(x = reorder(children, -age, FUN = median), y=age)) +
                                geom_violin(aes(fill = children), alpha = 0.5, color = "black") +
                                geom_boxplot(aes(fill = children), alpha = 0.75, color = "black") +
                                geom_sina(aes(fill = children), alpha = 0.5, size = 3, color = "white") +
                                labs(x="Children", y="Age", title = "Age Distribution by Number of Children") +
                                scale_fill_manual(values = c("#1D24CA", "#005EF9", "#0083FF", "#00A0F6", "#00B9D0", "#00D0A2")) +
                                scale_color_manual(values = c("#1D24CA", "#005EF9", "#0083FF", "#00A0F6", "#00B9D0", "#00D0A2")) +
                                stat_summary(fun.data = mean_cl_boot, geom = "point", color = "black", size = 3) +
                                stat_summary(fun.data = mean_cl_boot, geom = "errorbar", color = "black", size = 1, width = 0.1) +
                                #stat_compare_means(comparisons = age.children.comparison, method = "t.test") +
                                stat_compare_means(method = "anova", label.y = 65, label.x = 2) +
                                karl_theme +
                                theme(legend.position = "none"))

age.categories <- subplot(
  age.sex.plot, age.smoker.plot, age.region.plot, age.children.plot, nrows = 2,
  shareX = FALSE, shareY = FALSE, titleX = TRUE, titleY = TRUE, margin = 0.1
)
age.categories <- layout(age.categories, 
                     title = list(
                       text = "Age Distribution among Categories",
                       font = list(size = 25, family= "serif", face = "bold", weight = "bold")))
age.categories

2.2.2 BMI distribution

Show me the code
bmi.sex.comparison <- list(c("male", "female")) 
bmi.sex.plot <- ggplotly(ggplot(data = medicalcost, aes(x = reorder(sex, -bmi, FUN = median), y=bmi)) +
                           geom_violin(aes(fill = sex), alpha = 0.5, color = "black") +
                           geom_boxplot(aes(fill = sex), alpha = 0.75, color = "black") +
                           geom_sina(aes(fill = sex), alpha = 0.5, size = 3, color = "white") +
                           labs(x="Sex", y="BMI", title = "BMI Distribution by Sex") +
                           scale_fill_manual(values = c("#1D24CA", "#B31312")) +
                           scale_color_manual(values = c("#1D24CA", "#B31312")) +
                           stat_summary(fun.data = mean_cl_boot, geom = "point", color = "black", size = 3) +
                           stat_summary(fun.data = mean_cl_boot, geom = "errorbar", color = "black", size = 1, width = 0.1) +
                           #stat_compare_means(comparisons = age.sex.comparison, method = "t.test") +
                           stat_compare_means(method = "t.test", label.y = 50, label.x = 1.5) +
                           karl_theme +
                           theme(legend.position = "none"))

bmi.smoker.comparison <- list(c("no", "yes"))
bmi.smoker.plot <- ggplotly(ggplot(data = medicalcost, aes(x = reorder(smoker, -bmi, FUN = median), y=bmi)) +
                              geom_violin(aes(fill = smoker), alpha = 0.5, color = "black") +
                              geom_boxplot(aes(fill = smoker), alpha = 0.75, color = "black") +
                              geom_sina(aes(fill = smoker), alpha = 0.5, size = 3, color = "white") +
                              labs(x="Smoker", y="BMI", title = "BMI Distribution by Smoking Status") +
                              scale_fill_manual(values = c("#1D24CA", "#B31312")) +
                              scale_color_manual(values = c("#1D24CA", "#B31312")) +
                              stat_summary(fun.data = mean_cl_boot, geom = "point", color = "black", size = 3) +
                              stat_summary(fun.data = mean_cl_boot, geom = "errorbar", color = "black", size = 1, width = 0.1) +
                              #stat_compare_means(comparisons = age.smoker.comparison, method = "t.test") +
                              stat_compare_means(method = "t.test", label.y = 50, label.x = 1.5) +
                              karl_theme +
                              theme(legend.position = "none"))

bmi.region.comparison <- list(c("northeast", "northwest"), c("northeast", "southeast"), c("northeast", "southwest"),
                              c("northwest", "southeast"), c("northwest", "southwest"),
                              c("southeast", "southwest"))
bmi.region.plot <- ggplotly(ggplot(data = medicalcost, aes(x = reorder(region, -bmi, FUN = median), y=bmi)) +
                              geom_violin(aes(fill = region), alpha = 0.5, color = "black") +
                              geom_boxplot(aes(fill = region), alpha = 0.75, color = "black") +
                              geom_sina(aes(fill = region), alpha = 0.5, size = 3, color = "white") +
                              labs(x="Region", y="BMI", title = "BMI Distribution by Region") +
                              scale_fill_manual(values = c("#1D24CA", "#B31312", "#F7C04A", "#539165")) +
                              scale_color_manual(values = c("#1D24CA", "#B31312", "#F7C04A", "#539165")) +
                              stat_summary(fun.data = mean_cl_boot, geom = "point", color = "black", size = 3) +
                              stat_summary(fun.data = mean_cl_boot, geom = "errorbar", color = "black", size = 1, width = 0.1) +
                              #stat_compare_means(comparisons = age.region.comparison, method = "t.test") +
                              stat_compare_means(method = "anova", label.y = 50, label.x = 2) +
                              karl_theme +
                              theme(legend.position = "none"))

bmi.children.comparison <- list(c("0", "1"), c("0", "2"), c("0", "3"), c("0", "4"), c("0", "5"),
                                c("1", "2"), c("1", "3"), c("1", "4"), c("1", "5"),
                                c("2", "3"), c("2", "4"), c("2", "5"),
                                c("3", "4"), c("3", "5"), c("4", "5"))
bmi.children.plot <- ggplotly(ggplot(data = medicalcost, aes(x = reorder(children, -bmi, FUN = median), y=bmi)) +
                                geom_violin(aes(fill = children), alpha = 0.5, color = "black") +
                                geom_boxplot(aes(fill = children), alpha = 0.75, color = "black") +
                                geom_sina(aes(fill = children), alpha = 0.5, size = 3, color = "white") +
                                labs(x="Children", y="BMI", title = "BMI Distribution by Number of Children") +
                                scale_fill_manual(values = c("#1D24CA", "#005EF9", "#0083FF", "#00A0F6", "#00B9D0", "#00D0A2")) +
                                scale_color_manual(values = c("#1D24CA", "#005EF9", "#0083FF", "#00A0F6", "#00B9D0", "#00D0A2")) +
                                stat_summary(fun.data = mean_cl_boot, geom = "point", color = "black", size = 3) +
                                stat_summary(fun.data = mean_cl_boot, geom = "errorbar", color = "black", size = 1, width = 0.1) +
                                #stat_compare_means(comparisons = age.children.comparison, method = "t.test") +
                                stat_compare_means(method = "anova", label.y = 50, label.x = 3) +
                                karl_theme +
                                theme(legend.position = "none"))

bmi.categories <- subplot(
  bmi.sex.plot, bmi.smoker.plot, bmi.region.plot, bmi.children.plot, nrows = 2,
  shareX = FALSE, shareY = FALSE, titleX = TRUE, titleY = TRUE, margin = 0.1
)
bmi.categories <- layout(bmi.categories,
                         title = list(
                           text = "BMI Distribution among Categories",
                           font = list(size = 25, family= "serif", face = "bold", weight = "bold")))
bmi.categories

2.2.3 Medical Cost distribution

Show me the code
charges.sex.comparison <- list(c("male", "female")) 
charges.sex.plot <- ggplotly(ggplot(data = medicalcost, aes(x = reorder(sex, -charges, FUN = median), y=charges)) +
                           geom_violin(aes(fill = sex), alpha = 0.5, color = "black") +
                           geom_boxplot(aes(fill = sex), alpha = 0.75, color = "black") +
                           geom_sina(aes(fill = sex), alpha = 0.5, size = 3, color = "white") +
                           labs(x="Sex", y="Charges ($)", title = "Charges Distribution by Sex") +
                           scale_fill_manual(values = c("#1D24CA", "#B31312")) +
                           scale_color_manual(values = c("#1D24CA", "#B31312")) +
                           stat_summary(fun.data = mean_cl_boot, geom = "point", color = "black", size = 3) +
                           stat_summary(fun.data = mean_cl_boot, geom = "errorbar", color = "black", size = 1, width = 0.1) +
                           #stat_compare_means(comparisons = age.sex.comparison, method = "t.test") +
                           stat_compare_means(method = "t.test", label.y = 60000, label.x = 1.5) +
                           karl_theme +
                           theme(legend.position = "none") +
                             scale_y_continuous(breaks = c(20000, 40000, 60000),
                                                labels = c("$20k", "$40k", "$60k")))

charges.smoker.comparison <- list(c("no", "yes"))
charges.smoker.plot <- ggplotly(ggplot(data = medicalcost, aes(x = reorder(smoker, -charges, FUN = median), y=charges)) +
                              geom_violin(aes(fill = smoker), alpha = 0.5, color = "black") +
                              geom_boxplot(aes(fill = smoker), alpha = 0.75, color = "black") +
                              geom_sina(aes(fill = smoker), alpha = 0.5, size = 3, color = "white") +
                              labs(x="Smoker", y="Charges ($)", title = "Charges Distribution by Smoking Status") +
                              scale_fill_manual(values = c("#1D24CA", "#B31312")) +
                              scale_color_manual(values = c("#1D24CA", "#B31312")) +
                              stat_summary(fun.data = mean_cl_boot, geom = "point", color = "black", size = 3) +
                              stat_summary(fun.data = mean_cl_boot, geom = "errorbar", color = "black", size = 1, width = 0.1) +
                              #stat_compare_means(comparisons = age.smoker.comparison, method = "t.test") +
                              stat_compare_means(method = "t.test", label.y = 60000, label.x = 1.5) +
                              karl_theme +
                              theme(legend.position = "none") +
                                scale_y_continuous(breaks = c(20000, 40000, 60000),
                                                   labels = c("$20k", "$40k", "$60k")))

charges.region.comparison <- list(c("northeast", "northwest"), c("northeast", "southeast"), c("northeast", "southwest"),
                              c("northwest", "southeast"), c("northwest", "southwest"),
                              c("southeast", "southwest"))
charges.region.plot <- ggplotly(ggplot(data = medicalcost, aes(x = reorder(region, -charges, FUN = median), y=charges)) +
                              geom_violin(aes(fill = region), alpha = 0.5, color = "black") +
                              geom_boxplot(aes(fill = region), alpha = 0.75, color = "black") +
                              geom_sina(aes(fill = region), alpha = 0.5, size = 3, color = "white") +
                              labs(x="Region", y="Charges ($)", title = "Charges Distribution by Region") +
                              scale_fill_manual(values = c("#1D24CA", "#B31312", "#F7C04A", "#539165")) +
                              scale_color_manual(values = c("#1D24CA", "#B31312", "#F7C04A", "#539165")) +
                              stat_summary(fun.data = mean_cl_boot, geom = "point", color = "black", size = 3) +
                              stat_summary(fun.data = mean_cl_boot, geom = "errorbar", color = "black", size = 1, width = 0.1) +
                              #stat_compare_means(comparisons = age.region.comparison, method = "t.test") +
                              stat_compare_means(method = "anova", label.y = 60000, label.x = 2.5) +
                              karl_theme +
                              theme(legend.position = "none") +
                                scale_y_continuous(breaks = c(20000, 40000, 60000),
                                                   labels = c("$20k", "$40k", "$60k")))

charges.children.comparison <- list(c("0", "1"), c("0", "2"), c("0", "3"), c("0", "4"), c("0", "5"),
                                c("1", "2"), c("1", "3"), c("1", "4"), c("1", "5"),
                                c("2", "3"), c("2", "4"), c("2", "5"),
                                c("3", "4"), c("3", "5"), c("4", "5"))
charges.children.plot <- ggplotly(ggplot(data = medicalcost, aes(x = reorder(children, -charges, FUN = median), y=charges)) +
                                geom_violin(aes(fill = children), alpha = 0.5, color = "black") +
                                geom_boxplot(aes(fill = children), alpha = 0.75, color = "black") +
                                geom_sina(aes(fill = children), alpha = 0.5, size = 3, color = "white") +
                                labs(x="Children", y="Charges ($)", title = "Charges Distribution by Number of Children") +
                                scale_fill_manual(values = c("#1D24CA", "#005EF9", "#0083FF", "#00A0F6", "#00B9D0", "#00D0A2")) +
                                scale_color_manual(values = c("#1D24CA", "#005EF9", "#0083FF", "#00A0F6", "#00B9D0", "#00D0A2")) +
                                stat_summary(fun.data = mean_cl_boot, geom = "point", color = "black", size = 3) +
                                stat_summary(fun.data = mean_cl_boot, geom = "errorbar", color = "black", size = 1, width = 0.1) +
                                #stat_compare_means(comparisons = age.children.comparison, method = "t.test") +
                                stat_compare_means(method = "anova", label.y = 60000, label.x = 2.5) +
                                karl_theme +
                                theme(legend.position = "none") +
                                  scale_y_continuous(breaks = c(20000, 40000, 60000),
                                                     labels = c("$20k", "$40k", "$60k")))
charges.categories <- subplot(
  charges.sex.plot, charges.smoker.plot, charges.region.plot, charges.children.plot, nrows = 2,
  shareX = FALSE, shareY = FALSE, titleX = TRUE, titleY = TRUE, margin = 0.1
)
charges.categories <- layout(charges.categories,
                         title = list(
                           text = "Charges ($) Distribution among Categories",
                           font = list(size = 25, family= "serif", face = "bold", weight = "bold")))
charges.categories

2.2.4 Age and Medical Charge

Show me the code
age.BMIstatus.comparison <- list(c("Underweight", "Normal"),
                                 c("Underweight", "Overweight"),
                                 c("Underweight", "Obese"),
                                 c("Normal", "Overweight"),
                                 c("Normal", "Obese"),
                                 c("Overweight", "Obese"))
age.bmi.status <- age.BMIstatus.plot <- ggplotly(ggplot(data = BMIstatus, aes(x = reorder(BMI.status, -age, FUN = median), y=age)) +
                                 geom_violin(aes(fill = BMI.status), alpha = 0.5, color = "black") +
                                 geom_boxplot(aes(fill = BMI.status), alpha = 0.75, color = "black") +
                                 geom_sina(aes(fill = BMI.status), alpha = 0.5, size = 3, color = "white") +
                                 labs(x="BMI status", y="Age", title = "Age Distribution by BMI status") +
                                 scale_fill_manual(values = c("#1D24CA", "#539165", "#F7C04A","#B31312")) +
                                 scale_color_manual(values = c("#1D24CA", "#539165", "#F7C04A","#B31312")) +
                                 stat_summary(fun.data = mean_cl_boot, geom = "point", color = "black", size = 3) +
                                 stat_summary(fun.data = mean_cl_boot, geom = "errorbar", color = "black", size = 1, width = 0.1) +
                                 #stat_compare_means(comparisons = age.BMIstatus.comparison) +
                                 stat_compare_means(method = "anova", label.y = 62, label.x = 2) +
                                 karl_theme +
                                 theme(legend.position = "none"))

charge.bmistatus <- charges.BMIstatus.plot <- ggplotly(ggplot(data = BMIstatus, aes(x = reorder(BMI.status, -charges, FUN = median), y=charges)) +
                                 geom_violin(aes(fill = BMI.status), alpha = 0.5, color = "black") +
                                 geom_boxplot(aes(fill = BMI.status), alpha = 0.75, color = "black") +
                                 geom_sina(aes(fill = BMI.status), alpha = 0.5, size = 3, color = "white") +
                                 labs(x="BMI status", y="Charges ($)", title = "Charge Distribution by BMI status") +
                                 scale_fill_manual(values = c("#1D24CA", "#539165", "#F7C04A","#B31312")) +
                                 scale_color_manual(values = c("#1D24CA", "#539165", "#F7C04A","#B31312")) +
                                 stat_summary(fun.data = mean_cl_boot, geom = "point", color = "black", size = 3) +
                                 stat_summary(fun.data = mean_cl_boot, geom = "errorbar", color = "black", size = 1, width = 0.1) +
                                 #stat_compare_means(comparisons = age.BMIstatus.comparison) +
                                 stat_compare_means(method = "anova", label.y = 60000, label.x = 1.5) +
                                 karl_theme +
                                 theme(legend.position = "none") +
                                   scale_y_continuous(breaks = c(20000, 40000, 60000),
                                                      labels = c("$20k", "$40k", "$60k")))
BMIstatus <- subplot(
  age.bmi.status, charge.bmistatus, nrows = 1,
  shareX = FALSE, shareY = FALSE, titleX = TRUE, titleY = TRUE, margin = 0.075
)
BMIstatus <- layout(BMIstatus,
                         title = list(
                           text = "Age and Medical Charges among different BMI classification",
                           font = list(size = 25, family= "serif", face = "bold", weight = "bold")))
BMIstatus


2.3 Continuous vs Continuous Variables

2.3.1 Medical Cost vs Age by smoking status

Show me the code
age.vs.charge.by.smoker <- ggplotly(ggplot(data = medicalcost, aes(x = age, y = charges)) +
                                      geom_point(alpha = 0.7, size = 5, color = "white", aes(fill = smoker)) +
                                      labs(title = "Smoker among the highest charged medically",
                                           x = "Age", y = "Medical Cost") +
                                      scale_fill_manual(values = c("no" = "#1D24CA","yes" = "#B31312")) +
                                      scale_y_continuous(breaks = c(20000, 40000, 60000),
                                                         labels = c("$20k", "$40k", "$60k")) +
                                      karl_theme +
                                      theme(legend.position.inside = c(0, 0)))
age.vs.charge.by.smoker

2.3.2 Medical Cost vs Age by BMI status

Show me the code
BMIstatus <- medicalcost %>% 
  filter(BMI.status != "Unknown")
ggplotly(ggplot(data = BMIstatus, aes(x = age, y = charges)) +
           geom_point(alpha = 0.75, size = 3, color = "white", aes(fill = BMI.status)) +
           labs(title = "Obese among the highest charged medically") +
           scale_fill_manual(values = c("Underweight" = "#1D24CA", 
                                         "Normal" = "#539165", 
                                         "Overweight" = "#F7C04A",
                                         "Obese" = "#B31312")) +
           scale_y_continuous(breaks = c(20000, 40000, 60000),
                                    labels = c("$20k", "$40k", "$60k")) +
           facet_wrap(~ BMI.status) +
           karl_theme)

2.3.3 Medical Cost vs BMI by smoking status

Show me the code
bmi.vs.charge.by.smoker <- ggplotly(ggplot(data = medicalcost, aes(x = bmi, y = charges)) +
                            geom_point(alpha = 0.75, size = 5, color = "white", aes(fill = smoker)) +
                            labs(title = "Smoker among the highest medically charged",
                                 x = "BMI", y = "Charges ($)") +
                            scale_fill_manual(values = c("no" = "#1D24CA","yes" = "#B31312")) +
                            scale_y_continuous(breaks = c(20000, 40000, 60000),
                                               labels = c("$20k", "$40k", "$60k")) +
                            karl_theme)
bmi.vs.charge.by.smoker

3 Medical Cost can be explained by Smoking Status and BMI

3.1 3D Plots

Show me the code
BMIstatus.naremoved <- medicalcost %>% 
  filter(BMI.status != "Unknown") %>% 
  mutate(BMI.status = factor(BMI.status,
                                levels = c("Underweight", "Normal",
                                           "Overweight", "Obese")))

age.bmi.charge2 <- plot_ly(
  data = BMIstatus.naremoved, type = "scatter3d", mode = "markers",
  x = ~age, y = ~bmi, z = ~charges, 
  color = ~smoker, colors = c("no" = "#1D24CA", "yes" = "#B31312"),
  symbol = ~BMI.status, symbols = c("Underweight" = "circle",
                                    "Normal" = "square",
                                    "Overweight" = "diamond",
                                    "Obese" = "cross"),
  marker = list(opacity = 0.5), width = 900, height = 800) %>% 
  layout(
    title = list(
      text = "3D ScatterPlot of Age, BMI, and Charges", 
      font = list(size = 20)),
    scene = list(
      xaxis = list(title = "Age", titlefont = list(size = 10)),
      yaxis = list(title = "BMI", titlefont = list(size = 10)),
      zaxis = list(title = "Charges ($)", titlefont = list(size = 10))),
    legend = list(title = list(text = "Smoker Status", font = list(size = 10))),
    margin = list(l = 10, r = 10, b = 10, t = 40), 
    autosize = TRUE)
age.bmi.charge2

Double-positive for Smoker and Obese among the highest medically charged.