13 Multivariate Regression

13.1 Libraries

library(here)
library(tidyverse)
library(SLINGtools)
library(broom)
library(ggrepel)

13.2 Import Datasets

d_meta <- read_csv(here("data/ISLS10_Part2_metadata.csv"))
d_wide <- read_csv(here("data/ISLS10_Part2_lipidomics_curated.csv"))

13.3 Prepare Data

# Convert to long format
d_long <- d_wide |>
  pivot_longer(cols = -ID, names_to = "Compound" , values_to = "Conc")

# Combine data and metadata
d_full <- d_meta |> left_join(d_long, by = "ID")

# log-transform and scale (z-score) data
d_full <- d_full |>
  group_by(Compound) |>
  mutate(Conc_log = log2(Conc),
         Conc_logz = as.numeric(scale(Conc_log)))

13.4 Run regression for each lipid

In this example a logistic regression is used. The output of glm() is converted to a tidy table using the broom::tidy() function.

model <- as.formula("DM ~ Age + Gender + BMI + HbA1c + 
                    SBP + HDL + LDL + TG + Conc_logz")

d_res <- d_full %>%
  group_by(Compound) %>%
  nest() %>%
  mutate(
    models = map(data, function(x) glm(model, data = x, family = "binomial")), 
    tidy = map(models, function(x) broom::tidy(x))) |> 
  unnest(tidy) |> 
  dplyr::select(-data, -models)

The results contain the combined estimates, errors, and P values for each term for each lipid species.

Compound	term	estimate	std.error	statistic	p.value
Cer d16:1/16:0	(Intercept)	-20.955219	1.886056	-11.110601	0.000000
Cer d16:1/16:0	Age	0.032885	0.010896	3.017987	0.002545
Cer d16:1/16:0	Gender	0.593323	0.217179	2.731950	0.006296
Cer d16:1/16:0	BMI	0.129663	0.027600	4.698024	0.000003
Cer d16:1/16:0	HbA1c	2.041818	0.281859	7.244102	0.000000
Cer d16:1/16:0	SBP	0.017569	0.005370	3.271562	0.001070
Cer d16:1/16:0	HDL	-0.725890	0.368385	-1.970465	0.048785
Cer d16:1/16:0	LDL	-0.247450	0.128041	-1.932579	0.053288
Cer d16:1/16:0	TG	0.402731	0.142030	2.835530	0.004575
Cer d16:1/16:0	Conc_logz	0.197851	0.113746	1.739413	0.081962

To get the effects and P values for the lipids we filter for the term Conc_logz. We futhermore get the adjusted P values (FDR).

d_res_lipids <- d_res |> 
   filter(term == "Conc_logz") |> 
   mutate(FDR = p.adjust(p.value, method = "fdr")) |> 
   arrange(FDR)

Compound	term	estimate	std.error	statistic	p.value	FDR
SM d16:1/18:0	Conc_logz	0.408832	0.119650	3.416896	0.000633	0.000633
SM d18:1/18:0	Conc_logz	0.325500	0.105045	3.098661	0.001944	0.001944
Cer d18:0/18:0	Conc_logz	0.427791	0.139298	3.071052	0.002133	0.002133
Hex1Cer d18:2/25:0	Conc_logz	-0.255110	0.092979	-2.743742	0.006074	0.006074
Cer d18:1/18:0	Conc_logz	0.316480	0.119278	2.653288	0.007971	0.007971
SM d18:1/20:0	Conc_logz	0.282201	0.106676	2.645399	0.008159	0.008159

13.5 Forest Plot

Prepare data for the plot: get lipid class from lipid names, join it to the data frame, calculate FDR, get upper/lower errors, and set “significant specie”

# lipid annotation
d_lipid_annot <- d_res_lipids |>
  select(Compound) |>  
  separate(Compound, 
           into = c("lipid_class", "fa_chain"), 
           remove = FALSE, 
           extra = "drop",
           fill = "right",  
           sep = "/" )

d_plot <- d_res_lipids |> 
  full_join(d_lipid_annot, by = "Compound") |> 
  mutate(
    est_low = estimate - std.error,
    est_high = estimate + std.error, 
    signif = FDR < 0.05,
    label = if_else(signif, fa_chain, "")) |> 
  arrange(signif)

Plot estimates of all lipid species, grouped by lipid class, highlighting significant species in red and label them by the FA chain.

ggplot(d_plot, aes(x = lipid_class, y = estimate,color = signif, label = label)) +
  geom_hline(yintercept = 0) +
  geom_pointrange(aes(ymin = est_low, ymax = est_high, alpha = signif),
                  size = 0.5,
                  position = position_jitter(width = .3, height = 0)) +
  coord_flip() +
  scale_color_manual(values = c("FALSE" = "grey70", "TRUE" = "red")) +
  scale_alpha_manual(values = c("FALSE" = 0.3, "TRUE" = 1)) +
  ggrepel::geom_text_repel(
    aes(y = estimate),
    size = 2,
    max.overlaps = 5,
    point.padding = .7
  ) +
  theme_bw() + 
  theme(legend.position="none")

Figure 1. Forest plot of logistic regression (DM ~ Age + Gender + BMI + HbA1c + SBP + HDL + LDL + TG + Lipid). Species with FDR > 0.05 are highlighted in red.