source(here::here("scripts/load.R"))
Relationship Status and the Importance of Friends
Before we start, let’s execute a helper script that loads the necessary dependencies.
Overview
This document contains example 2, in which we analyze the association between relationship status and the importance people assign to friends. Data can be retrieved from the OSF but is also included in the downloadable replication package.
It’s a common complaint that people who enter a relationship start to neglect their friends. Here, we are going to use this to motivate an associational research question: Do people in romantic relationships, on average, assign less importance to their friends? To address this question, we analyze data that were collected in the context of a diary study on satisfaction with various aspects of life (Rohrer et al., 2024). In this study, 482 people reported whether they were in a romantic relationship of any kind (partner) and also the extent to which they considered their friendships important (friendship_importance) on a scale from 1 (not important at all) to 5 (very important).
Read and clean the data
# Read the data
<- read.csv(here("data/start.csv"))
dat
# Restrict to range that we can model well
<- dat[dat$sex != 3, ] # exclude 5 people who reported a gender distinct from male/female
dat <- dat[dat$age < 60, ] # exclude people over the age of 60
dat
# Limit to complete cases
<- dat[complete.cases(dat[, c("age", "sex", "partner_any", "IMP_friends_Start")]),]
dat
$sex <- as.factor(dat$sex)
dat
# Look at the variables
table(dat$age)
18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59
15 28 29 20 15 18 13 16 16 12 4 14 8 3 6 6 13 18 11 9 12 13 6 7 11 6 5 3 11 12 12 7 15 13 14 7 5 11 9 11 10 8
table(dat$sex)
1 2
347 135
table(dat$partner_any)
0 1
203 279
# Rename some variables
<- dat |>
dat mutate(sex = factor(sex, levels = c(1, 2), labels = c("female", "male"))) |>
rename(partner = partner_any,
gender = sex,
friendship_importance = IMP_friends_Start)
Visualize associations with age
This code produces Figure 4: Predicted importance of friends by age in three different simple linear models that only include age as a predictor, including (1) age as a linear predictor or (2) age as a categorical predictor, or (3) age splines (B-splines with four degrees of freedom).
Respondents’ age varies from 18 to 59 years. How do we best include this variable in our analysis? If we simply include it as a linear predictor, we assume that friendship_importance changes with age in a linear manner. If, instead, we include it as a categorical predictor (treating each year of age as its own category), we do not impose any assumptions about the functional form—but for some years, we only have few observations, resulting in a trajectory that jumps around a lot, with wide confidence intervals.
So, we may prefer a solution that lies somewhere between these two options. Contenders may be using coarser age categories or using polynomials. A third alternative is splines. These result in flexible, locally smooth trajectories. Unlike polynomials, splines enforce no global functional forms; unlike age categories, splines do not result in abrupt jumps in the trajectory.
# Linear association
<- lm(friendship_importance ~ age, data = dat)
age_lin # Each year of age as its own category
<- lm(friendship_importance ~ as.factor(age), data = dat)
age_cat # Smoothed with splines
<- lm(friendship_importance ~ bs(age, df = 4), data = dat)
age_smooth
# Extract predictions from each of the three models
<- avg_predictions(age_lin, by = "age")
pred_lin <- avg_predictions(age_cat, by = "age")
pred_cat <- avg_predictions(age_smooth, by = "age")
pred_smooth
# The following code generates a grid that expands the categorical predictions
# so that we can plot them as a step function including a step ribbon
# (This is just to get the visuals of Figure 4 right and nothing substantive)
<- data.frame(matrix(NA, nrow = length(rep(pred_cat$estimate, each = 100)), ncol = 4))
pred_cat_expanded names(pred_cat_expanded) <- c("age", "estimate", "conf.low", "conf.high")
$age <- seq(from = min(pred_cat$age), to = (max(pred_cat$age) + 0.999999), length.out = nrow(pred_cat_expanded))
pred_cat_expanded$estimate <- rep(pred_cat$estimate, each = 100)
pred_cat_expanded$conf.low <- rep(pred_cat$conf.low, each = 100)
pred_cat_expanded$conf.high <- rep(pred_cat$conf.high, each = 100)
pred_cat_expanded
# Color scheme for Figure 4
<- "#E69F00"
col_cat <- "#D55E00"
col_lin <- "#CC79A7"
col_smooth
# Generate the plot
ggplot() +
# categorical
geom_line(data = pred_cat_expanded, aes(x = age, y = estimate, group = floor(age)), color = col_cat) +
geom_ribbon(data = pred_cat_expanded, aes(x = age, ymin = conf.low, ymax = conf.high), alpha = .2, fill = col_cat) +
# linear
geom_line(data = pred_lin, aes(x = age, y = estimate), color = col_lin) +
geom_ribbon(data = pred_lin, aes(x = age, ymin = conf.low, ymax = conf.high), alpha = .2, fill = col_lin) +
# smoothed
geom_line(data = pred_smooth, aes(x = age, y = estimate), color = col_smooth) +
geom_ribbon(data = pred_smooth, aes(x = age, ymin = conf.low, ymax = conf.high), alpha = .2, fill = col_smooth) +
coord_cartesian(ylim = c(2.5, 5)) +
xlab("Age") +
ylab("Importance of friends (95% CI)")
# ggsave(here("plots/age.png"), width = 4, height = 3)
Model fitting and interpretation with marginaleffects
<- lm(friendship_importance ~ bs(age, df = 4) + gender + partner +
mod bs(age, df = 4):gender + partner:gender + bs(age, df = 4):partner, data = dat)
# Predictions for a 20 year old single
predictions(mod, newdata = datagrid(age = 20, gender = "male", partner = 0))
age | gender | partner | Estimate | Std. Error | z | Pr(>|z|) | S | 2.5 % | 97.5 % |
---|---|---|---|---|---|---|---|---|---|
20 | male | 0 | 4.66 | 0.243 | 19.2 | <0.001 | 269.4 | 4.18 | 5.13 |
# Slope of age for a 20 year old single
slopes(mod, variables = "age", newdata = datagrid(age = 20, gender = "male", partner = 0))
age | gender | partner | Estimate | Std. Error | z | Pr(>|z|) | S | 2.5 % | 97.5 % |
---|---|---|---|---|---|---|---|---|---|
20 | male | 0 | 0.00329 | 0.104 | 0.0318 | 0.975 | 0.0 | -0.2 | 0.207 |
# Predictions for everybody in the data
predictions(mod)
Estimate | Std. Error | z | Pr(>|z|) | S | 2.5 % | 97.5 % |
---|---|---|---|---|---|---|
4.46 | 0.1485 | 30.0 | <0.001 | 654.5 | 4.17 | 4.75 |
4.58 | 0.1362 | 33.6 | <0.001 | 819.9 | 4.31 | 4.85 |
4.45 | 0.1131 | 39.3 | <0.001 | Inf | 4.23 | 4.67 |
4.41 | 0.0994 | 44.4 | <0.001 | Inf | 4.22 | 4.61 |
4.52 | 0.1447 | 31.2 | <0.001 | 709.7 | 4.24 | 4.81 |
4.66 | 0.1341 | 34.8 | <0.001 | 876.7 | 4.40 | 4.92 |
4.63 | 0.1872 | 24.7 | <0.001 | 446.4 | 4.26 | 5.00 |
4.46 | 0.1480 | 30.1 | <0.001 | 659.1 | 4.17 | 4.75 |
4.46 | 0.1480 | 30.1 | <0.001 | 659.1 | 4.17 | 4.75 |
4.67 | 0.1104 | 42.2 | <0.001 | Inf | 4.45 | 4.88 |
# Main result: Difference in friendship importance with and without partner, holding constant age and gender
avg_comparisons(mod, variables = "partner")
Estimate | Std. Error | z | Pr(>|z|) | S | 2.5 % | 97.5 % |
---|---|---|---|---|---|---|
-0.072 | 0.0804 | -0.896 | 0.37 | 1.4 | -0.23 | 0.0856 |
# Generate separate estimates of the difference, by gender
avg_comparisons(mod, variables = "partner", by = "gender")
gender | Estimate | Std. Error | z | Pr(>|z|) | S | 2.5 % | 97.5 % |
---|---|---|---|---|---|---|---|
female | -0.0738 | 0.0946 | -0.780 | 0.436 | 1.2 | -0.259 | 0.112 |
male | -0.0675 | 0.1529 | -0.442 | 0.659 | 0.6 | -0.367 | 0.232 |
# Compare these gender-specific estimates
avg_comparisons(mod, variables = "partner", by = "gender", hypothesis = "b2 - b1 = 0")
Hypothesis | Estimate | Std. Error | z | Pr(>|z|) | S | 2.5 % | 97.5 % |
---|---|---|---|---|---|---|---|
b2-b1=0 | 0.00625 | 0.18 | 0.0348 | 0.972 | 0.0 | -0.346 | 0.359 |
# For the sake of completeness, we may also generate age-specific estimates
# Note that the wiggliness of this line will depend on how we modeled age
<- avg_comparisons(mod, variables = "partner", by = "age")
comp
ggplot(comp, aes(x = age, y = estimate, ymin = conf.low, ymax = conf.high)) +
geom_line() +
geom_ribbon(alpha = .1) +
geom_hline(yintercept = 0) +
ylab("Association partner and friendship importance")
Ordinal robustness check
So far, we have simply conducted linear regressions, but that may be considered suspect given the nature of the outcome: it’s just a five-point response scale ranging from not important at all to very important. And, in fact, barely anybody used the lower response options—more than 40% picked the highest response option. This results in a distribution for which the assumptions of linear regression may be considered unrealistic.
So, let’s run an ordinal regression to see whether conclusions change.
Here, we are going to fit a cumulative ordinal model with a probit link using the brms package (Bürkner, 2018). In essence, this approach assumes a continuous, normally distributed standardized latent variable (“true” friendship_importance) which is translated into the ordinal response variable following thresholds that are estimated from the data; for example, people who score -3 or less on the standardized latent variable may report that their friends are not important at all, people who score more than that but below -2.7 may pick the second lowest response option, and so on (see Bürkner & Vuorre, 2019 for a proper introduction to these models).
The rest of the model specification remains unchanged:
We can evaluate the association of interest using the same average comparison as before:
avg_comparisons(mod_ord, variables = "partner")
Group | Estimate | 2.5 % | 97.5 % |
---|---|---|---|
1 | 0.00132 | -0.00798 | 0.0115 |
2 | 0.00338 | -0.00757 | 0.0156 |
3 | 0.02108 | -0.01542 | 0.0597 |
4 | 0.02925 | -0.00152 | 0.0613 |
5 | -0.05541 | -0.13218 | 0.0181 |
The resulting output, however, differs. By default, the output shows how the probability of any response category of friendship_importance when partner changes from 0 to 1. The response categories 1 to 4 become slightly more likely with partner = 1, whereas the probability of giving the highest rating, 5, decreases by 5.5 percentage points.
We can also compute the comparison on the assumed underlying latent variable, which is the scale on which the model coefficients are reported in the regression output:
avg_comparisons(mod_ord, variables = "partner", type = "link")
Estimate | 2.5 % | 97.5 % |
---|---|---|
-0.148 | -0.364 | 0.057 |
And, as promised in the main text, we can combine these category-specific estimates on the averaged scale.
# Take the response probabilities and attach the integers 1:5 to the consecutive
# response categories to arrive at the same metric as the linear model
avg_comparisons(mod_ord, variables = "partner", hypothesis = ~ I(sum(x * 1:5)))
Estimate | 2.5 % | 97.5 % |
---|---|---|
-0.0883 | -0.244 | 0.062 |
While this ensures that we ask the ordinal model precisely the same answer that we asked the linear model, it may appear a bit inconsistent given that we usually use ordinal models precisely because we don’t want to assign integer values to the response categories – although note that here we do it only in the very last step, so the model itself does not assume that the distances between adjacent response categories are the same.
Frequentist ordinal robustness check
# Outcome as factor variable which is what clm() expects
$friendship_importance_factor <- factor(dat$friendship_importance, ordered = TRUE)
dat
# Fit the model
<- clm(friendship_importance_factor ~ bs(age, df = 4) + gender + partner +
mod_ord bs(age, df = 4):gender + partner:gender + bs(age, df = 4):partner,
data = dat,
link = "probit")
# Evaluate central comparison in terms of change in response probabilities
avg_comparisons(mod_ord, variables = "partner")
Group | Estimate | Std. Error | z | Pr(>|z|) | S | 2.5 % | 97.5 % |
---|---|---|---|---|---|---|---|
1 | 0.00170 | 0.00436 | 0.390 | 0.6963 | 0.5 | -0.00685 | 0.0103 |
2 | 0.00353 | 0.00555 | 0.636 | 0.5247 | 0.9 | -0.00735 | 0.0144 |
3 | 0.02129 | 0.01963 | 1.085 | 0.2780 | 1.8 | -0.01718 | 0.0598 |
4 | 0.02896 | 0.01641 | 1.765 | 0.0776 | 3.7 | -0.00320 | 0.0611 |
5 | -0.05549 | 0.03847 | -1.442 | 0.1492 | 2.7 | -0.13089 | 0.0199 |
# Evaluate effect on the underlying latent variable
avg_comparisons(mod_ord, variables = "partner", type = "linear.predictor")
Group | Estimate | Std. Error | z | Pr(>|z|) | S | 2.5 % | 97.5 % |
---|---|---|---|---|---|---|---|
1 | 0.147 | 0.107 | 1.38 | 0.169 | 2.6 | -0.0623 | 0.355 |
2 | 0.147 | 0.107 | 1.38 | 0.169 | 2.6 | -0.0623 | 0.355 |
3 | 0.147 | 0.107 | 1.38 | 0.169 | 2.6 | -0.0623 | 0.355 |
4 | 0.147 | 0.107 | 1.38 | 0.169 | 2.6 | -0.0623 | 0.355 |
5 | 0.147 | 0.107 | 1.37 | 0.170 | 2.6 | -0.0627 | 0.356 |
# Note that this estimate has the wrong sign
# Likely to an upstream bug in clm()
# Manually reconstruct
$friendship_importance_factor <- factor(dat$friendship_importance, ordered = TRUE)
dat<- clm(friendship_importance_factor ~ bs(age, df = 4) + gender + partner +
mod_ord bs(age, df = 4):gender + partner:gender + bs(age, df = 4):partner,
data = dat,
link = "probit")
<- transform(dat, partner = 0)
d0 <- transform(dat, partner = 1)
d1 <- predict(mod_ord, newdata = d0, type = "linear.predictor")
p0 <- predict(mod_ord, newdata = d1, type = "linear.predictor")
p1 mean(p1$eta1 - p0$eta1)
[1] 0.1465469
# If we do this manually, we also get the wrong sign
# So something seems wrong about the predictions that clm returns
Varying the complexity of the linear model to see what happens
Let’s vary model complexity to see what happens to our target quantity, the association between having a partner and friendship importance (holding constant age and gender).
Age variations
# No age in the model
<- lm(friendship_importance ~ gender + partner +
mod_age_0 :gender, data = dat)
partner<- avg_comparisons(mod_age_0, variables = "partner")
comp_age_0 $model <- "Not included"
comp_age_0
# Linear age in the model
<- lm(friendship_importance ~ age + gender + partner +
mod_age_1 :gender + partner:gender + age:partner, data = dat)
age<- avg_comparisons(mod_age_1, variables = "partner")
comp_age_1 $model <- "Linear"
comp_age_1
# Quadratic age on top
<- lm(friendship_importance ~ poly(age, 2) + gender + partner +
mod_age_2 poly(age, 2):gender + partner:gender + poly(age, 2):partner, data = dat)
<- avg_comparisons(mod_age_2, variables = "partner")
comp_age_2 $model <- "Quadratic"
comp_age_2
# Cubic age on top
<- lm(friendship_importance ~ poly(age, 3) + gender + partner +
mod_age_3 poly(age, 3):gender + partner:gender + poly(age, 3):partner, data = dat)
<- avg_comparisons(mod_age_3, variables = "partner")
comp_age_3 $model <- "Cubic"
comp_age_3
# Quartic age on tope
<- lm(friendship_importance ~ poly(age, 4) + gender + partner +
mod_age_4 poly(age, 4):gender + partner:gender + poly(age, 4):partner, data = dat)
<- avg_comparisons(mod_age_4, variables = "partner")
comp_age_4 $model <- "Quartic"
comp_age_4
# Quintic age on tope
<- lm(friendship_importance ~ poly(age, 5) + gender + partner +
mod_age_5 poly(age, 5):gender + partner:gender + poly(age, 5):partner, data = dat)
<- avg_comparisons(mod_age_4, variables = "partner")
comp_age_5 $model <- "Quintic"
comp_age_5
# Age with splines, df = 3
<- lm(friendship_importance ~ bs(age, df = 3) + gender + partner +
mod_age_6 bs(age, df = 3):gender + partner:gender + bs(age, df = 3):partner, data = dat)
<- avg_comparisons(mod_age_6, variables = "partner")
comp_age_6 $model <- "Splines, df = 3"
comp_age_6
# Splines, df = 4 (same as model reported in main text)
<- lm(friendship_importance ~ bs(age, df = 4) + gender + partner +
mod_age_7 bs(age, df = 3):gender + partner:gender + bs(age, df = 3):partner, data = dat)
<- avg_comparisons(mod_age_7, variables = "partner")
comp_age_7 $model <- "Splines, df = 4"
comp_age_7
# Splines, df = 5
<- lm(friendship_importance ~ bs(age, df = 5) + gender + partner +
mod_age_8 bs(age, df = 5):gender + partner:gender + bs(age, df = 5):partner, data = dat)
<- avg_comparisons(mod_age_8, variables = "partner")
comp_age_8 $model <- "Splines, df = 5"
comp_age_8
# Splines, df = 6
<- lm(friendship_importance ~ bs(age, df = 6) + gender + partner +
mod_age_9 bs(age, df = 6):gender + partner:gender + bs(age, df = 6):partner, data = dat)
<- avg_comparisons(mod_age_8, variables = "partner")
comp_age_9 $model <- "Splines, df = 6"
comp_age_9
# Put all the comparisons from the different model
# into one dataframe for plotting purposes
<- rbind(data.frame(comp_age_0), data.frame(comp_age_1),
comp_age data.frame(comp_age_2), data.frame(comp_age_3),
data.frame(comp_age_4), data.frame(comp_age_5),
data.frame(comp_age_6), data.frame(comp_age_7),
data.frame(comp_age_8), data.frame(comp_age_9))
# Numbering (for plotting purposes)
$no <- 1:nrow(comp_age)
comp_age
# Generate the plot
ggplot(comp_age, aes(x = no, y = estimate, ymin = conf.low, ymax = conf.high)) +
geom_point() +
geom_errorbar() +
geom_hline(yintercept = 0) +
coord_cartesian(ylim = c(-0.4, 0.4)) +
scale_x_continuous(breaks = comp_age$no, labels = comp_age$model) +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
xlab("Way that age is included in model") +
ylab("Target quantity (95% CI)")
# ggsave(here("plots/age_robustness.png"), width = 4, height = 3)
Interaction variations
Let’s see whether the inclusion of interactions makes a difference
# No interactions
<- lm(friendship_importance ~ bs(age, df = 4) + gender + partner, data = dat)
mod_interact_1 <- avg_comparisons(mod_interact_1, variables = "partner")
comp_interact_1 $model <- "No interaction"
comp_interact_1
# Add two-way interactions (same as model reported in main text)
<- lm(friendship_importance ~ bs(age, df = 4) + gender + partner +
mod_interact_2 bs(age, df = 4):gender + partner:gender + bs(age, df = 4):partner, data = dat)
<- avg_comparisons(mod_interact_2, variables = "partner")
comp_interact_2 $model <- "Two-way interactions"
comp_interact_2
# Add three-way interaction
<- lm(friendship_importance ~ bs(age, df = 4)*gender*partner, data = dat)
mod_interact_3 <- avg_comparisons(mod_interact_3, variables = "partner")
comp_interact_3 $model <- "Three-way interaction"
comp_interact_3
# Put all the comparisons from the different model
# into one dataframe for plotting purposes
<- rbind(data.frame(comp_interact_1), data.frame(comp_interact_2), data.frame(comp_interact_3))
comp_interact
# Numbering (for plotting purposes)
$no <- 1:nrow(comp_interact)
comp_interact
# Plot the results
ggplot(comp_interact, aes(x = no, y = estimate, ymin = conf.low, ymax = conf.high)) +
geom_point() +
geom_errorbar() +
geom_hline(yintercept = 0) +
coord_cartesian(ylim = c(-0.4, 0.4)) +
scale_x_continuous(breaks = comp_interact$no, labels = comp_interact$model) +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
xlab("Interactions included") +
ylab("Estimated difference in friendship importance")