library(tidyverse)
library(palmerpenguins)
# minimal clean dataset for examples
# Build the example dataset: keep the identifier and measurement columns used
# in the examples, then drop every row with a missing value in any of them.
penguins_clean <- penguins %>%
  select(
    species, island, bill_length_mm, bill_depth_mm,
    flipper_length_mm, body_mass_g, sex
  ) %>%
  drop_na()

# Extra data wrangling with dplyr
Using across() and where()
Use across() to apply the same transformation or summary to multiple columns; use where() to select columns by predicate (e.g., is.numeric, is.character).
# 1) mutate with across + where: divide every numeric column by 10 and store
#    the result in a new column suffixed "_cm" (mm -> cm for the length columns)
# NOTE(review): where(is.numeric) also matches body_mass_g (a mass, per its
# name), so the result contains a mislabelled body_mass_g_cm column. To convert
# only the millimetre columns, select with
# `where(is.numeric) & ends_with("_mm")` instead.
penguins_clean %>%
mutate(across(where(is.numeric), .fns = \(x){x / 10}, .names = "{.col}_cm")) %>%
select(species, ends_with("_cm")) %>%
slice_head(n = 6)# A tibble: 6 × 5
species bill_length_mm_cm bill_depth_mm_cm flipper_length_mm_cm body_mass_g_cm
<fct> <dbl> <dbl> <dbl> <dbl>
1 Adelie 3.91 1.87 18.1 375
2 Adelie 3.95 1.74 18.6 380
3 Adelie 4.03 1.8 19.5 325
4 Adelie 3.67 1.93 19.3 345
5 Adelie 3.93 2.06 19 365
6 Adelie 3.89 1.78 18.1 362.
# 2) mutate with across on text columns: normalize labels to title case.
# penguins_clean stores species, island and sex as factors, so
# where(is.character) on its own selects nothing and the step silently does
# nothing. Handle both kinds of text column: character vectors are transformed
# directly, factors have their levels relabelled so they remain factors.
penguins_clean %>%
  mutate(
    across(where(is.character), str_to_title),
    across(where(is.factor), \(x) fct_relabel(x, str_to_title))
  ) %>%
  slice_head(n = 6)
# A tibble: 6 × 7
species island bill_length_mm bill_depth_mm flipper_length_mm body_mass_g
<fct> <fct> <dbl> <dbl> <int> <int>
1 Adelie Torgersen 39.1 18.7 181 3750
2 Adelie Torgersen 39.5 17.4 186 3800
3 Adelie Torgersen 40.3 18 195 3250
4 Adelie Torgersen 36.7 19.3 193 3450
5 Adelie Torgersen 39.3 20.6 190 3650
6 Adelie Torgersen 38.9 17.8 181 3625
# ℹ 1 more variable: sex <fct>
# 3) summarise with across: per-species mean and standard deviation of every
#    numeric column; each result is named <column>_<function label>
penguins_clean %>%
  group_by(species) %>%
  summarise(
    across(
      where(is.numeric),
      list(
        mean = \(x) mean(x, na.rm = TRUE),
        sd = \(x) sd(x, na.rm = TRUE)
      ),
      .names = "{.col}_{.fn}"
    )
  )
# A tibble: 3 × 9
species bill_length_mm_mean bill_length_mm_sd bill_depth_mm_mean
<fct> <dbl> <dbl> <dbl>
1 Adelie 38.8 2.66 18.3
2 Chinstrap 48.8 3.34 18.4
3 Gentoo 47.6 3.11 15.0
# ℹ 5 more variables: bill_depth_mm_sd <dbl>, flipper_length_mm_mean <dbl>,
# flipper_length_mm_sd <dbl>, body_mass_g_mean <dbl>, body_mass_g_sd <dbl>
# 4) summarise with across + select helpers: mean of bill measurements
# NOTE: forwarding `na.rm = TRUE` through across()'s `...` is deprecated as of
# dplyr 1.1.0 and produces the warning shown below. The supported form is
#   across(starts_with("bill_"), \(x) mean(x, na.rm = TRUE), .names = "mean_{.col}")
penguins_clean %>%
group_by(species) %>%
summarise(across(starts_with("bill_"), mean, na.rm = TRUE, .names = "mean_{.col}"))Warning: There was 1 warning in `summarise()`.
ℹ In argument: `across(starts_with("bill_"), mean, na.rm = TRUE, .names =
"mean_{.col}")`.
ℹ In group 1: `species = Adelie`.
Caused by warning:
! The `...` argument of `across()` is deprecated as of dplyr 1.1.0.
Supply arguments directly to `.fns` through an anonymous function instead.
# Previously
across(a:b, mean, na.rm = TRUE)
# Now
across(a:b, \(x) mean(x, na.rm = TRUE))
# A tibble: 3 × 3
species mean_bill_length_mm mean_bill_depth_mm
<fct> <dbl> <dbl>
1 Adelie 38.8 18.3
2 Chinstrap 48.8 18.4
3 Gentoo 47.6 15.0
.names examples
- When you supply a named list of functions, {.fn} expands to the names you gave the list elements
# Per-species summary using custom labels for the functions.
penguins_clean %>%
  group_by(species) %>%
  summarise(
    across(
      where(is.numeric),
      list(
        avg = \(x) mean(x, na.rm = TRUE),
        sdev = \(x) sd(x, na.rm = TRUE)
      ),
      # {.fn} expands to the list names, e.g. bill_length_mm_avg,
      # bill_length_mm_sdev
      .names = "{.col}_{.fn}"
    )
  )
# A tibble: 3 × 9
species bill_length_mm_avg bill_length_mm_sdev bill_depth_mm_avg
<fct> <dbl> <dbl> <dbl>
1 Adelie 38.8 2.66 18.3
2 Chinstrap 48.8 3.34 18.4
3 Gentoo 47.6 3.11 15.0
# ℹ 5 more variables: bill_depth_mm_sdev <dbl>, flipper_length_mm_avg <dbl>,
# flipper_length_mm_sdev <dbl>, body_mass_g_avg <dbl>, body_mass_g_sdev <dbl>
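- For a single function, the default is .names = "{.col}", so results keep the original column names. {.fn} is only useful with a named list of functions (for an unnamed function it expands to its position, e.g. 1), so hardcode a label in .names instead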
# Mean of each bill measurement per species, with a hardcoded "mean_" prefix.
# na.rm is set inside an anonymous function because forwarding extra arguments
# through across()'s `...` is deprecated as of dplyr 1.1.0.
penguins_clean %>%
  group_by(species) %>%
  summarise(
    across(
      starts_with("bill_"),
      \(x) mean(x, na.rm = TRUE),
      .names = "mean_{.col}" # e.g. mean_bill_length_mm
    )
  )
# A tibble: 3 × 3
species mean_bill_length_mm mean_bill_depth_mm
<fct> <dbl> <dbl>
1 Adelie 38.8 18.3
2 Chinstrap 48.8 18.4
3 Gentoo 47.6 15.0
- Use .names to add units/formatting when mutating multiple columns
# Create a value / 10 copy of each bill_ column, named with a "_cm" suffix
# (e.g. bill_length_mm_cm), then show only species and the new columns.
penguins_clean %>%
  mutate(
    across(
      starts_with("bill_"),
      \(x) x / 10,
      .names = "{.col}_cm"
    )
  ) %>%
  select(species, ends_with("_cm")) %>%
  slice_head(n = 6)
# A tibble: 6 × 3
species bill_length_mm_cm bill_depth_mm_cm
<fct> <dbl> <dbl>
1 Adelie 3.91 1.87
2 Adelie 3.95 1.74
3 Adelie 4.03 1.8
4 Adelie 3.67 1.93
5 Adelie 3.93 2.06
6 Adelie 3.89 1.78
Notes
- across() supersedes many uses of mutate_at()/summarise_at() and is the recommended tidyverse approach.
- where() is handy when you want to target all numeric or character columns without naming them explicitly.
- Use .names in across() to control output column naming and avoid collisions.