Extra data wrangling with dplyr

library(tidyverse)
library(palmerpenguins)

# Example data set: only the columns used below, restricted to complete rows
penguins_clean <- penguins %>%
  select(all_of(c(
    "species", "island",
    "bill_length_mm", "bill_depth_mm", "flipper_length_mm", "body_mass_g",
    "sex"
  ))) %>%
  drop_na()

Using across() and where()

Use across() to apply the same transformation or summary to multiple columns; use where() to select columns by predicate (e.g., is.numeric, is.character).

# 1) mutate with across + where: convert millimetre measurements to centimeters
# Only numeric columns whose name ends in "_mm" are rescaled. body_mass_g is
# numeric too, but it is a mass in grams, so it must not be divided by 10 and
# labelled "_cm".
penguins_clean %>%
  mutate(across(where(is.numeric) & ends_with("_mm"),
                .fns = \(x) x / 10,
                .names = "{.col}_cm")) %>%
  select(species, ends_with("_cm")) %>%
  slice_head(n = 6)
# A tibble: 6 × 4
  species bill_length_mm_cm bill_depth_mm_cm flipper_length_mm_cm
  <fct>               <dbl>            <dbl>                <dbl>
1 Adelie               3.91             1.87                 18.1
2 Adelie               3.95             1.74                 18.6
3 Adelie               4.03             1.8                  19.5
4 Adelie               3.67             1.93                 19.3
5 Adelie               3.93             2.06                 19  
6 Adelie               3.89             1.78                 18.1
# 2) mutate with across on character columns: normalize text
# species, island and sex are factors in penguins_clean, so where(is.character)
# on its own selects no columns and the mutate would be a no-op. Convert the
# factor columns to character first, then title-case every character column.
penguins_clean %>%
  mutate(across(where(is.factor), as.character)) %>%
  mutate(across(where(is.character), \(x) str_to_title(x))) %>%
  slice_head(n = 6)
# A tibble: 6 × 7
  species island    bill_length_mm bill_depth_mm flipper_length_mm body_mass_g
  <chr>   <chr>              <dbl>         <dbl>             <int>       <int>
1 Adelie  Torgersen           39.1          18.7               181        3750
2 Adelie  Torgersen           39.5          17.4               186        3800
3 Adelie  Torgersen           40.3          18                 195        3250
4 Adelie  Torgersen           36.7          19.3               193        3450
5 Adelie  Torgersen           39.3          20.6               190        3650
6 Adelie  Torgersen           38.9          17.8               181        3625
# ℹ 1 more variable: sex <chr>
# 3) summarise with across: per-species mean and sd of every numeric column
# The names of the function list ("mean", "sd") fill the {.fn} slot in .names.
penguins_clean %>%
  group_by(species) %>%
  summarise(
    across(
      where(is.numeric),
      list(
        mean = \(x) mean(x, na.rm = TRUE),
        sd = \(x) sd(x, na.rm = TRUE)
      ),
      .names = "{.col}_{.fn}"
    )
  )
# A tibble: 3 × 9
  species   bill_length_mm_mean bill_length_mm_sd bill_depth_mm_mean
  <fct>                   <dbl>             <dbl>              <dbl>
1 Adelie                   38.8              2.66               18.3
2 Chinstrap                48.8              3.34               18.4
3 Gentoo                   47.6              3.11               15.0
# ℹ 5 more variables: bill_depth_mm_sd <dbl>, flipper_length_mm_mean <dbl>,
#   flipper_length_mm_sd <dbl>, body_mass_g_mean <dbl>, body_mass_g_sd <dbl>
# 4) summarise with across + select helpers: mean of bill measurements
# na.rm is supplied inside an anonymous function: forwarding extra arguments
# through across()'s `...` is deprecated as of dplyr 1.1.0 and raises a warning.
penguins_clean %>%
  group_by(species) %>%
  summarise(across(starts_with("bill_"),
                   \(x) mean(x, na.rm = TRUE),
                   .names = "mean_{.col}"))
# A tibble: 3 × 3
  species   mean_bill_length_mm mean_bill_depth_mm
  <fct>                   <dbl>              <dbl>
1 Adelie                   38.8               18.3
2 Chinstrap                48.8               18.4
3 Gentoo                   47.6               15.0

.names examples

1) When you supply a named list of functions, {.fn} uses the names you provided

# The list names ("avg", "sdev") are what {.fn} expands to in .names,
# e.g. bill_length_mm_avg, bill_length_mm_sdev
penguins_clean %>%
  group_by(species) %>%
  summarise(
    across(
      where(is.numeric),
      list(
        avg = \(x) mean(x, na.rm = TRUE),
        sdev = \(x) sd(x, na.rm = TRUE)
      ),
      .names = "{.col}_{.fn}"
    )
  )
# A tibble: 3 × 9
  species   bill_length_mm_avg bill_length_mm_sdev bill_depth_mm_avg
  <fct>                  <dbl>               <dbl>             <dbl>
1 Adelie                  38.8                2.66              18.3
2 Chinstrap               48.8                3.34              18.4
3 Gentoo                  47.6                3.11              15.0
# ℹ 5 more variables: bill_depth_mm_sdev <dbl>, flipper_length_mm_avg <dbl>,
#   flipper_length_mm_sdev <dbl>, body_mass_g_avg <dbl>, body_mass_g_sdev <dbl>
2) For a single function supplied on its own (not in a named list), {.fn} has no function name to use and falls back to the position index ("1"), so hardcode a label in .names instead
# na.rm is supplied inside an anonymous function: forwarding extra arguments
# through across()'s `...` is deprecated as of dplyr 1.1.0.
penguins_clean %>%
  group_by(species) %>%
  summarise(
    across(
      starts_with("bill_"),
      \(x) mean(x, na.rm = TRUE),
      .names = "mean_{.col}"   # e.g. mean_bill_length_mm
    )
  )
# A tibble: 3 × 3
  species   mean_bill_length_mm mean_bill_depth_mm
  <fct>                   <dbl>              <dbl>
1 Adelie                   38.8               18.3
2 Chinstrap                48.8               18.4
3 Gentoo                   47.6               15.0
3) Use .names to add units/formatting when mutating multiple columns
# Divide both bill columns by 10 and store the results in new columns with a
# "_cm" suffix (e.g. bill_length_mm_cm), leaving the originals in place.
penguins_clean %>%
  mutate(
    across(
      starts_with("bill_"),
      \(x) x / 10,
      .names = "{.col}_cm"
    )
  ) %>%
  select(species, ends_with("_cm")) %>%
  slice_head(n = 6)
# A tibble: 6 × 3
  species bill_length_mm_cm bill_depth_mm_cm
  <fct>               <dbl>            <dbl>
1 Adelie               3.91             1.87
2 Adelie               3.95             1.74
3 Adelie               4.03             1.8 
4 Adelie               3.67             1.93
5 Adelie               3.93             2.06
6 Adelie               3.89             1.78

Notes
- across() replaced many uses of mutate_at()/summarise_at() and is the recommended tidyverse approach.
- where() is handy when you want to target all numeric or character columns without naming them explicitly.
- Use .names in across() to control output column naming and avoid collisions.