Data Wrangling

In data wrangling, we mostly use functions from the {dplyr} package.

dplyr refresher with palmerpenguins

This short refresher demonstrates select, filter, mutate, group_by, and summarise using the palmerpenguins dataset.

library(tidyverse)
library(palmerpenguins)

# Working dataset for the examples: keep only the listed columns of
# `penguins`, then drop every row that has a missing value in any of them.
penguins_clean <- drop_na(
  select(
    penguins,
    species, island,
    bill_length_mm, bill_depth_mm, flipper_length_mm, body_mass_g,
    sex
  )
)

select — keep specific columns

# Narrow the table to three columns, then show only the first six rows.
slice_head(
  select(penguins_clean, species, island, body_mass_g),
  n = 6
)
# A tibble: 6 × 3
  species island    body_mass_g
  <fct>   <fct>           <int>
1 Adelie  Torgersen        3750
2 Adelie  Torgersen        3800
3 Adelie  Torgersen        3250
4 Adelie  Torgersen        3450
5 Adelie  Torgersen        3650
6 Adelie  Torgersen        3625

filter — keep rows matching conditions

# Keep rows where the species is "Adelie" and body mass exceeds 4000 g,
# then show only the first six matching rows.
slice_head(
  filter(penguins_clean, species == "Adelie" & body_mass_g > 4000),
  n = 6
)
# A tibble: 6 × 7
  species island    bill_length_mm bill_depth_mm flipper_length_mm body_mass_g
  <fct>   <fct>              <dbl>         <dbl>             <int>       <int>
1 Adelie  Torgersen           39.2          19.6               195        4675
2 Adelie  Torgersen           34.6          21.1               198        4400
3 Adelie  Torgersen           42.5          20.7               197        4500
4 Adelie  Torgersen           46            21.5               194        4200
5 Adelie  Dream               39.2          21.1               196        4150
6 Adelie  Dream               39.8          19.1               184        4650
# ℹ 1 more variable: sex <fct>

mutate — add or transform columns

# Append two derived columns and show the first six rows:
#   body_mass_kg - body mass converted from grams to kilograms
#   bill_ratio   - bill length divided by bill depth
slice_head(
  mutate(
    penguins_clean,
    body_mass_kg = body_mass_g / 1000,
    bill_ratio = bill_length_mm / bill_depth_mm
  ),
  n = 6
)
# A tibble: 6 × 9
  species island    bill_length_mm bill_depth_mm flipper_length_mm body_mass_g
  <fct>   <fct>              <dbl>         <dbl>             <int>       <int>
1 Adelie  Torgersen           39.1          18.7               181        3750
2 Adelie  Torgersen           39.5          17.4               186        3800
3 Adelie  Torgersen           40.3          18                 195        3250
4 Adelie  Torgersen           36.7          19.3               193        3450
5 Adelie  Torgersen           39.3          20.6               190        3650
6 Adelie  Torgersen           38.9          17.8               181        3625
# ℹ 3 more variables: sex <fct>, body_mass_kg <dbl>, bill_ratio <dbl>

group_by + summarise — aggregate by groups

# One row per species: group size, mean body mass, and median flipper
# length, sorted so the species with the largest mean body mass comes first.
arrange(
  summarise(
    group_by(penguins_clean, species),
    n = n(),
    mean_body_mass = mean(body_mass_g, na.rm = TRUE),
    median_flipper = median(flipper_length_mm, na.rm = TRUE)
  ),
  desc(mean_body_mass)
)
# A tibble: 3 × 4
  species       n mean_body_mass median_flipper
  <fct>     <int>          <dbl>          <dbl>
1 Gentoo      119          5092.            216
2 Chinstrap    68          3733.            196
3 Adelie      146          3706.            190