library(tidyverse)
library(palmerpenguins)
# Minimal clean dataset for the examples: keep seven columns of `penguins`
# (species, island, the four body measurements, and sex), then drop every
# row that has a missing value in any of the kept columns.
penguins_clean <- penguins %>%
  select(
    species, island,
    bill_length_mm, bill_depth_mm, flipper_length_mm, body_mass_g,
    sex
  ) %>%
  drop_na()
Data Wrangling
In data wrangling, we’ll mostly be using functions from the {dplyr} package.
dplyr refresher with palmerpenguins
This short refresher demonstrates select, filter, mutate, group_by, and summarise using the palmerpenguins dataset.
select — keep specific columns
# Keep only the species, island, and body mass columns,
# then show the first six rows.
penguins_clean %>%
  select(
    species,
    island,
    body_mass_g
  ) %>%
  slice_head(n = 6)
# A tibble: 6 × 3
species island body_mass_g
<fct> <fct> <int>
1 Adelie Torgersen 3750
2 Adelie Torgersen 3800
3 Adelie Torgersen 3250
4 Adelie Torgersen 3450
5 Adelie Torgersen 3650
6 Adelie Torgersen 3625
filter — keep rows matching conditions
# Keep Adelie penguins whose body mass exceeds 4000 g. Both conditions must
# hold, so they are combined with `&` in a single predicate.
# Then show the first six matching rows.
penguins_clean %>%
  filter(species == "Adelie" & body_mass_g > 4000) %>%
  slice_head(n = 6)
# A tibble: 6 × 7
species island bill_length_mm bill_depth_mm flipper_length_mm body_mass_g
<fct> <fct> <dbl> <dbl> <int> <int>
1 Adelie Torgersen 39.2 19.6 195 4675
2 Adelie Torgersen 34.6 21.1 198 4400
3 Adelie Torgersen 42.5 20.7 197 4500
4 Adelie Torgersen 46 21.5 194 4200
5 Adelie Dream 39.2 21.1 196 4150
6 Adelie Dream 39.8 19.1 184 4650
# ℹ 1 more variable: sex <fct>
mutate — add or transform columns
# Derive two new columns and show the first six rows:
#   body_mass_kg - body mass in grams divided by 1000
#   bill_ratio   - bill length divided by bill depth
penguins_clean %>%
  mutate(
    body_mass_kg = body_mass_g / 1000,
    bill_ratio = bill_length_mm / bill_depth_mm
  ) %>%
  slice_head(n = 6)
# A tibble: 6 × 9
species island bill_length_mm bill_depth_mm flipper_length_mm body_mass_g
<fct> <fct> <dbl> <dbl> <int> <int>
1 Adelie Torgersen 39.1 18.7 181 3750
2 Adelie Torgersen 39.5 17.4 186 3800
3 Adelie Torgersen 40.3 18 195 3250
4 Adelie Torgersen 36.7 19.3 193 3450
5 Adelie Torgersen 39.3 20.6 190 3650
6 Adelie Torgersen 38.9 17.8 181 3625
# ℹ 3 more variables: sex <fct>, body_mass_kg <dbl>, bill_ratio <dbl>
group_by + summarise — aggregate by groups
# One summary row per species: the number of penguins, their mean body mass,
# and their median flipper length. Rows are sorted so the species with the
# highest mean body mass comes first.
penguins_clean %>%
  group_by(species) %>%
  summarise(
    n = n(),
    mean_body_mass = mean(body_mass_g, na.rm = TRUE),
    median_flipper = median(flipper_length_mm, na.rm = TRUE)
  ) %>%
  arrange(desc(mean_body_mass))
# A tibble: 3 × 4
species n mean_body_mass median_flipper
<fct> <int> <dbl> <dbl>
1 Gentoo 119 5092. 216
2 Chinstrap 68 3733. 196
3 Adelie 146 3706. 190