Combining Data

Join examples with palmerpenguins

library(tidyverse)
library(palmerpenguins)

# Base table for the join examples: discard rows with a missing species, then
# keep only the identifier and measurement columns. Missing measurements are
# NOT removed here, so NA values can still appear in the numeric columns.
penguins_clean <- penguins %>%
  filter(!is.na(species)) %>%
  select(
    species,
    island,
    year,
    bill_length_mm,
    bill_depth_mm,
    flipper_length_mm,
    body_mass_g
  )

Example A — inner_join with a small metadata table

# Species-level lookup table, written row by row. "Chinstrap" is deliberately
# left out so that a filtering join has rows to drop.
species_meta <- tribble(
  ~species, ~conservation_note,
  "Adelie", "stable",
  "Gentoo", "monitor"
)

# An inner join returns only the rows whose species appears in both tables, so
# every Chinstrap row is dropped and Adelie + Gentoo remain.
# species is a factor in penguins_clean but character in species_meta; the
# joined key column comes back as character.
penguins_inner <- inner_join(penguins_clean, species_meta, by = "species")

# Tally the surviving rows per species; Chinstrap should be absent.
count(penguins_inner, species)
# A tibble: 2 × 2
  species     n
  <chr>   <int>
1 Adelie    152
2 Gentoo    124

Example B — left_join to attach island-level summaries

# One row per island holding the mean body mass of its penguins. Missing
# masses are ignored in the average, and the result is returned ungrouped.
island_summary <- penguins_clean %>%
  group_by(island) %>%
  summarise(
    island_mean_mass = mean(body_mass_g, na.rm = TRUE),
    .groups = "drop"
  )

# A left join keeps every row of penguins_clean and appends the matching
# island_mean_mass looked up by island.
penguins_with_island <- left_join(penguins_clean, island_summary, by = "island")

# Preview the first six rows: the original columns are still present and
# island_mean_mass has been added as the last column.
head(penguins_with_island, 6)
# A tibble: 6 × 8
  species island     year bill_length_mm bill_depth_mm flipper_length_mm
  <fct>   <fct>     <int>          <dbl>         <dbl>             <int>
1 Adelie  Torgersen  2007           39.1          18.7               181
2 Adelie  Torgersen  2007           39.5          17.4               186
3 Adelie  Torgersen  2007           40.3          18                 195
4 Adelie  Torgersen  2007           NA            NA                  NA
5 Adelie  Torgersen  2007           36.7          19.3               193
6 Adelie  Torgersen  2007           39.3          20.6               190
# ℹ 2 more variables: body_mass_g <int>, island_mean_mass <dbl>