Lecture 32
College of Idaho
CSCI 2025 - Winter 2026




# Build two derived columns from x, y and z in a single pass:
#   new_x is the weighted sum x + y + 2z, rescaled by 1/6;
#   new_y is the difference x - y, rescaled by 1/2.
# The divisions are the "re-normalizing" step.
new_data <- data |>
  mutate(
    new_x = (x + y + 2 * z) / 6,
    new_y = (x - y) / 2
  )
new_data |> head() |> kable()

| x | y | z | new_x | new_y |
|---|---|---|---|---|
| -0.5604756 | -0.9957987 | -1.5562744 | -0.7781372 | 0.2176615 |
| -0.2301775 | -1.0399550 | -1.2701325 | -0.6350663 | 0.4048888 |
| 1.5587083 | -0.0179802 | 1.5407281 | 0.7703640 | 0.7883443 |
| 0.0705084 | -0.1321751 | -0.0616667 | -0.0308334 | 0.1013418 |
| 0.1292877 | -2.5493428 | -2.4200550 | -1.2100275 | 1.3393153 |
| 1.7150650 | 1.0405735 | 2.7556384 | 1.3778192 | 0.3372458 |

| name | value | proportion |
|---|---|---|
| var1 | 0.9834589 | 0.9391552 |
| var2 | 0.0637151 | 0.0608448 |
- `step_pca()`: Computes principal components.
- `step_dummy()` for categorical.
- `step_normalize()` (Variance depends on scale!).

library(tidymodels)
library(palmerpenguins)
library(tidyverse)
# Define Recipe
# No outcome is declared (`~ .`), so every remaining column is a predictor.
pca_rec <- recipe(~ ., data = penguins) |>
  step_rm(year, sex, island) |> # Remove non-measurements
  step_naomit(all_predictors()) |>
  # Put the variables on a common scale before the PCA.
  step_normalize(all_numeric_predictors()) |>
  # No num_comp is given, so step_pca() keeps its default number of
  # components (all four PCs here). Pass num_comp = 2 to keep only the top 2.
  step_pca(all_numeric_predictors())
# Prep and Bake
# prep() estimates the recipe; bake() with new_data = NULL then returns the
# processed version of the data the recipe was prepped on.
pca_prep <- pca_rec |> prep()
pca_data <- pca_prep |> bake(new_data = NULL)
head(pca_data)
# A tibble: 6 × 5
species PC1 PC2 PC3 PC4
<fct> <dbl> <dbl> <dbl> <dbl>
1 Adelie -1.84 -0.0476 0.232 0.523
2 Adelie -1.30 0.428 0.0295 0.402
3 Adelie -1.37 0.154 -0.198 -0.527
4 Adelie -1.88 0.00205 0.618 -0.478
5 Adelie -1.91 -0.828 0.686 -0.207
6 Adelie -1.76 0.351 -0.0276 0.504
# The first recipe used `~ .`, so `species` was carried along as an ordinary
# (non-numeric) predictor. A clearer workflow is to declare it as the outcome:
# it stays in the processed data for colouring plots, but it can never be
# picked up by the predictor selectors used in the PCA steps.
pca_rec_2 <- recipe(species ~ ., data = penguins) |>
  step_rm(year, sex, island) |>
  step_naomit(all_predictors()) |>
  step_normalize(all_numeric_predictors()) |>
  step_pca(all_numeric_predictors(), num_comp = 2)
# all_numeric_predictors() only selects predictors, so step_pca() leaves the
# outcome `species` untouched, which is handy!
pca_prep_2 <- prep(pca_rec_2)
# bake(new_data = NULL) returns the processed training data. It replaces the
# superseded shortcut juice(pca_prep_2) and matches the earlier bake() call.
pca_juice <- bake(pca_prep_2, new_data = NULL)
# Scatter plot of the first two principal components, coloured by species.
ggplot(pca_juice, aes(x = PC1, y = PC2, color = species)) +
  geom_point(size = 3, alpha = 0.8) +
  theme_minimal() +
  labs(
    title = "Penguins PCA",
    subtitle = "Species separate well in PC space"
  )

tidy().

# `number` is the position of the step inside the recipe:
# step_rm = 1, step_naomit = 2, step_normalize = 3, step_pca = 4.
# Position 2 would tidy step_naomit() rather than the PCA, so the loadings
# must be requested with number = 4.
pca_comps <- tidy(pca_prep_2, number = 4, type = "coef")
# Better way: ID the step
# Giving step_pca() an explicit id lets tidy() find it by name instead of by
# its position in the recipe.
pca_rec_named <- recipe(species ~ ., data = penguins) |>
  # Drop the non-measurement columns, as the other recipes do. Otherwise the
  # numeric `year` column would be normalized and fed into the PCA, and
  # missing values in `sex` would remove extra rows in step_naomit().
  step_rm(year, sex, island) |>
  step_naomit(all_predictors()) |>
  step_normalize(all_numeric_predictors()) |>
  step_pca(all_numeric_predictors(), id = "pca")
# Estimate the named recipe, then tidy the step tagged "pca" to get one row
# per variable/component pair.
pca_prep_named <- pca_rec_named |> prep()
pca_comps <- pca_prep_named |> tidy(id = "pca")
# Bar chart of each variable's value on PC1 and PC2, one facet per component.
pca_comps |>
  filter(component %in% c("PC1", "PC2")) |>
  ggplot(mapping = aes(x = value, y = terms, fill = terms)) +
  geom_col() +
  facet_wrap(vars(component)) +
  labs(title = "PCA Loadings: Contribution of variables") +
  theme_minimal() +
  # The fill colours repeat the y-axis labels, so the legend is redundant.
  theme(legend.position = "none")
# Extract variance explained
# Variance statistics for the step tagged "pca", in long format.
pca_var <- pca_prep_named |> tidy(id = "pca", type = "variance")

# Scree plot: keep only the "percent variance" rows and draw them as bars
# with a connecting line and points on top.
pca_var |>
  filter(terms == "percent variance") |>
  ggplot(aes(x = component, y = value)) +
  geom_col(fill = "steelblue") +
  geom_line(group = 1) +
  geom_point() +
  theme_minimal() +
  labs(title = "Scree Plot", y = "% Variance Explained")

step_normalize() (Crucial!)
step_pca()