3. Visual encoding

Author

Camille Seaberry

Modified

February 15, 2024

This is a walkthrough of Wickham et al. (2023) chapter 9 on chart layers, using the ACS data in the justviz package. For simplicity, we’ll focus on Maryland census tracts. I’m throwing in a few additional variables just to match the examples from the book.

Wickham, H., Çetinkaya-Rundel, M., & Grolemund, G. (2023). R for data science: Import, tidy, transform, visualize, and model data (2nd edition). O’Reilly. https://r4ds.hadley.nz/
library(dplyr)
library(ggplot2)
library(justviz)
# Counties kept as their own category in `county2`; any county not listed here
# is relabeled "Other counties".
local_counties <- c(
  "Baltimore city",
  "Baltimore County",
  "Anne Arundel County",
  "Howard County"
)

# Tract-level rows only, with two derived columns:
#   county2    - the county name for local counties, "Other counties" otherwise
#   income_brk - median household income split at 100k; with right = FALSE the
#                bins are [0, 1e5) and [1e5, Inf], so exactly 100k lands in
#                "above_100k"
acs_tr <- acs |>
  filter(level == "tract") |>
  mutate(
    county2 = ifelse(county %in% local_counties, county, "Other counties")
  ) |>
  # drop every row containing an NA; missing data is covered in the next notebook
  na.omit() |>
  mutate(
    income_brk = cut(
      median_hh_income,
      breaks = c(0, 1e5, Inf),
      labels = c("under_100k", "above_100k"),
      right = FALSE,
      include.lowest = TRUE
    )
  )
# Baseline scatterplot: homeownership on x, total cost burden on y.
acs_tr |>
  ggplot(aes(homeownership, total_cost_burden)) +
  geom_point(size = 1)

Aesthetic mappings

# County group mapped to point color.
acs_tr |>
  ggplot(aes(homeownership, total_cost_burden)) +
  geom_point(mapping = aes(color = county2), size = 1)

# Same scatterplot, with county group mapped to point shape instead.
acs_tr |>
  ggplot(aes(homeownership, total_cost_burden)) +
  geom_point(mapping = aes(shape = county2), size = 1)

As noted in the book, these are bad ideas:

# County group (a categorical variable) mapped to point size; points are drawn
# at a fixed 50% opacity.
acs_tr |>
  ggplot(aes(homeownership, total_cost_burden)) +
  geom_point(mapping = aes(size = county2), alpha = 0.5)

# County group mapped to point transparency, with a fixed point size.
acs_tr |>
  ggplot(aes(homeownership, total_cost_burden)) +
  geom_point(mapping = aes(alpha = county2), size = 1)

Can you think of any exceptions to this?


What’s going on with the next two charts?

# "slateblue" supplied as a fixed argument to the geom, outside aes().
acs_tr |>
  ggplot(aes(homeownership, total_cost_burden)) +
  geom_point(color = "slateblue")

# The same string supplied inside aes().
acs_tr |>
  ggplot(aes(homeownership, total_cost_burden)) +
  geom_point(mapping = aes(color = "slateblue"))

Why does this one throw an error?

# Deliberately failing example used as an exercise (see the question above).
# Keep the call exactly as written; "fixing" it would remove the point.
ggplot(acs_tr, aes(x = homeownership, y = total_cost_burden)) +
  geom_point(color = county2)

Geometric objects

# The two variables drawn as individual points...
acs_tr |>
  ggplot(aes(homeownership, total_cost_burden)) +
  geom_point(size = 1)

# ...and the same mapping drawn as a smoothed line with geom_smooth() defaults.
acs_tr |>
  ggplot(aes(homeownership, total_cost_burden)) +
  geom_smooth()

# Color is mapped in ggplot(), so both layers inherit it: points and smoothers
# are each split by county group.
acs_tr |>
  ggplot(aes(homeownership, total_cost_burden, color = county2)) +
  geom_point(size = 1) +
  geom_smooth()

# One smoother fit to all tracts.
acs_tr |>
  ggplot(aes(homeownership, total_cost_burden)) +
  geom_smooth()

# One smoother per county group, set through the group aesthetic only.
acs_tr |>
  ggplot(aes(homeownership, total_cost_burden)) +
  geom_smooth(mapping = aes(group = county2))

# One smoother per county group, colored by group, with the legend suppressed.
acs_tr |>
  ggplot(aes(homeownership, total_cost_burden)) +
  geom_smooth(mapping = aes(color = county2), show.legend = FALSE)

# Color is mapped only inside the point layer, so geom_smooth() does not
# inherit it and draws a single smoother over all tracts.
acs_tr |>
  ggplot(aes(homeownership, total_cost_burden)) +
  geom_point(mapping = aes(color = county2), size = 1) +
  geom_smooth()

I don’t like how they did this highlighting example in the book. Here’s a better one.

# Highlight one county: color is mapped to a TRUE/FALSE test on `county`, and
# the manual scale assigns a color to each of the two outcomes by name.
acs_tr |>
  ggplot(aes(homeownership, total_cost_burden)) +
  geom_point(mapping = aes(color = county == "Baltimore city")) +
  scale_color_manual(
    values = c("TRUE" = "firebrick", "FALSE" = "gray60")
  )

# Ridgeline densities of homeownership, one ridge per county group; county group
# also sets outline and fill color, so the legend is turned off.
acs_tr |>
  ggplot(
    aes(x = homeownership, y = county2, color = county2, fill = county2)
  ) +
  ggridges::geom_density_ridges(show.legend = FALSE, alpha = 0.5)

Facets

# Small multiples: one panel per combination of income bracket (rows) and
# county group (columns).
ggplot(acs_tr, aes(x = homeownership, y = total_cost_burden)) +
  geom_point(size = 1) +
  facet_grid(rows = vars(income_brk), cols = vars(county2))

Statistical transformations

I am of the opinion that if you want to visualize summary statistics or other aggregations, you should calculate them explicitly, not let ggplot do them ad hoc, so I think the examples in section 9.5 are not great. Comparable charts with calculations:

# Count tracts per county group explicitly, then draw the precomputed counts.
acs_tr |>
  group_by(county2) |>
  # group_by() + summarise(n = n()) is what `count(county2)` does in one step
  summarise(n = n()) |>
  ggplot(mapping = aes(county2, n)) +
  geom_col()

# Share of all tracts that falls in each county group.
acs_tr |>
  group_by(county2) |>
  # .groups = "drop" makes the ungrouped result explicit: `sum(n)` below has to
  # run over every county group at once for `prop` to be a share of the overall
  # total (on data still grouped by county2, every prop would be 1)
  summarise(n = n(), .groups = "drop") |>
  mutate(prop = n / sum(n)) |>
  ggplot(aes(x = county2, y = prop)) +
  geom_col()

# Minimum, maximum, and median cost burden per county group, computed up front
# and drawn as a point (median) with a range line (min to max).
acs_tr |>
  group_by(county2) |>
  summarise(
    total_cost_burden_min = min(total_cost_burden),
    total_cost_burden_max = max(total_cost_burden),
    total_cost_burden_median = median(total_cost_burden)
  ) |>
  ggplot(
    aes(
      x = county2,
      y = total_cost_burden_median,
      ymin = total_cost_burden_min,
      ymax = total_cost_burden_max
    )
  ) +
  geom_pointrange()

Position adjustments

# Number of tracts in every county group x income bracket combination.
# .groups = "drop" returns an ungrouped result; without it, summarise() leaves
# the output grouped by county2 and prints a message about the regrouping.
inc_by_county <- acs_tr |>
  group_by(county2, income_brk) |>
  summarise(n = n(), .groups = "drop")

# Income bracket mapped to the color aesthetic of the columns.
inc_by_county |>
  ggplot(aes(county2, n, color = income_brk)) +
  geom_col()

# Income bracket mapped to the fill aesthetic of the columns instead.
inc_by_county |>
  ggplot(aes(county2, n, fill = income_brk)) +
  geom_col()

# position_identity(): columns are drawn exactly where the data puts them, so
# the brackets overlap; the low alpha keeps both visible.
inc_by_county |>
  ggplot(aes(county2, n, fill = income_brk)) +
  geom_col(position = position_identity(), alpha = 0.2)

# position_fill(): stacked columns rescaled to a common height.
inc_by_county |>
  ggplot(aes(county2, n, fill = income_brk)) +
  geom_col(position = position_fill())

# position_dodge(): one column per bracket, placed side by side.
inc_by_county |>
  ggplot(aes(county2, n, fill = income_brk)) +
  geom_col(position = position_dodge())

# position_dodge2(): side-by-side columns using the dodge2 variant.
inc_by_county |>
  ggplot(aes(county2, n, fill = income_brk)) +
  geom_col(position = position_dodge2())

Other than the `position_identity()` chart with the weird opacity, which kinda sucks, these give you different views of the same data. What can you pick up from each?

# Scatterplot with jittered points; the fixed seed makes the jitter reproducible.
acs_tr |>
  ggplot(aes(homeownership, total_cost_burden)) +
  geom_point(position = position_jitter(seed = 1), size = 1)

Back to top