Example 05: Tables and dplyr

Crosstabs

library(tidyverse)

── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   3.5.1     ✔ tibble    3.2.1
✔ lubridate 1.9.3     ✔ tidyr     1.3.1
✔ purrr     1.0.2     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(socviz)

gss_sm

# A tibble: 2,867 × 32
    year    id ballot       age childs sibs   degree race  sex   region income16
   <dbl> <dbl> <labelled> <dbl>  <dbl> <labe> <fct>  <fct> <fct> <fct>  <fct>   
 1  2016     1 1             47      3 2      Bache… White Male  New E… $170000…
 2  2016     2 2             61      0 3      High … White Male  New E… $50000 …
 3  2016     3 3             72      2 3      Bache… White Male  New E… $75000 …
 4  2016     4 1             43      4 3      High … White Fema… New E… $170000…
 5  2016     5 3             55      2 2      Gradu… White Fema… New E… $170000…
 6  2016     6 2             53      2 2      Junio… White Fema… New E… $60000 …
 7  2016     7 1             50      2 2      High … White Male  New E… $170000…
 8  2016     8 3             23      3 6      High … Other Fema… Middl… $30000 …
 9  2016     9 1             45      3 5      High … Black Male  Middl… $60000 …
10  2016    10 3             71      4 1      Junio… White Male  Middl… $60000 …
# ℹ 2,857 more rows
# ℹ 21 more variables: relig <fct>, marital <fct>, padeg <fct>, madeg <fct>,
#   partyid <fct>, polviews <fct>, happy <fct>, partners <fct>, grass <fct>,
#   zodiac <fct>, pres12 <labelled>, wtssall <dbl>, income_rc <fct>,
#   agegrp <fct>, ageq <fct>, siblings <fct>, kids <fct>, religion <fct>,
#   bigregion <fct>, partners_rc <fct>, obama <dbl>

Count up one variable:

gss_sm |> 
  group_by(bigregion) |> 
  tally()

# A tibble: 4 × 2
  bigregion     n
  <fct>     <int>
1 Northeast   488
2 Midwest     695
3 South      1052
4 West        632

Cross-tabulate:

gss_sm |> 
  group_by(bigregion, religion) |> 
  tally()

# A tibble: 24 × 3
# Groups:   bigregion [4]
   bigregion religion       n
   <fct>     <fct>      <int>
 1 Northeast Protestant   158
 2 Northeast Catholic     162
 3 Northeast Jewish        27
 4 Northeast None         112
 5 Northeast Other         28
 6 Northeast <NA>           1
 7 Midwest   Protestant   325
 8 Midwest   Catholic     172
 9 Midwest   Jewish         3
10 Midwest   None         157
# ℹ 14 more rows

Notice the difference:

gss_sm |> 
  group_by(religion, bigregion) |> 
  tally()

# A tibble: 24 × 3
# Groups:   religion [6]
   religion   bigregion     n
   <fct>      <fct>     <int>
 1 Protestant Northeast   158
 2 Protestant Midwest     325
 3 Protestant South       650
 4 Protestant West        238
 5 Catholic   Northeast   162
 6 Catholic   Midwest     172
 7 Catholic   South       160
 8 Catholic   West        155
 9 Jewish     Northeast    27
10 Jewish     Midwest       3
# ℹ 14 more rows

Seems similar, but now try:

# Religion within bigregion;
gss_sm |> 
  group_by(bigregion, religion) |> 
  tally() |> 
  summarize(group_total = sum(n)) |> 
  mutate(prop = group_total/sum(group_total))

# A tibble: 4 × 3
  bigregion group_total  prop
  <fct>           <int> <dbl>
1 Northeast         488 0.170
2 Midwest           695 0.242
3 South            1052 0.367
4 West              632 0.220

# Bigregion within religion
gss_sm |> 
  group_by(religion, bigregion) |> 
  tally() |> 
  summarize(group_total = sum(n)) |> 
  mutate(prop = group_total/sum(group_total))

# A tibble: 6 × 3
  religion   group_total    prop
  <fct>            <int>   <dbl>
1 Protestant        1371 0.478  
2 Catholic           649 0.226  
3 Jewish              51 0.0178 
4 None               619 0.216  
5 Other              159 0.0555 
6 <NA>                18 0.00628

To get our table to look like a conventional nxm crosstab, pivot it:

gss_sm |> 
  group_by(bigregion, religion) |> 
  tally() |> 
  pivot_wider(names_from = religion, values_from = n)

# A tibble: 4 × 7
# Groups:   bigregion [4]
  bigregion Protestant Catholic Jewish  None Other  `NA`
  <fct>          <int>    <int>  <int> <int> <int> <int>
1 Northeast        158      162     27   112    28     1
2 Midwest          325      172      3   157    33     5
3 South            650      160     11   170    50    11
4 West             238      155     10   180    48     1

# Bigregion within religion
gss_sm |> 
  group_by(religion, bigregion) |> 
  tally() |> 
  summarize(group_total = sum(n)) |> 
  mutate(prop = group_total/sum(group_total))

# A tibble: 6 × 3
  religion   group_total    prop
  <fct>            <int>   <dbl>
1 Protestant        1371 0.478  
2 Catholic           649 0.226  
3 Jewish              51 0.0178 
4 None               619 0.216  
5 Other              159 0.0555 
6 <NA>                18 0.00628

Grouped and counted by region within religion:

gss_sm |> 
  group_by(religion, bigregion) |> 
  tally() |> 
  pivot_wider(names_from = bigregion, values_from = n)

# A tibble: 6 × 5
# Groups:   religion [6]
  religion   Northeast Midwest South  West
  <fct>          <int>   <int> <int> <int>
1 Protestant       158     325   650   238
2 Catholic         162     172   160   155
3 Jewish            27       3    11    10
4 None             112     157   170   180
5 Other             28      33    50    48
6 <NA>               1       5    11     1

As many dimensions as we wish:

gss_sm |> 
  group_by(bigregion, race, sex) |> 
  tally()

# A tibble: 24 × 4
# Groups:   bigregion, race [12]
   bigregion race  sex        n
   <fct>     <fct> <fct>  <int>
 1 Northeast White Male     165
 2 Northeast White Female   217
 3 Northeast Black Male      24
 4 Northeast Black Female    36
 5 Northeast Other Male      15
 6 Northeast Other Female    31
 7 Midwest   White Male     274
 8 Midwest   White Female   285
 9 Midwest   Black Male      43
10 Midwest   Black Female    46
# ℹ 14 more rows

Equivalently, but the result is not grouped:

gss_sm |> 
  count(bigregion, race, sex)

# A tibble: 24 × 4
   bigregion race  sex        n
   <fct>     <fct> <fct>  <int>
 1 Northeast White Male     165
 2 Northeast White Female   217
 3 Northeast Black Male      24
 4 Northeast Black Female    36
 5 Northeast Other Male      15
 6 Northeast Other Female    31
 7 Midwest   White Male     274
 8 Midwest   White Female   285
 9 Midwest   Black Male      43
10 Midwest   Black Female    46
# ℹ 14 more rows

Add a frequency column:

gss_sm |> 
  group_by(bigregion, race, sex) |> 
  tally() |> 
  mutate(prop = n/sum(n))

# A tibble: 24 × 5
# Groups:   bigregion, race [12]
   bigregion race  sex        n  prop
   <fct>     <fct> <fct>  <int> <dbl>
 1 Northeast White Male     165 0.432
 2 Northeast White Female   217 0.568
 3 Northeast Black Male      24 0.4  
 4 Northeast Black Female    36 0.6  
 5 Northeast Other Male      15 0.326
 6 Northeast Other Female    31 0.674
 7 Midwest   White Male     274 0.490
 8 Midwest   White Female   285 0.510
 9 Midwest   Black Male      43 0.483
10 Midwest   Black Female    46 0.517
# ℹ 14 more rows

When the result is not grouped, what do you get as the proportions?

gss_sm |> 
  count(bigregion, race, sex) |> 
  mutate(prop = n/sum(n))

# A tibble: 24 × 5
   bigregion race  sex        n    prop
   <fct>     <fct> <fct>  <int>   <dbl>
 1 Northeast White Male     165 0.0576 
 2 Northeast White Female   217 0.0757 
 3 Northeast Black Male      24 0.00837
 4 Northeast Black Female    36 0.0126 
 5 Northeast Other Male      15 0.00523
 6 Northeast Other Female    31 0.0108 
 7 Midwest   White Male     274 0.0956 
 8 Midwest   White Female   285 0.0994 
 9 Midwest   Black Male      43 0.0150 
10 Midwest   Black Female    46 0.0160 
# ℹ 14 more rows

Check your work by summing the rows.