Du Bois Project

A lot of this is an absolute mess, and there are not many comments, but I figured I should post it anyway.

1 Database load and setup

Used a local PostgreSQL database because the full data would not fit in memory. This code mostly just imports the data into the database and sets up references to the tables.

# Load Database
# Connection settings are read from the standard libpq environment variables
# (PGUSER, PGPASSWORD, PGHOST, PGPORT, PGDATABASE) when they are set, so that
# credentials do not have to live in the script. The original local-development
# values are kept as fallbacks, so an unconfigured machine behaves as before.
conn <-
    dbConnect(
        drv = Postgres(),
        user = Sys.getenv("PGUSER", unset = "lizsql"),
        password = Sys.getenv("PGPASSWORD", unset = "lizsql"),
        host = Sys.getenv("PGHOST", unset = "localhost"),
        port = Sys.getenv("PGPORT", unset = "5432"),
        dbname = Sys.getenv("PGDATABASE", unset = "postgres")
    )
# References to the two tables in the `public` schema. `acs_2019_5yr_v2` is the
# table name written by the chunked import further down, so that import must
# have been run at least once before these references can be queried.
debois_1900 <- tbl(conn, in_schema('public', 'debois_1900'))
acs_2019_5yr <- tbl(conn, in_schema('public', 'acs_2019_5yr_v2'))
# Import an IPUMS extract into Postgres. To load a different dataset, change
# the two file names below and the target table name inside the callback.
# Postgres is used because the full 1900 census would not fit in memory.
cps_ddi_file <- "usa_00009.xml"
cps_data_file <- "usa_00009.dat"

# The DDI (codebook) is needed both to parse the fixed-width data file and,
# later in the script, to re-attach labels via ipums_collect().
ddi <- read_ipums_ddi(cps_ddi_file)

# Stream the data file in chunks of 30,000 rows. The callback is used only for
# its side effect: every chunk after the first is appended to the table, and
# the first chunk (pos == 1) creates it.
read_ipums_micro_chunked(
    ddi,
    callback = readr::SideEffectChunkCallback$new(function(x, pos) {
        if (pos != 1) {
            dbWriteTable(
                conn,
                "acs_2019_5yr_v2",
                x,
                row.names = FALSE,
                append = TRUE
            )
        } else {
            dbWriteTable(conn, "acs_2019_5yr_v2", x)
        }
    }),
    chunk_size = 30000,
    data_file = cps_data_file,
    verbose = FALSE
)
# (A stray bare symbol `s` used to sit here; evaluating it stops the script
# with "object 's' not found", so it has been removed.)

# Rows with RACE == 2 living in Georgia (STATEFIP 13), counted by birthplace.
# The counting runs in the database; ipums_collect() pulls the small result
# into R and attaches the IPUMS labels from `ddi`.
# NOTE(review): `ddi` is whichever codebook was last read with
# read_ipums_ddi(); confirm it carries the labels for the 1900 extract.
f <- debois_1900 |>
    filter(STATEFIP == 13, RACE == 2) |>
    count(BPL) |>
    ipums_collect(ddi)

# Rows with RACE == 2 born in Georgia (BPL 13), counted by state of residence.
f2 <- debois_1900 |>
    filter(BPL == 13, RACE == 2) |>
    count(STATEFIP) |>
    ipums_collect(ddi)

2 Migration Chart

This is very messy, but it basically involves filtering out various parts of the dataset and then creating the different figures.

# Convert the labelled IPUMS code column into two plain columns: `region`
# (the value label as a factor) and `FIPS` (the bare code), keeping the count.

# Georgia residents by birthplace.
migration_to_georgia <- f |>
    mutate(
        region = labelled::to_factor(BPL),
        FIPS = labelled::remove_labels(BPL)
    ) |>
    select(region, FIPS, n)
saveRDS(migration_to_georgia, file = "migration_to_georgia")

# Georgia-born people by state of residence.
migration_from_georgia <- f2 |>
    mutate(
        region = labelled::to_factor(STATEFIP),
        FIPS = labelled::remove_labels(STATEFIP)
    ) |>
    select(region, FIPS, n)
#saveRDS(migration_from_georgia, file = 'migration_from_georgia')
# ---- Map: people born in Georgia, by state of residence ----

# Keep codes below 60 (presumably the states; confirm against the codebook),
# drop DC and Georgia itself, and lower-case the names so they match the
# `region` column of map_data('state').
fixed_numbers <- migration_from_georgia %>%
    filter(FIPS < 60) %>%
    mutate(region = tolower(region)) %>%
    filter(region != 'district of columbia', FIPS != 13) %>%
    mutate(n = as.numeric(n)) %>%
    select(region, n)

# Polygon outlines with the count attached to every vertex of each state.
states_map <- map_data('state')
total_map <- left_join(states_map, fixed_numbers, by = 'region')

# One label position per state, taken from the built-in state centres.
# Hawaii and Alaska are excluded. The join key is given explicitly so dplyr
# does not have to guess it (and print a message).
state_name <-
    data.frame(
        abb = state.abb,
        region = tolower(state.name),
        x = state.center$x,
        y = state.center$y
    ) |>
    filter(abb != 'HI', abb != 'AK') |>
    left_join(fixed_numbers, by = 'region')

from_ga <- ggplot(total_map, aes(long, lat, group = group)) +
    geom_polygon(aes(fill = n), color = "white") +
    # Log-transformed fill with the palette direction reversed.
    scale_fill_viridis_c(
        option = "C",
        trans = 'log',
        direction = -1
    ) +
    # State abbreviation with the count underneath; Georgia gets no number,
    # and states with no matching row show 0.
    geom_shadowtext(
        data = state_name,
        aes(
            x = x,
            y = y,
            label = ifelse(
                abb == 'GA',
                "GA\n ",
                paste0(abb, '\n', ifelse(is.na(n), 0, n))
            ),
            group = 1
        ),
        size = 2
    ) +
    # Strip every axis, grid and background element, and hide the legend.
    theme(
        axis.line = element_blank(),
        axis.text.x = element_blank(),
        axis.text.y = element_blank(),
        axis.ticks = element_blank(),
        axis.title.x = element_blank(),
        axis.title.y = element_blank(),
        legend.position = 'none',
        panel.background = element_blank(),
        panel.border = element_blank(),
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        plot.background = element_blank()
    ) +
    labs(
        # The mapped aesthetic is `fill`, so that is the one to label.
        fill = 'Population',
        title = 'Population Migrated from Georgia',
        subtitle = 'Population born in Georgia by current state of residence (if living outside Georgia)'
    )
from_ga
ggsave(
    'from_ga.png',
    from_ga,
    width = 2560,
    height = 1440,
    units = 'px'
)


# ---- Map: Georgia residents, by state of birth ----

# Keep codes below 60 (presumably the states; confirm against the codebook),
# drop DC and Georgia itself, and lower-case the names so they match the
# `region` column of map_data('state').
fixed_numbers <- migration_to_georgia %>%
    filter(FIPS < 60) %>%
    mutate(region = tolower(region)) %>%
    filter(region != 'district of columbia', FIPS != 13) %>%
    mutate(n = as.numeric(n)) %>%
    select(region, n)

# Polygon outlines with the count attached to every vertex of each state.
states_map <- map_data('state')
total_map <- left_join(states_map, fixed_numbers, by = 'region')

# One label position per state, taken from the built-in state centres.
# Hawaii and Alaska are excluded. The join key is given explicitly so dplyr
# does not have to guess it (and print a message).
# (Disabled experiment, kept for reference: fill in Georgia's own count with
#  mutate(n = case_when(abb == 'GA' ~ 958984, TRUE ~ n)).)
state_name <-
    data.frame(
        abb = state.abb,
        region = tolower(state.name),
        x = state.center$x,
        y = state.center$y
    ) |>
    filter(abb != 'HI', abb != 'AK') |>
    left_join(fixed_numbers, by = 'region')

to_ga <- ggplot(total_map, aes(long, lat, group = group)) +
    geom_polygon(aes(fill = n), color = "white") +
    # Log-transformed fill with the palette direction reversed.
    scale_fill_viridis_c(
        option = "C",
        trans = 'log',
        direction = -1
    ) +
    # State abbreviation with the count underneath; Georgia gets no number,
    # and states with no matching row show 0.
    geom_shadowtext(
        data = state_name,
        aes(
            x = x,
            y = y,
            label = ifelse(
                abb == 'GA',
                "GA\n ",
                paste0(abb, '\n', ifelse(is.na(n), 0, n))
            ),
            group = 1
        ),
        size = 2
    ) +
    labs(
        # The mapped aesthetic is `fill`, so that is the one to label.
        fill = 'Population',
        title = 'Population Migrated to Georgia',
        subtitle = 'Number of current Georgia residents by place of birth (if born outside Georgia)'
    ) +
    # Strip every axis, grid and background element, and hide the legend.
    theme(
        axis.line = element_blank(),
        axis.text.x = element_blank(),
        axis.text.y = element_blank(),
        axis.ticks = element_blank(),
        axis.title.x = element_blank(),
        axis.title.y = element_blank(),
        legend.position = 'none',
        panel.background = element_blank(),
        panel.border = element_blank(),
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        plot.background = element_blank()
    )
to_ga
ggsave(
    'to_ga.png',
    to_ga,
    width = 2560,
    height = 1440,
    units = 'px'
)

# Combine the two maps with patchwork's `/` operator (to_ga first, from_ga
# second) and attach the shared title, subtitle, caption and outer margin.
patchwork <- (to_ga / from_ga) +
    plot_annotation(
        title = 'Migration of African Americans to and from Georgia, 1900',
        subtitle = 'Residents of Georgia born in another state, and residents of other states born in Georgia\n ',
        caption = 'Created by Elizabeth Goodwin using the full 1900 Census, IPUMS USA',
        theme = theme(
            legend.position = 'none',
            plot.title = element_text(size = 20),
            plot.margin = margin(t = 25, b = 25, l = 25, r = 25)
        )
    )
patchwork

# Output names follow the "lot-11931-no-NN" pattern; only the PDF is written
# here, at 8.5 x 11 (ggsave's default unit) and 300 dpi.
filename <- 'lot-11931-no-08-GOODWIN.pdf'
filenamepng <- 'lot-11931-no-08-GOODWIN.png'
ggsave(
    filename,
    plot = patchwork,
    width = 8.5,
    height = 11,
    dpi = 300
)

3 Marital Status Figure

# Share of each marital status within every sex x age-bucket cell.
#
# The figure built from this table is titled "Marital Status of African
# Americans", and elsewhere in this script RACE == 1 is labelled 'White' and
# RACE == 2 'Black'. The filter therefore has to be RACE == 2; it previously
# selected RACE == 1.
#
# NOTE(review): the counts here are unweighted, whereas the employment section
# counts with wt = PERWT; confirm whether weights were intended here too.
# NOTE(review): ages 89 and above fall through to 'other' and are dropped;
# confirm that excluding them from "Over 75" is deliberate.
density <- acs_2019_5yr |>
    filter(RACE == 2) |>
    mutate(
        # Bucket labels follow the original chart's style; each condition is
        # only reached when the ones above it were false.
        age_bucket = case_when(
            AGE < 16 ~ "0-15",
            AGE < 21 ~ "15-20",
            AGE < 26 ~ "20-25",
            AGE < 31 ~ "25-30",
            AGE < 36 ~ "30-35",
            AGE < 46 ~ "35-45",
            AGE < 56 ~ "45-55",
            AGE < 66 ~ "55-65",
            AGE < 76 ~ "65-75",
            AGE < 89 ~ "Over 75",
            TRUE ~ 'other'
        ),
        # MARST codes 1-2 -> married, 3-4 -> separated/divorced,
        # 5 -> widowed, 6 -> single; anything else becomes NA.
        Marital_Status = case_when(
            MARST < 3 ~ 'Married',
            MARST < 5 ~ 'Separated/\nDivorced',
            MARST == 5 ~ "Widowed",
            MARST == 6 ~ "Single"
        )
    ) |>
    # Aggregate in the database, then bring the small result into R.
    count(SEX, age_bucket, Marital_Status) |>
    ipums_collect(ddi) |>
    filter(age_bucket != 'other') |>
    # `total` is the number of people in the whole sex x age-bucket cell.
    add_count(SEX, age_bucket, wt = n, name = 'total') |>
    mutate(
        pct = 100 * n / total,
        SEX = case_when(SEX == 1 ~ 'Male', TRUE ~ "Female"),
        # The factor level order controls the stacking order of the bars.
        Marital_Status = factor(
            Marital_Status,
            levels = rev(c('Single', 'Married', "Separated/\nDivorced", 'Widowed'))
        )
    )
# Fill colours, in factor-level order (Widowed, Separated/Divorced, Married,
# Single). Shared with the female panel.
to_swap <- c("#555555", "#EFA86E", "#db735c", "#9A8A76")

# Left-hand (male) panel: stacked horizontal bars per age bucket, with the
# percentage axis reversed so the bars grow leftwards from the shared centre.
male <- density |>
    filter(SEX == 'Male') |>
    ggplot(aes(x = age_bucket, y = as.numeric(pct), fill = Marital_Status)) +
    geom_bar(
        stat = "identity",
        width = 1,
        color = 'black',
        size = .3
    ) +
    # Print the rounded percentage inside each segment, but only when it
    # rounds to more than 3 so that thin slivers stay unlabelled.
    geom_shadowtext(
        aes(label = ifelse(round(pct) > 3, paste0(round(pct), "%"), "")),
        size = 2.8,
        position = position_stack(vjust = .5)
    ) +
    coord_flip() +
    scale_y_reverse(
        limits = c(101, 0),
        labels = scales::percent_format(scale = 1),
        expand = expansion(mult = c(.05, .025))
    ) +
    scale_fill_manual(values = to_swap) +
    labs(
        subtitle = 'Male',
        x = "Age (Years)",
        y = element_blank(),
        fill = "Status"
    ) +
    theme(legend.position = 'bottom')
    

# Right-hand (female) panel: the mirror image of the male panel. The age axis
# (title, text, ticks, line) is hidden and the left margin set to zero, since
# it sits directly beside the male panel in the combined figure.
#
# coord_flip() used to be added twice; the second call only replaced the first
# (and made ggplot2 emit a "Coordinate system already present" message), so a
# single call is kept.
female <-
    ggplot(
        (density |> filter(SEX == 'Female')),
        aes(
            x = age_bucket,
            y = as.numeric(pct),
            fill = Marital_Status
        )
    ) +
    geom_bar(stat = "identity", width = 1, color = 'black', size = .3) +
    # Print the rounded percentage inside each segment, but only when it
    # rounds to more than 3 so that thin slivers stay unlabelled.
    geom_shadowtext(
        aes(label = ifelse(round(pct) > 3, paste0(round(pct), "%"), "")),
        size = 2.8,
        position = position_stack(vjust = .5)
    ) +
    coord_flip() +
    scale_y_continuous(
        labels = scales::percent_format(scale = 1),
        expand = expansion(mult = c(.025, 0.05))
    ) +
    scale_fill_manual(values = to_swap) +
    labs(
        subtitle = 'Female',
        fill = "Status",
        y = element_blank()
    ) +
    theme(
        legend.position = 'none',
        axis.title.y = element_blank(),
        axis.text.y = element_blank(),
        axis.ticks.y = element_blank(),
        axis.line.y = element_blank(),
        plot.margin = margin(l = 0)
    )


# Put the two panels side by side. `&` applies the theme to both panels, which
# overrides their individual legend positions so that a single legend can be
# collected on the right.
combined <- (male + female) &
    theme(legend.position = "right", legend.text = element_text(size = 8))

# The caption must be a plain string. It used to be wrapped in element_text(),
# whose first positional argument is the font family, so the text was never
# passed as a caption. The intended size of 8 is now set through the
# annotation theme, together with the outer margin.
combined <- combined +
    plot_layout(guides = "collect") +
    plot_annotation(
        title = 'Marital Status of African Americans',
        subtitle = 'By Age and Sex, 2015-2019',
        caption = 'Made by Elizabeth Goodwin\n Source: 2015-2019 ACS, IPUMS USA',
        theme = theme(
            plot.caption = element_text(size = 8),
            plot.margin = margin(r = 15, l = 15, t = 20, b = 20, unit = 'pt')
        )
    )

combined
ggsave('lot-11931-no-53-GOODWIN.pdf', combined, dpi = 400, height = 5.83, width = 9)
#ggsave('lot-11931-no-53-GOODWIN.png',combined, dpi = 400, height = 5.83, width = 9)

4 Employment Figure

# Person-weighted (PERWT) head-count for every occupation code x race
# combination, with each occupation's total and each race's percentage share
# of it. All steps up to ipums_collect() are applied to the database table.
acs_occ_pct <- acs_2019_5yr |>
    count(OCC2010, RACE, wt = PERWT) |>
    add_count(OCC2010, wt = n, name = 'total') |>
    mutate(pct = 100 * n / total) |>
    ipums_collect(ddi)
# Percent Black within each broad occupation category.
#
# OCC2010 codes are mapped to categories by inclusive code range, the weighted
# counts are re-aggregated per category x race, and the table is widened to
# one row per category with pct_/n_ columns for Black and White.
acs_occ <- acs_occ_pct |>
    mutate(
        OCC2010 = as.numeric(OCC2010),
        # between() is inclusive at both ends, i.e. x >= lo & x <= hi.
        occ = case_when(
            between(OCC2010, 10, 430) ~ 'Management, Business, Science, and Arts',
            between(OCC2010, 500, 730) ~ 'Business Operations Specialists',
            between(OCC2010, 800, 950) ~ 'Financial Specialists',
            between(OCC2010, 1000, 1240) ~ 'Computer and Mathematical',
            between(OCC2010, 1300, 1540) ~ 'Architecture and Engineering',
            between(OCC2010, 1550, 1560) ~ 'Technicians',
            between(OCC2010, 1600, 1980) ~ 'Life, Physical, and Social Science',
            between(OCC2010, 2000, 2060) ~ 'Community and Social Services',
            between(OCC2010, 2100, 2150) ~ 'Legal',
            between(OCC2010, 2200, 2550) ~ 'Education, Training, and Library',
            between(OCC2010, 2600, 2920) ~ 'Arts, Design, Entertainment, Sports, and Media',
            between(OCC2010, 3000, 3540) ~ 'Healthcare Practitioners and Technicians',
            between(OCC2010, 3600, 3650) ~ 'Healthcare Support',
            between(OCC2010, 3700, 3950) ~ 'Protective Service',
            between(OCC2010, 4000, 4150) ~ 'Food Preparation and Serving',
            between(OCC2010, 4200, 4250) ~ 'Building and Grounds Cleaning and Maintenance',
            between(OCC2010, 4300, 4650) ~ 'Personal Care and Service',
            between(OCC2010, 4700, 4965) ~ 'Sales and Related',
            between(OCC2010, 5000, 5940) ~ 'Office and Administrative Support',
            between(OCC2010, 6005, 6130) ~ 'Farming, Fishing, and Forestry',
            between(OCC2010, 6200, 6765) ~ 'Construction',
            between(OCC2010, 6800, 6940) ~ 'Extraction',
            between(OCC2010, 7000, 7630) ~ 'Installation, Maintenance, and Repair',
            between(OCC2010, 7700, 8965) ~ 'Production',
            between(OCC2010, 9000, 9750) ~ 'Transportation and Material Moving',
            between(OCC2010, 9800, 9830) ~ 'Military Specific',
            OCC2010 == 9920 ~ 'Unemployed for 5+ years or Never Worked',
            TRUE ~ 'Other'
        ),
        race = case_when(RACE == 1 ~ 'White', RACE == 2 ~ 'Black', TRUE ~ 'Other')
    ) |>
    ungroup() |>
    # Re-aggregate at category level. The percentage is taken over all races
    # (including 'Other'), and only afterwards are the 'Other' rows dropped.
    count(occ, race, wt = n) |>
    add_count(occ, wt = n, name = 'total') |>
    mutate(pct = 100 * n / total) |>
    filter(race != 'Other') |>
    select(occ, race, pct, n) |>
    pivot_wider(names_from = race, values_from = c(pct, n)) |>
    mutate(
        # NOTE(review): this mean is weighted by n_Black, not by the category
        # totals, so it is not the plain overall share that the figure's
        # subtitle describes; confirm which reference line is intended.
        avg_pct_black = weighted.mean(pct_Black, n_Black),
        diff_from_mean = pct_Black - avg_pct_black,
        # Order the categories by their distance from the reference line.
        occ = fct_reorder(occ, diff_from_mean),
        # Per-category Black + White total. This used to be
        # sum(n_Black, n_White), which collapses every row into one grand
        # total and made `ci` meaningless as a per-category interval.
        tot = n_Black + n_White,
        # 1.96 standard errors of the Black proportion, in percentage points.
        ci = 196 * sqrt(((n_Black / tot) * (1 - (n_Black / tot))) / tot)
    )
# Dot plot: one point per occupation category at its percent Black, coloured
# by distance from the reference value, which is drawn as a dashed line.
acs_occ_fig <- ggplot(
    acs_occ,
    aes(x = pct_Black, y = occ, color = diff_from_mean)
) +
    geom_point(size = 3) +
    geom_vline(aes(xintercept = avg_pct_black), linetype = 2) +
    scale_color_viridis() +
    labs(
        title = 'Percent African American by Employment Sector',
        subtitle = 'Grouped by overall Employment Category. Dotted line is overall percent of population',
        x = "Percent African American",
        y = "Employment Sector",
        color = 'Difference from overall percent'
    ) +
    theme(
        legend.position = 'none',
        # Align title and caption with the whole plot, not just the panel.
        plot.title.position = "plot",
        plot.caption.position = "plot",
        plot.margin = margin(r = 25, l = 25, t = 25, b = 10)
    )
acs_occ_fig
# Percent Black per individual occupation (the OCC2010 value label), reduced
# to the 8 occupations furthest above and the 8 furthest below the reference
# value.
acs_occ_2 <- acs_occ_pct |>
    mutate(
        occ = labelled::to_character(OCC2010),
        race = case_when(RACE == 1 ~ 'White', RACE == 2 ~ 'Black', TRUE ~ 'Other')
    ) |>
    ungroup() |>
    # Shorten three very long occupation labels so they fit on the axis.
    mutate(
        occ = case_when(
            occ == 'Postal Service Mail Sorters, Processors, and Processing Machine Operators' ~ "Postal Service Mail Sorters and Processors",
            occ == "Farmers, Ranchers, and Other Agricultural Managers" ~ "Farmers and Ranchers",
            occ == "Security Guards and Gaming Surveillance Officers" ~ "Security Guards",
            TRUE ~ occ
        )
    ) |>
    # Re-aggregate per occupation label. The percentage is taken over all
    # races (including 'Other'), and only afterwards are 'Other' rows dropped.
    count(occ, race, wt = n) |>
    add_count(occ, wt = n, name = 'total') |>
    mutate(pct = 100 * n / total) |>
    filter(race != 'Other') |>
    select(occ, race, pct, n) |>
    pivot_wider(names_from = race, values_from = c(pct, n)) |>
    mutate(
        # NOTE(review): this mean is weighted by n_Black, not by the
        # occupation totals; confirm which reference value is intended.
        avg_pct_black = weighted.mean(pct_Black, n_Black),
        diff_from_mean = pct_Black - avg_pct_black,
        occ = fct_reorder(occ, diff_from_mean),
        # Per-occupation Black + White total. This used to be
        # sum(n_Black, n_White), which collapses every row into one grand
        # total and made `ci` meaningless as a per-occupation interval.
        tot = n_Black + n_White,
        # 1.96 standard errors of the Black proportion, in percentage points.
        ci = 196 * sqrt(((n_Black / tot) * (1 - (n_Black / tot))) / tot),
        # Below the reference value -> 'Low', otherwise 'High'.
        type = case_when(diff_from_mean < 0 ~ 'Low', TRUE ~ 'High')
    ) |>
    group_by(type) |>
    # Largest absolute deviations within each group; this is the direct form
    # of the earlier slice_min(desc(abs(...))).
    slice_max(abs(diff_from_mean), n = 8) |>
    ungroup()

# Dot plot of the selected occupations, split into 'High' and 'Low' panels,
# with the reference value drawn as a dashed line.
acs_occ_fig_2 <- ggplot(
    acs_occ_2,
    aes(x = pct_Black, y = occ, color = diff_from_mean)
) +
    geom_point(size = 3) +
    geom_vline(aes(xintercept = avg_pct_black), linetype = 2) +
    scale_color_viridis() +
    labs(
        title = 'Top 8 Highest and Lowest Jobs by African American representation',
        subtitle = 'Grouped by specific employment classification, not overall sector',
        x = "Percent African American",
        y = "Employment Role",
        color = 'Difference from overall percent'
    ) +
    theme(
        legend.position = 'none',
        # Align title (and subtitle) and caption with the whole plot.
        plot.title.position = "plot",
        plot.caption.position = "plot",
        plot.margin = margin(r = 25, l = 25, t = 25, b = 25)
    ) +
    facet_free(type ~ .)
acs_occ_fig_2
# Combine the sector plot and the per-occupation plot with patchwork's `/`
# operator, giving the first 1.5 times the height of the second, and add the
# shared caption.
combined_occ <- acs_occ_fig / acs_occ_fig_2 +
    plot_layout(heights = c(1.5, 1)) +
    plot_annotation(
        caption = 'Made by Elizabeth Goodwin | 2010 OCCSCORE, 2015-2019 ACS, IPUMS USA',
        theme = theme(
            legend.position = 'none',
            plot.margin = margin(b = 10)
        )
    )
ggsave(
    'original-GOODWIN.pdf',
    plot = combined_occ,
    width = 8,
    height = 10,
    dpi = 400
)
#ggsave('original-GOODWIN.png', plot = combined_occ, dpi = 400, height = 10, width = 8)
