-
Notifications
You must be signed in to change notification settings - Fork 1
/
anthopolos.R
269 lines (254 loc) · 12 KB
/
anthopolos.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
#' Racial Isolation Index based on Anthopolos et al. (2011)
#'
#' Compute the spatial Racial Isolation Index (Anthopolos) of selected subgroup(s).
#'
#' @param geo Character string specifying the geography of the data either counties \code{geo = 'county'}, census tracts \code{geo = 'tract'} (the default), or census block groups \code{geo = 'cbg'}.
#' @param year Numeric. The year to compute the estimate. The default is 2020, and the years 2009 onward are currently available.
#' @param subgroup Character string specifying the racial or ethnic subgroup(s). See Details for available choices.
#' @param crs Numeric or character string specifying the coordinate reference system to compute the distance-based metric. The default is Albers North America \code{crs = 'ESRI:102008'}.
#' @param quiet Logical. If TRUE, will display messages about potential missing census information. The default is FALSE.
#' @param ... Arguments passed to \code{\link[tidycensus]{get_acs}} to select state, county, and other arguments for census characteristics
#'
#' @details This function will compute the spatial Racial Isolation Index (\emph{RI}) of U.S. census tracts or counties for a specified geographical extent (e.g., the entire U.S. or a single state) based on Anthopolos et al. (2011) \doi{10.1016/j.sste.2011.06.002} who originally designed the metric for the racial isolation of non-Hispanic Black individuals. This function provides the computation of \emph{RI} for any of the U.S. Census Bureau race or ethnicity subgroups (including Hispanic and non-Hispanic individuals).
#'
#' The function uses the \code{\link[tidycensus]{get_acs}} function to obtain U.S. Census Bureau 5-year American Community Survey characteristics used for the geospatial computation. The yearly estimates are available for 2009 onward when ACS-5 data are available but are available from other U.S. Census Bureau surveys. The twenty racial or ethnic subgroups (U.S. Census Bureau definitions) are:
#' \itemize{
#' \item \strong{B03002_002}: not Hispanic or Latino \code{'NHoL'}
#' \item \strong{B03002_003}: not Hispanic or Latino, white alone\code{'NHoLW'}
#' \item \strong{B03002_004}: not Hispanic or Latino, Black or African American alone \code{'NHoLB'}
#' \item \strong{B03002_005}: not Hispanic or Latino, American Indian and Alaska Native alone \code{'NHoLAIAN'}
#' \item \strong{B03002_006}: not Hispanic or Latino, Asian alone \code{'NHoLA'}
#' \item \strong{B03002_007}: not Hispanic or Latino, Native Hawaiian and Other Pacific Islander alone \code{'NHoLNHOPI'}
#' \item \strong{B03002_008}: not Hispanic or Latino, Some other race alone \code{'NHoLSOR'}
#' \item \strong{B03002_009}: not Hispanic or Latino, Two or more races \code{'NHoLTOMR'}
#' \item \strong{B03002_010}: not Hispanic or Latino, Two races including Some other race \code{'NHoLTRiSOR'}
#' \item \strong{B03002_011}: not Hispanic or Latino, Two races excluding Some other race, and three or more races \code{'NHoLTReSOR'}
#' \item \strong{B03002_012}: Hispanic or Latino \code{'HoL'}
#' \item \strong{B03002_013}: Hispanic or Latino, white alone \code{'HoLW'}
#' \item \strong{B03002_014}: Hispanic or Latino, Black or African American alone \code{'HoLB'}
#' \item \strong{B03002_015}: Hispanic or Latino, American Indian and Alaska Native alone \code{'HoLAIAN'}
#' \item \strong{B03002_016}: Hispanic or Latino, Asian alone \code{'HoLA'}
#' \item \strong{B03002_017}: Hispanic or Latino, Native Hawaiian and Other Pacific Islander alone \code{'HoLNHOPI'}
#' \item \strong{B03002_018}: Hispanic or Latino, Some other race alone \code{'HoLSOR'}
#' \item \strong{B03002_019}: Hispanic or Latino, Two or more races \code{'HoLTOMR'}
#' \item \strong{B03002_020}: Hispanic or Latino, Two races including Some other race \code{'HoLTRiSOR'}
#' \item \strong{B03002_021}: Hispanic or Latino, Two races excluding Some other race, and three or more races \code{'HoLTReSOR'}
#' }
#'
#' Use the internal \code{state} and \code{county} arguments within the \code{\link[tidycensus]{get_acs}} function to specify geographic extent of the data output. NOTE: Current version does not correct for edge effects (e.g., census geographies along the specified spatial extent border, coastline, or U.S.-Mexico / U.S.-Canada border) may have few neighboring census geographies, and \emph{RI} values in these census geographies may be unstable. A stop-gap solution for the former source of edge effect is to compute the \emph{RI} for neighboring census geographies (i.e., the states bordering a study area of interest) and then use the estimates of the study area of interest.
#'
#' A census geography (and its neighbors) that has nearly all of its population who identify with the specified race or ethnicity subgroup(s) (e.g., non-Hispanic or Latino, Black or African American alone) will have an \emph{RI} value close to 1. In contrast, a census geography (and its neighbors) that has nearly none of its population who identify with the specified race or ethnicity subgroup(s) (e.g., not non-Hispanic or Latino, Black or African American alone) will have an \emph{RI} value close to 0.
#'
#' @return An object of class 'list'. This is a named list with the following components:
#'
#' \describe{
#' \item{\code{ri}}{An object of class 'tbl' for the GEOID, name, \emph{RI}, and raw census values of specified census geographies.}
#' \item{\code{missing}}{An object of class 'tbl' of the count and proportion of missingness for each census variable used to compute \emph{RI}.}
#' }
#'
#' @import dplyr
#' @importFrom Matrix sparseMatrix
#' @importFrom sf st_drop_geometry st_geometry st_intersects st_transform
#' @importFrom stringr str_trim
#' @importFrom tidycensus get_acs
#' @importFrom tidyr pivot_longer separate
#' @export
#'
#' @seealso \code{\link[tidycensus]{get_acs}} for additional arguments for geographic extent selection (i.e., \code{state} and \code{county}).
#'
#' @examples
#' \dontrun{
#' # Wrapped in \dontrun{} because these examples require a Census API key.
#'
#' # Racial Isolation Index (a measure of isolation)
#' ## of Black populations
#' ## in census tracts of Georgia, U.S.A. (2020)
#' anthopolos(
#' geo = 'tract',
#' state = 'GA',
#' year = 2020,
#' subgroup = c('NHoLB', 'HoLB')
#' )
#'
#' }
#'
anthopolos <- function(geo = 'tract',
year = 2020,
subgroup,
crs = 'ESRI:102008',
quiet = FALSE,
...) {
# Check arguments
match.arg(geo, choices = c('county', 'tract', 'cbg', 'block group'))
stopifnot(is.numeric(year), year >= 2009) # all variables available 2009 onward
match.arg(
subgroup,
several.ok = TRUE,
choices = c(
'NHoL',
'NHoLW',
'NHoLB',
'NHoLAIAN',
'NHoLA',
'NHoLNHOPI',
'NHoLSOR',
'NHoLTOMR',
'NHoLTRiSOR',
'NHoLTReSOR',
'HoL',
'HoLW',
'HoLB',
'HoLAIAN',
'HoLA',
'HoLNHOPI',
'HoLSOR',
'HoLTOMR',
'HoLTRiSOR',
'HoLTReSOR'
)
)
# Select census variables
vars <- c(
TotalPop = 'B03002_001',
NHoL = 'B03002_002',
NHoLW = 'B03002_003',
NHoLB = 'B03002_004',
NHoLAIAN = 'B03002_005',
NHoLA = 'B03002_006',
NHoLNHOPI = 'B03002_007',
NHoLSOR = 'B03002_008',
NHoLTOMR = 'B03002_009',
NHoLTRiSOR = 'B03002_010',
NHoLTReSOR = 'B03002_011',
HoL = 'B03002_012',
HoLW = 'B03002_013',
HoLB = 'B03002_014',
HoLAIAN = 'B03002_015',
HoLA = 'B03002_016',
HoLNHOPI = 'B03002_017',
HoLSOR = 'B03002_018',
HoLTOMR = 'B03002_019',
HoLTRiSOR = 'B03002_020',
HoLTReSOR = 'B03002_021'
)
selected_vars <- vars[c('TotalPop', subgroup)]
out_names <- names(selected_vars) # save for output
prefix <- 'subgroup'
suffix <- seq(1:length(subgroup))
names(selected_vars) <- c('TotalPop', paste0(prefix, suffix))
in_names <- paste0(names(selected_vars), 'E')
# Acquire RI variables and sf geometries
out_dat <- suppressMessages(suppressWarnings(
tidycensus::get_acs(
geography = geo,
year = year,
output = 'wide',
variables = selected_vars,
geometry = TRUE,
...
)
))
# Format output
if (geo == 'county') {
out_dat <- out_dat %>%
tidyr::separate(NAME, into = c('county', 'state'), sep = ',')
}
if (geo == 'tract') {
out_dat <- out_dat %>%
tidyr::separate(NAME, into = c('tract', 'county', 'state'), sep = ',') %>%
dplyr::mutate(tract = gsub('[^0-9\\.]', '', tract))
}
if (geo == 'cbg' | geo == 'block group') {
out_dat <- out_dat %>%
tidyr::separate(NAME, into = c('cbg', 'tract', 'county', 'state'), sep = ',') %>%
dplyr::mutate(
tract = gsub('[^0-9\\.]', '', tract), cbg = gsub('[^0-9\\.]', '', cbg)
)
}
out_dat <- out_dat %>%
dplyr::mutate(subgroup = rowSums(sf::st_drop_geometry(out_dat[, in_names[-1]])))
# Compute RI
## From Anthopolos et al. (2011) https://doi.org/10.1016/j.sste.2011.06.002
## RI_{im} = (Sigma_{j∈∂_{i}} w_{ij} * T_{jm}) / (Sigma_{j∈∂_{i}} w_{ij} * T_{j})
## Where:
## ∂_{i} denotes the set of index units i and its neighbors
## Given M mutually exclusive racial or ethnic subgroups, m indexes the subgroups of M
## T_{i} denotes the total population in region i (TotalPop)
## T_{im} denotes the population of the selected subgroup(s) (subgroup1, ...)
## w_{ij} denotes a nXn first-order adjacency matrix, where n is the number of census geometries in the study area
### and the entries of w_{ij} are set to 1 if a boundary is shared by region i and region j and zero otherwise
### Entries of the main diagonal (since i∈∂_{i}, w_{ij} = w_{ii} when j = i) of w_{ij} are set to 1.5
### such that the weight of the index unit, i, is larger than the weights assigned to adjacent tracts
## Geospatial adjacency matrix (w_ij)
tmp <- out_dat %>%
sf::st_transform(crs = crs) %>%
sf::st_geometry() %>%
sf::st_intersects(sparse = TRUE)
names(tmp) <- as.character(seq_len(nrow(out_dat)))
tmp_L <- length(tmp)
tmp_counts <- unlist(Map(length, tmp))
tmp_i <- rep(1:tmp_L, tmp_counts)
tmp_j <- unlist(tmp)
w_ij <- Matrix::sparseMatrix(
i = tmp_i,
j = tmp_j,
x = 1,
dims = c(tmp_L, tmp_L)
)
diag(w_ij) <- 1.5
## Compute
out_dat <- out_dat %>%
sf::st_drop_geometry() # drop geometries (can join back later)
out_tmp <- list()
for (i in 1:dim(w_ij)[1]) {
out_tmp[[i]] <- sum(as.matrix(w_ij[i,]) * out_dat[, 'subgroup']) /
sum(as.matrix(w_ij[i,]) * out_dat[, 'TotalPopE'])
}
out_dat$RI <- unlist(out_tmp)
# Warning for missingness of census characteristics
missingYN <- out_dat[, in_names]
names(missingYN) <- out_names
missingYN <- missingYN %>%
tidyr::pivot_longer(
cols = dplyr::everything(),
names_to = 'variable',
values_to = 'val'
) %>%
dplyr::group_by(variable) %>%
dplyr::summarise(
total = dplyr::n(),
n_missing = sum(is.na(val)),
percent_missing = paste0(round(mean(is.na(val)) * 100, 2), ' %')
)
if (quiet == FALSE) {
# Warning for missing census data
if (sum(missingYN$n_missing) > 0) {
message('Warning: Missing census data')
}
}
# Format output
if (geo == 'county') {
out <- out_dat %>%
dplyr::select(c(GEOID, state, county, RI, dplyr::all_of(in_names)))
names(out) <- c('GEOID', 'state', 'county', 'RI', out_names)
}
if (geo == 'tract') {
out <- out_dat %>%
dplyr::select(c(GEOID, state, county, tract, RI, dplyr::all_of(in_names)))
names(out) <- c('GEOID', 'state', 'county', 'tract', 'RI', out_names)
}
if (geo == 'cbg' | geo == 'block group') {
out <- out_dat %>%
dplyr::select(c(GEOID, state, county, tract, cbg, RI, dplyr::all_of(in_names)))
names(out) <- c('GEOID', 'state', 'county', 'tract', 'cbg', 'RI', out_names)
}
out <- out %>%
dplyr::mutate(
state = stringr::str_trim(state),
county = stringr::str_trim(county)
) %>%
dplyr::arrange(GEOID) %>%
dplyr::as_tibble()
out <- list(ri = out, missing = missingYN)
return(out)
}