--- title: "GeoTox Package Data" output: rmarkdown::html_vignette vignette: > %\VignetteIndexEntry{GeoTox Package Data} %\VignetteEngine{knitr::rmarkdown} %\VignetteEncoding{UTF-8} --- ```{r, include = FALSE} knitr::opts_chunk$set( collapse = TRUE, comment = "#>" ) ``` This package includes example data `geo_tox_data`. Below is a description of the data and example code for how it was gathered. > **NOTE:** FIPS codes can change. Since data is being pulled from various sources, ensure that the FIPS values can be used to connect data across these sources. For example, in 2022 Connecticut began the process of going from 8 legacy counties to 9 planning regions. ```{r setup, message=FALSE} library(dplyr) library(sf) library(tidyr) library(readr) library(stringr) library(purrr) library(readxl) library(httk) library(httr2) geo_tox_data <- list() ``` # Chemical data ## Exposure data Download modeled exposure data from AirToxScreen. Results from AirToxScreen 2019 for a subset of chemicals in North Carolina counties are included in the package data as `geo_tox_data$exposure`. > **NOTE:** The 2020 release does not currently provide a single file for the exposure national concentration summaries. The 2019 data can be found following the "Previous air toxics assessments" link. ```{r, eval=FALSE} filename <- "2019_Toxics_Exposure_Concentrations.xlsx" tmp <- tempfile(filename) download.file( paste0("https://www.epa.gov/system/files/documents/2022-12/", filename), tmp ) exposure <- read_xlsx(tmp) # Normalization function min_max_norm = function(x) { min_x <- min(x, na.rm = TRUE) max_x <- max(x, na.rm = TRUE) if (min_x == max_x) { rep(0, length(x)) } else { (x - min_x) / (max_x - min_x) } } geo_tox_data$exposure <- exposure |> # North Carolina counties filter(State == "NC", !grepl("0$", FIPS)) |> # Aggregate chemicals by county summarize(across(-c(State:Tract), c(mean, sd)), .by = FIPS) |> pivot_longer(-FIPS, names_to = "chemical") |> mutate(stat = if_else(grepl("_1$", chemical), "mean", "sd"), chemical = gsub('.{2}$', '', chemical)) |> pivot_wider(names_from = stat) |> # Normalize concentrations mutate(norm = min_max_norm(mean), .by = chemical) ``` ### Get CASRN and PREFERRED_NAME from CompTox Dashboard ```{r, eval=FALSE} # Copy/paste the chemical names into the batch search field # https://comptox.epa.gov/dashboard/batch-search cat(geo_tox_data$exposure |> distinct(chemical) |> pull(), sep = "\n") # Export results with "CAS-RN" identifiers as a csv file, then process in R exposure_casrn <- read_csv("CCD-Batch-Search.csv", show_col_types = FALSE) |> filter(DTXSID != "N/A") |> # Prioritize results based on FOUND_BY status arrange(INPUT, grepl("Approved Name", FOUND_BY), grepl("^Synonym", FOUND_BY)) |> # Keep one result per INPUT group_by(INPUT) |> slice(1) |> ungroup() # Update exposure data with CompTox Dashboard data geo_tox_data$exposure <- geo_tox_data$exposure |> inner_join(exposure_casrn, by = join_by(chemical == INPUT)) |> select(FIPS, casn = CASRN, chnm = PREFERRED_NAME, mean, sd, norm) ``` ## ICE cHTS data Use ICE cHTS data to identify active chemicals for a given set of assays. ```{r, eval=FALSE} get_cHTS_hits <- function(assays = NULL, chemids = NULL) { if (is.null(assays) & is.null(chemids)) { stop("Must provide at least one of 'assays' or 'chemids'") } # Format query parameters req_params <- list() if (!is.null(assays)) { if (!is.list(assays)) assays <- as.list(assays) req_params$assays <- assays } if (!is.null(chemids)) { if (!is.list(chemids)) chemids <- as.list(chemids) req_params$chemids <- chemids } # Query ICE API resp <- request("https://ice.ntp.niehs.nih.gov/api/v1/search") |> req_body_json(req_params) |> req_perform() if (resp$status_code != 200) { stop("Failed to retrieve data from ICE API") } # Return active chemicals result <- resp |> resp_body_json() |> pluck("endPoints") fields <- c("assay", "casrn", "dtxsid", "substanceName", "endpoint", "value") map(fields, \(x) map_chr(result, x)) |> set_names(fields) |> bind_cols() |> filter(endpoint == "Call", value == "Active") |> select(-c(endpoint, value)) |> distinct() } assays <- c("APR_HepG2_p53Act_1h_dn", "APR_HepG2_p53Act_1h_up", "APR_HepG2_p53Act_24h_dn", "APR_HepG2_p53Act_24h_up", "APR_HepG2_p53Act_72h_dn", "APR_HepG2_p53Act_72h_up", "ATG_p53_CIS_up", "TOX21_DT40", "TOX21_DT40_100", "TOX21_DT40_657", "TOX21_ELG1_LUC_Agonist", "TOX21_H2AX_HTRF_CHO_Agonist_ratio", "TOX21_p53_BLA_p1_ratio", "TOX21_p53_BLA_p2_ratio", "TOX21_p53_BLA_p3_ratio", "TOX21_p53_BLA_p4_ratio", "TOX21_p53_BLA_p5_ratio") chemids <- unique(geo_tox_data$exposure$casn) cHTS_hits_API <- get_cHTS_hits(assays = assays, chemids = chemids) ``` ## Dose-response data Use the ICE API to retrieve dose-response data for selected assays and chemicals. ```{r, eval=FALSE} get_ICE_dose_resp <- function(assays = NULL, chemids = NULL) { if (is.null(assays) & is.null(chemids)) { stop("Must provide at least one of 'assays' or 'chemids'") } # Format query parameters req_params <- list() if (!is.null(assays)) { if (!is.list(assays)) assays <- as.list(assays) req_params$assays <- assays } if (!is.null(chemids)) { if (!is.list(chemids)) chemids <- as.list(chemids) req_params$chemids <- chemids } # Query ICE API resp <- request("https://ice.ntp.niehs.nih.gov/api/v1/curves") |> req_body_json(req_params) |> req_perform() if (resp$status_code != 200) { stop("Failed to retrieve data from ICE API") } # Return dose-response data result <- resp |> resp_body_json() |> pluck("curves") map(result, function(x) { tibble( endp = x[["assay"]], casn = x[["casrn"]], call = x[["dsstoxsid"]], chnm = x[["substance"]], call = x[["call"]], logc = map_dbl(x[["concentrationResponses"]], "concentration") |> log10(), resp = map_dbl(x[["concentrationResponses"]], "response") ) }) |> bind_rows() } assays <- unique(cHTS_hits_API$assay) chemids <- intersect(cHTS_hits_API$casrn, geo_tox_data$exposure$casn) dose_response <- get_ICE_dose_resp(assays = assays, chemids = chemids) # Only keep active calls for assay/chemical combinations geo_tox_data$dose_response <- dose_response |> filter(call == "Active") |> select(-call) # Update dose-response data with CompTox Dashboard data geo_tox_data$dose_response <- geo_tox_data$dose_response |> inner_join(exposure_casrn, by = join_by(casn == CASRN)) |> select(endp, casn, chnm = PREFERRED_NAME, logc, resp) ``` # Population data ## Age Download age data from the U.S. Census Bureau by searching for "County Population by Characteristics". A subset of data for North Carolina from 2019 is included in the package data as `geo_tox_data$age`. ```{r, eval=FALSE} # Data for North Carolina url <- paste0("https://www2.census.gov/programs-surveys/popest/datasets/", "2010-2019/counties/asrh/cc-est2019-alldata-37.csv") age <- read_csv(url, show_col_types = FALSE) geo_tox_data$age <- age |> # 7/1/2019 population estimate filter(YEAR == 12) |> # Create FIPS mutate(FIPS = str_c(STATE, COUNTY)) |> # Keep selected columns select(FIPS, AGEGRP, TOT_POP) ``` ## Obesity Follow the "Data Portal" link from CDC PLACES and search for "places county data". Go to the desired dataset webpage, for example 2020 county data, and download the data by selecting Actions → API → Download file. A subset of data for North Carolina is included in the package data as `geo_tox_data$obesity`. ```{r, eval=FALSE} places <- read_csv("PLACES__County_Data__GIS_Friendly_Format___2020_release.csv", show_col_types = FALSE) # Convert confidence interval to standard deviation extract_SD <- function(x) { range <- as.numeric(str_split_1(str_sub(x, 2, -2), ",")) diff(range) / 3.92 } geo_tox_data$obesity <- places |> # North Carolina Counties filter(StateAbbr == "NC") |> # Select obesity data select(FIPS = CountyFIPS, OBESITY_CrudePrev, OBESITY_Crude95CI) |> # Change confidence interval to standard deviation rowwise() |> mutate(OBESITY_SD = extract_SD(OBESITY_Crude95CI)) |> ungroup() |> select(-OBESITY_Crude95CI) ``` ## Steady-state plasma concentration (`C_ss`) Use `httk` to generate `C_ss` values for combinations of age group and weight status for each chemical. The generation of these values is a time-intensive step, so one approach is to generate populations of `C_ss` values initially and then sample them later. ```{r, eval=FALSE} set.seed(2345) n_samples <- 500 # Get CASN for which httk simulation is possible. Try using load_dawson2021, # load_sipes2017, or load_pradeep2020 to increase availability. load_sipes2017() casn <- intersect(unique(geo_tox_data$dose_response$casn), get_cheminfo(suppress.messages = TRUE)) # Define population demographics for httk simulation pop_demo <- cross_join( tibble(age_group = list(c(0, 2), c(3, 5), c(6, 10), c(11, 15), c(16, 20), c(21, 30), c(31, 40), c(41, 50), c(51, 60), c(61, 70), c(71, 100))), tibble(weight = c("Normal", "Obese"))) |> # Create column of lower age_group values rowwise() |> mutate(age_min = age_group[1]) |> ungroup() # Create wrapper function around httk steps simulate_css <- function(chem.cas, agelim_years, weight_category, samples, verbose = TRUE) { if (verbose) { cat(chem.cas, paste0("(", paste(agelim_years, collapse = ", "), ")"), weight_category, "\n") } httkpop <- list(method = "vi", gendernum = NULL, agelim_years = agelim_years, agelim_months = NULL, weight_category = weight_category, reths = c( "Mexican American", "Other Hispanic", "Non-Hispanic White", "Non-Hispanic Black", "Other" )) css <- try( suppressWarnings({ mcs <- create_mc_samples(chem.cas = chem.cas, samples = samples, httkpop.generate.arg.list = httkpop, suppress.messages = TRUE) calc_analytic_css(chem.cas = chem.cas, parameters = mcs, model = "3compartmentss", suppress.messages = TRUE) }), silent = TRUE ) # Return if (is(css, "try-error")) { warning(paste0("simulate_css failed to generate data for CASN ", chem.cas)) list(NA) } else { list(css) } } # Simulate `C_ss` values simulated_css <- map(casn, function(chem.cas) { pop_demo |> rowwise() |> mutate( css = simulate_css(.env$chem.cas, age_group, weight, .env$n_samples) ) |> ungroup() }) simulated_css <- setNames(simulated_css, casn) # Remove CASN that failed simulate_css casn_keep <- map_lgl(simulated_css, function(df) { !(length(df$css[[1]]) == 1 && is.na(df$css[[1]])) }) simulated_css <- simulated_css[casn_keep] # Get median `C_ss` values for each age_group simulated_css <- map( simulated_css, function(cas_df) { cas_df |> nest(.by = age_group) |> mutate( age_median_css = map_dbl(data, function(df) median(unlist(df$css), na.rm = TRUE)) ) |> unnest(data) } ) # Get median `C_ss` values for each weight simulated_css <- map( simulated_css, function(cas_df) { cas_df |> nest(.by = weight) |> mutate( weight_median_css = map_dbl(data, function(df) median(unlist(df$css), na.rm = TRUE)) ) |> unnest(data) |> arrange(age_min, weight) } ) geo_tox_data$simulated_css <- simulated_css ``` # Prune data Retain only those chemicals found in exposure, dose-response and `C_ss` datasets. ```{r, eval=FALSE} casn_keep <- names(geo_tox_data$simulated_css) geo_tox_data$exposure <- geo_tox_data$exposure |> filter(casn %in% casn_keep) geo_tox_data$dose_response <- geo_tox_data$dose_response |> filter(casn %in% casn_keep) ``` # County/State boundaries Download cartographic boundary files for counties and states from the U.S. Census Bureau. The geometry data for North Carolina counties and the state are included in the package data as `geo_tox_data$boundaries`. ```{r, eval=FALSE} county <- st_read("cb_2019_us_county_5m/cb_2019_us_county_5m.shp") state <- st_read("cb_2019_us_state_5m/cb_2019_us_state_5m.shp") geo_tox_data$boundaries <- list( county = county |> filter(STATEFP == 37) |> select(FIPS = GEOID, geometry), state = state |> filter(STATEFP == 37) |> select(geometry) ) ```