JRaviLab · eboyer221 · Jun 2, 2026 · Mar 5, 2026 · Mar 5, 2026 · May 22, 2026
diff --git a/vignettes/BVBRC_stats.Rmd b/vignettes/BVBRC_stats.Rmd
@@ -0,0 +1,264 @@
+---
+title: "BV-BRC Metadata Statistics"
+author: "Abhirupa Ghosh"
+date: "`r Sys.Date()`"
+output: rmarkdown::html_vignette
+vignette: >
+  %\VignetteIndexEntry{BV-BRC Metadata Statistics}
+  %\VignetteEngine{knitr::rmarkdown}
+  %\VignetteEncoding{UTF-8}
+---
+
+This vignette summarizes the completeness of the bacterial metadata table served by BV-BRC. It pulls the full bacterial-genome metadata dump from BV-BRC via the project's CLI Docker image, normalizes character columns (UTF-8 fixes, whitespace trimming, mapping placeholder strings like `"NA"`, `"unknown"`, and `"not reported"` to actual `NA`s), and reports per-column missingness plus block-level presence/absence for host and geographic fields. The goal is to help decide which BV-BRC columns are worth featurizing downstream.
+
+```{r setup, include=FALSE}
+library(dplyr)
+library(stringr)
+library(purrr)
+library(tibble)
+```
+
+## Helper functions
+
+The chunk below defines all helpers used in the rest of the vignette:
+
+- `fetchCompleteBVBRCMetadata()` — calls `docker run --rm danylmb/bvbrc:5.3 p3-all-genomes` twice: once to enumerate the available attributes, then again to pull the full bacterial-genome metadata table with those attributes selected. Returns a tibble with all character columns coerced to UTF-8.
+- `is_placeholder_na()` and `clean_all_char_cols()` — normalize text columns by trimming whitespace, fixing UTF-8 glitches, and treating sentinel strings (`"NA"`, `"unknown"`, `"not reported"`, etc.) as `NA` rather than valid values.
+- `summarize_block()` — computes per-row presence/absence summaries across a group of related columns (e.g. all host columns or all geographic columns).
+
+```{r functions, echo=FALSE}
+# function to download BVBRC metadata for all Bacteria
+fetchCompleteBVBRCMetadata <- function(
+  image = "danylmb/bvbrc:5.3",
+  verbose = TRUE
+) {
+  if (isTRUE(verbose)) {
+    message("Building attribute list from BV-BRC...")
+  }
+
+  # Step 1: get cleaned attribute list
+  attr_cmd <- glue::glue(
+    "docker run --rm {image} p3-all-genomes -f | ",
+    "tail -n +2 | ",
+    "sed -E 's/ \\((multi|related|derived)\\)//g' | ",
+    "tr '\\n' ',' | sed 's/,$//'"
+  )
+
+  ATTRS <- tryCatch(
+    system(attr_cmd, intern = TRUE),
+    error = function(e) {
+      stop("Failed to retrieve attribute list.\n", conditionMessage(e))
+    }
+  )
+
+  if (length(ATTRS) == 0L) {
+    stop("Attribute list is empty.")
+  }
+
+  if (isTRUE(verbose)) {
+    message("Fetching BV-BRC bacterial genome metadata (this may take time)...")
+  }
+
+  # Step 2: use ATTRS in main query
+  cmd <- glue::glue(
+    "docker run --rm {image} p3-all-genomes ",
+    "--in superkingdom,Bacteria ",
+    "--attr \"{ATTRS}\""
+  )
+
+  raw_data <- tryCatch(
+    system(cmd, intern = TRUE, ignore.stderr = TRUE),
+    error = function(e) {
+      stop("BV-BRC CLI call failed.\n", conditionMessage(e))
+    }
+  )
+
+  if (length(raw_data) == 0L) {
+    stop("BV-BRC returned no data. The service may be unavailable.")
+  }
+
+  # Step 3: parse safely
+  df <- utils::read.table(
+    text = raw_data,
+    sep = "\t",
+    header = TRUE,
+    fill = TRUE,
+    quote = "",
+    check.names = FALSE,
+    comment.char = "",
+    colClasses = "character"
+  )
+
+  df <- tibble::as_tibble(df) |>
+    dplyr::mutate(dplyr::across(
+      dplyr::everything(),
+      ~ iconv(.x, from = "", to = "UTF-8", sub = "")
+    ))
+
+  if (isTRUE(verbose)) {
+    message(glue::glue("Retrieved {nrow(df)} rows x {ncol(df)} columns."))
+  }
+
+  return(df)
+}
+
+# Use your fix_utf8() if defined; otherwise a no-op fallback
+if (!exists("fix_utf8")) {
+  fix_utf8 <- function(x) x
+}
+
+# Tokens that should count as missing for text fields
+missing_tokens <- c(
+  "", " ", "NA", "N/A", "n/a", "na", "null", "Null", "NULL",
+  "unknown", "Unknown", "UNKNOWN",
+  "not provided", "Not provided", "NOT PROVIDED",
+  "not reported", "Not reported", "NOT REPORTED",
+  "unavailable", "Unspecified", "unspecified", "missing"
+)
+
+is_placeholder_na <- function(x) {
+  if (!is.character(x)) {
+    return(rep(FALSE, length(x)))
+  }
+  str_trim(x) %in% missing_tokens
+}
+
+# Clean ALL character/factor columns in the data frame
+clean_all_char_cols <- function(df) {
+  df |>
+    mutate(across(
+      where(~ is.character(.x) || is.factor(.x)),
+      ~ {
+        x <- .
+        if (is.factor(x)) x <- as.character(x)
+        if (is.character(x)) {
+          x <- fix_utf8(x)
+          x <- str_squish(x) # trim + collapse internal spaces
+          x[is_placeholder_na(x)] <- NA # "", "N/A", "unknown", etc. -> NA
+        }
+        x
+      }
+    ))
+}
+
+# --- Block-level summaries ---------------------------------------------------
+summarize_block <- function(df, cols, block_name) {
+  if (length(cols) == 0) {
+    return(tibble(
+      block = block_name,
+      n_total = nrow(df),
+      n_rows_all_missing = nrow(df),
+      pct_all_missing = 100,
+      n_rows_any_present = 0,
+      pct_any_present = 0
+    ))
+  }
+  df |>
+    transmute(
+      .row_any_present = if_any(all_of(cols), ~ !is.na(.x)),
+      .row_all_missing = if_all(all_of(cols), is.na)
+    ) |>
+    summarise(
+      block = block_name,
+      n_total = n(),
+      n_rows_all_missing = sum(.row_all_missing),
+      pct_all_missing = 100 * n_rows_all_missing / n_total,
+      n_rows_any_present = sum(.row_any_present),
+      pct_any_present = 100 * n_rows_any_present / n_total,
+      .groups = "drop"
+    )
+}
+```
+
+## Compute statistics
+
+Here we fetch the BV-BRC bacterial metadata table, clean it, and compute four artifacts that drive the rest of the vignette: `col_stats` (one row per column with totals, missing counts, and distinct non-missing values), `host_block_stats` and `geo_block_stats` (per-block presence summaries across host and geographic columns respectively), and `diag_table` (a focused per-column view of just the host + geo columns).
+
+```{r stats}
+# download the bvbrc table
+bvbrc <- fetchCompleteBVBRCMetadata()
+
+# Apply to your table
+bvbrc_clean <- clean_all_char_cols(bvbrc)
+# --- Per-column stats --------------------------------------------------------
+col_stats <- tibble(column = colnames(bvbrc_clean)) |>
+  mutate(
+    n_total = nrow(bvbrc_clean),
+    n_missing = map_int(column, ~ sum(is.na(bvbrc_clean[[.x]]))),
+    n_present = n_total - n_missing,
+    pct_missing = 100 * n_missing / n_total,
+    n_distinct_non_missing = map_int(column, ~ n_distinct(bvbrc_clean[[.x]], na.rm = TRUE))
+  ) |>
+  arrange(desc(pct_missing), column)
+
+# --- Define blocks -----------------------------------------------------------
+host_cols <- c(
+  "genome.host_common_name",
+  "genome.host_group",
+  "genome.host_name"
+)
+
+geo_cols <- c(
+  "genome.isolation_source",
+  "genome.geographic_location",
+  "genome.isolation_country",
+  "genome.state_province",
+  "genome.city",
+  "genome.county",
+  "genome.latitude",
+  "genome.longitude"
+)
+
+host_cols <- intersect(host_cols, colnames(bvbrc))
+geo_cols <- intersect(geo_cols, colnames(bvbrc))
+
+host_block_stats <- summarize_block(bvbrc_clean, host_cols, "host")
+geo_block_stats <- summarize_block(bvbrc_clean, geo_cols, "geographic")
+
+# --- Quick diagnostics: check specific columns you care about ----------------
+diag_cols <- unique(c(host_cols, geo_cols))
+diag_table <- tibble(column = diag_cols) |>
+  mutate(
+    n_total = nrow(bvbrc_clean),
+    n_missing = map_int(column, ~ sum(is.na(bvbrc_clean[[.x]]))),
+    n_present = n_total - n_missing,
+    pct_missing = 100 * n_missing / n_total
+  ) |>
+  arrange(desc(pct_missing), column)
+```
+
+## Per-column completeness
+
+The table below shows how complete each column in the BV-BRC bacterial metadata is — sorted with the most sparsely populated columns first. Columns with very high `pct_missing` are unlikely to be useful downstream and may be worth dropping from featurization.
+
+```{r col_stats_output}
+cat("\n--- Per-column stats (whole table) ---\n")
+print(col_stats, n = Inf)
+```
+
+## Host block summary
+
+Presence of the host-related columns (`host_common_name`, `host_group`, `host_name`) treated as a block. `n_rows_any_present` is the number of rows where at least one host field is populated; `n_rows_all_missing` is the number where every host field is empty.
+
+```{r host_block_output}
+cat("\n--- Host block stats ---\n")
+print(host_block_stats)
+```
+
+## Geographic block summary
+
+Same idea as the host block, but across the geographic columns (`isolation_source`, `geographic_location`, `isolation_country`, `state_province`, `city`, `county`, `latitude`, `longitude`).
+
+```{r geo_block_output}
+cat("\n--- Geographic block stats ---\n")
+print(geo_block_stats)
+```
+
+## Diagnostics: per-column breakdown of host + geo
+
+Column-by-column missingness for just the host and geographic fields, so you can see which specific columns drive the block-level numbers above.
+
+```{r diag_output}
+cat("\n--- Diagnostics for host + geo columns ---\n")
+print(diag_table, n = Inf)
+```