This script prepares and summarizes livestock feed intake data to support the development of predictive models for estimating feed intake using various biological and nutritional variables. It merges cleaned extraction data with species, production stage, nutritional composition, digestibility, and outcome metrics such as body weight, daily gain, and milk or meat yield. The aim is to assess data coverage across species and variable combinations (e.g. BW + DMD, CP + ADG) and identify suitable subsets for building robust estimation models. The summary tables help evaluate where data is sufficient to support modeling efforts for different livestock groups.
library(tidyverse)
library(readr)
library(DT)
library(dplyr)
library(tidyr)
library(tibble)
library(purrr)
library(stringr)
library(knitr)
merged_metadata <- readRDS("~/ERL/merged_metadata.rds")
feed_intake <- read.csv("~/ERL/feed_intake.csv")
# ── 1) merge product info into the cleaned feed_intake ----------------------
prod_tbl <- merged_metadata$Prod.Out %>%
select(B.Code,
Species = P.Product, # rename for clarity
Variety = P.Variety,
P.Practice,
Stage = Herd.Stage) # keep stage here
# ── build lookup from the rows that already have a species -------------------
var_species_lu <- prod_tbl %>% # from previous step
filter(Species != "Not Found", !is.na(Variety)) %>%
distinct(Variety, Species) # one row per variety
# ── back-fill missing species ------------------------------------------------
prod_tbl <- prod_tbl %>%
left_join(var_species_lu, by = "Variety", suffix = c("", ".lu")) %>%
mutate(
Species = if_else(Species == "Not Found" | is.na(Species),
Species.lu, Species)
) %>%
select(-Species.lu) # drop helper column
feed_intake <- feed_intake %>%
left_join(prod_tbl, by = "B.Code")%>%
select(-c("X"))
This section summarizes how many unique papers include feed intake data for each species. The goal is to understand species-level data availability.
# Number of unique papers per species
papers_per_species <- feed_intake %>%
filter(!is.na(Species)) %>%
group_by(Species) %>%
summarise(
n_unique_papers = n_distinct(B.Code),
.groups = "drop"
) %>%
arrange(Species)
datatable(
papers_per_species,
caption = htmltools::tags$caption(
style = "caption-side: top; text-align: left; font-weight: bold;",
"Number of Unique Papers with Feed Intake Information by Species"
),
options = list(
pageLength = 10,
autoWidth = TRUE
),
rownames = FALSE
)
This summary table breaks down feed intake observations by both species and variety. It includes the number of observations and papers, as well as the average and standard deviation of reported feed intake values. This helps assess the data’s granularity and distribution across different breeds or livestock types.
## ── 3) summary table for Claudia --------------------------------------------
fi_summary <- feed_intake %>%
filter(!is.na(Species)) %>% # ← drop NA species rows
group_by(Species, Variety) %>%
summarise(
n_obs = n(),
n_bcodes = n_distinct(B.Code), # ← number of unique B.Codes
mean_kg = mean(ED.Mean.T, na.rm = TRUE),
sd_kg = sd (ED.Mean.T, na.rm = TRUE),
.groups = "drop"
) %>%
mutate(
mean_kg = format(mean_kg, digits = 3, scientific = FALSE),
sd_kg = format(sd_kg, digits = 3, scientific = FALSE)
) %>%
arrange(Species, Variety)
datatable(
fi_summary,
caption = htmltools::tags$caption(
style = "caption-side: top; text-align: left; font-weight: bold;",
"Summary of Feed Intake by Species and Variety"
),
options = list(
pageLength = 10,
autoWidth = TRUE
),
rownames = FALSE
)
To model feed intake effectively, we need to connect it with other biological outcome data. In this section, we join feed intake records with key variables such as bodyweight, average daily gain (ADG), milk yield, and meat yield. These variables serve as potential predictors in estimation models and allow us to quantify animal performance alongside feed intake.
##############################################################################
# 1) ─── Helper outcome tables (deduplicated) ────────────────────────────────
##############################################################################
bw_tbl <- merged_metadata$Data.Out %>% # start-BW (kg)
filter(Out.Subind == "Weight Gain", !is.na(Out.WG.Start)) %>%
mutate(
Out.WG.Unit = tolower(Out.WG.Unit),
bw_kg = dplyr::case_when(
Out.WG.Unit == "g" ~ Out.WG.Start / 1e3,
Out.WG.Unit == "mg" ~ Out.WG.Start / 1e6,
TRUE ~ Out.WG.Start # kg, unspecified, NA
)
) %>%
group_by(B.Code, A.Level.Name) %>%
summarise(bw_kg = mean(bw_kg, na.rm = TRUE), .groups = "drop")
adg_tbl <- merged_metadata$Data.Out %>% # ADG (g day-¹)
filter(Out.Subind == "Weight Gain", !is.na(ED.Mean.T)) %>%
mutate(
Out.Unit = tolower(trimws(Out.Unit)),
adg_g_day = dplyr::case_when(
grepl("^kg", Out.Unit) ~ ED.Mean.T * 1e3, # kg → g
grepl("^g", Out.Unit) ~ ED.Mean.T,
TRUE ~ NA_real_
)
) %>%
group_by(B.Code, A.Level.Name) %>%
summarise(adg_g_day = mean(adg_g_day, na.rm = TRUE), .groups = "drop")
meat_yield_tbl <- merged_metadata$Data.Out %>%
filter(Out.Subind == "Meat Yield", !is.na(ED.Mean.T)) %>%
mutate(
Out.Unit = tolower(trimws(Out.Unit)),
meat_g_day = dplyr::case_when(
grepl("^kg", Out.Unit) ~ ED.Mean.T * 1e3, # kg → g
grepl("^g", Out.Unit) ~ ED.Mean.T,
TRUE ~ NA_real_
)
) %>%
group_by(B.Code, A.Level.Name) %>%
summarise(meat_g_day = mean(meat_g_day, na.rm = TRUE), .groups = "drop")
milk_tbl <- merged_metadata$Data.Out %>% # milk (kg day-¹)
filter(Out.Subind == "Milk Yield", !is.na(ED.Mean.T)) %>%
mutate(
Out.Unit = tolower(trimws(Out.Unit)),
milk_kg_day = dplyr::case_when(
grepl("^g", Out.Unit) ~ ED.Mean.T / 1e3,
grepl("^kg", Out.Unit) ~ ED.Mean.T,
grepl("^l", Out.Unit) ~ ED.Mean.T, # 1 L ≈ 1 kg
TRUE ~ NA_real_
)
) %>%
group_by(B.Code, A.Level.Name) %>%
summarise(milk_kg_day = mean(milk_kg_day, na.rm = TRUE), .groups = "drop")
feed_intake_wide <- feed_intake %>%
## ── 1. Create tidy feed-intake columns ──────────────────────────────────
mutate(
feed_intake_unit = paste0(Out.Unit.Amount, "/", Out.Unit.Time), # e.g. "kg/day"
feed_intake_value = ED.Mean.T,
feed_intake_error = ED.Error
) %>%
## ── 2. Drop raw / helper columns no longer needed ──────────────────────
select(-ED.Intake.Item.Raw,
-ED.Mean.T, -ED.Error, -Out.Subind,
-Out.WG.Start, -Out.WG.Unit,
-Out.Unit.Animals, -Out.Unit.Time, -Out.Unit.Amount) %>%
## ── 3. Join BW, ADG, Milk tables ───────────────────────────────────────
left_join(bw_tbl, by = c("B.Code", "A.Level.Name")) %>%
left_join(adg_tbl, by = c("B.Code", "A.Level.Name")) %>%
left_join(milk_tbl, by = c("B.Code", "A.Level.Name")) %>%
left_join(meat_yield_tbl, by = c("B.Code", "A.Level.Name")) %>%
## ── 4. Carry single-value outcomes within each study up / down ─────────
group_by(B.Code) %>%
tidyr::fill(bw_kg, adg_g_day, milk_kg_day,meat_g_day, .direction = "downup") %>%
ungroup() %>%
## ── 5. Keep only unique rows ───────────────────────────────────────────
distinct() %>%
## ── 6. Arrange final column order for readability ──────────────────────
relocate(
ED.Intake.Item,
feed_intake_unit:feed_intake_error,
bw_kg, adg_g_day, milk_kg_day,
.after = A.Level.Name
)
Feed composition and digestibility are key drivers of intake and animal performance. In this section, we incorporate nutritional values—such as crude protein (CP), neutral detergent fiber (NDF), and others—and digestibility metrics like dry matter digestibility (DMD) into the dataset.
Originally, we used only the raw nutritional data extracted directly from the papers. However, to improve completeness and consistency, we also imputed missing nutritional values using open data sources such as Feedipedia and the ILRI feed composition database. These reference datasets were linked to each diet ingredient using our AOM ontology, allowing for standardized nutritional estimates even when papers lacked complete reporting.
The final merged dataset reflects a combination of reported and imputed values, joined based on whether each observation corresponds to an entire diet or a single feed ingredient.
nutrition<-merged_metadata$Animals.Diet.Comp
digestibility<-merged_metadata$Animals.Diet.Digest
nutrition_wide <- nutrition %>%
dplyr::mutate(DC.Value = as.numeric(DC.Value)) %>%
dplyr::group_by(B.Code, D.Item, DC.Variable,is_entire_diet) %>%
dplyr::summarise(
DC.Value = mean(DC.Value, na.rm = TRUE),
is_entire_diet = dplyr::first(is_entire_diet),
.groups = "drop"
) %>%
tidyr::pivot_wider(
names_from = DC.Variable,
values_from = DC.Value
) %>%
dplyr::rename(A.Level.Name = D.Item) %>%
dplyr::rename_with(~ paste0(.x, "_nutrition"), .cols = -c(B.Code, A.Level.Name, is_entire_diet)) %>%
dplyr::select(-is_entire_diet) # ← drop flag here
# Digestibility -------------------------------------------------------------
digestibility_wide <- digestibility %>%
dplyr::mutate(DD.Value = as.numeric(DD.Value)) %>%
dplyr::group_by(B.Code, D.Item, DD.Variable,is_entire_diet) %>%
dplyr::summarise(
DD.Value = mean(DD.Value, na.rm = TRUE),
is_entire_diet = dplyr::first(is_entire_diet),
.groups = "drop"
) %>%
tidyr::pivot_wider(
names_from = DD.Variable,
values_from = DD.Value
) %>%
dplyr::rename(A.Level.Name = D.Item) %>%
dplyr::rename_with(~ paste0(.x, "_digest"), .cols = -c(B.Code, A.Level.Name, is_entire_diet)) %>%
dplyr::select(-is_entire_diet) # ← drop flag here
# ── 7) Merge nutrition & digestibility with feed_intake_wide --------------
# Split based on is_entire_diet
fi_entire <- feed_intake_wide %>% filter(is_entire_diet == TRUE)
fi_partial <- feed_intake_wide %>%
filter(is_entire_diet == FALSE | is.na(is_entire_diet))
# Join each subset separately
fi_entire <- fi_entire %>%
left_join(nutrition_wide, by = c("B.Code", "A.Level.Name")) %>%
left_join(digestibility_wide, by = c("B.Code", "A.Level.Name"))
fi_partial <- fi_partial %>%
left_join(nutrition_wide, by = c("B.Code", "ED.Intake.Item" = "A.Level.Name")) %>%
left_join(digestibility_wide, by = c("B.Code", "ED.Intake.Item" = "A.Level.Name"))
# Combine both and clean up
feed_intake_wide <- bind_rows(fi_entire, fi_partial)
# (Optional) expose unit look‑ups ------------------------------------------------
# attr(nutrition_wide, "units")
# attr(digestibility_wide, "units")
This interactive table displays the complete merged dataset, now
enriched with outcome metrics and feed characteristics. Each row
corresponds to a treatment or diet observation, with harmonized units
and identifiers. The ED.Intake.Item
field shows whether the
row corresponds to an entire diet or a specific feed ingredient.
feed_intake_wide <- feed_intake_wide %>%
mutate(
ED.Intake.Item = if_else(
ED.Intake.Item == A.Level.Name,
"Entire Diet",
ED.Intake.Item
),
is_entire_diet = if_else(
is.na(is_entire_diet),
FALSE,
is_entire_diet
)
)
datatable(
feed_intake_wide,
caption = htmltools::tags$caption(
style = "caption-side: top; text-align: left; font-weight: bold;",
"Feed Intake Data — Wide Format with Diet Classification"
),
options = list(
pageLength = 10,
scrollX = TRUE, # Enable horizontal scroll for wide tables
autoWidth = TRUE
),
rownames = FALSE
)
## Warning in instance$preRenderHook(instance): It seems your data is too big for
## client-side DataTables. You may consider server-side processing:
## https://rstudio.github.io/DT/server.html
To guide model design, we assess which combinations of variables are commonly reported together. The function below builds summary tables showing how many studies and experimental units report key variable pairs (e.g., BW + DMD, BW + CP). This allows us to evaluate which variable combinations are best suited for predicting feed intake.
These tables summarize data coverage across species for different combinations of predictor variables. Each “combo” shows how many studies include the required data on the same row. This is essential for determining feasible input combinations when building estimation models for each species.
# ---------- Helper: strict coverage on ROWS (same observation) --------------
build_coverage_strict <- function(df, exp_units = c("max","sum")) {
exp_units <- match.arg(exp_units)
rows <- df %>%
mutate(
units = suppressWarnings(as.numeric(T.Animals)),
has_bw = !is.na(bw_kg),
has_milk = !is.na(milk_kg_day),
has_dmd = !is.na(DM_digest),
has_cp = !is.na(CP_nutrition),
has_ndf = !is.na(NDF_nutrition)
)
combos <- tribble(
~Combo, ~need_bw, ~need_milk, ~need_dmd, ~need_cp, ~need_ndf,
"Combo 1", TRUE, FALSE, TRUE, FALSE, FALSE, # BW + DMD
"Combo 2", FALSE, TRUE, TRUE, FALSE, FALSE, # Milk + DMD
"Combo 3", TRUE, TRUE, TRUE, FALSE, FALSE, # BW + Milk + DMD
"Combo 4", TRUE, FALSE, FALSE, TRUE, FALSE, # BW + CP
"Combo 5", TRUE, FALSE, FALSE, FALSE, TRUE, # BW + NDF
"Combo 6", TRUE, FALSE, FALSE, TRUE, TRUE # BW + CP + NDF
)
qualifies <- function(x, bw, milk, dmd, cp, ndf) {
ok <- rep(TRUE, nrow(x))
if (bw) ok <- ok & x$has_bw
if (milk) ok <- ok & x$has_milk
if (dmd) ok <- ok & x$has_dmd
if (cp) ok <- ok & x$has_cp
if (ndf) ok <- ok & x$has_ndf
ok
}
purrr::pmap_dfr(combos, function(Combo, need_bw, need_milk, need_dmd, need_cp, need_ndf) {
row_idx <- qualifies(rows, need_bw, need_milk, need_dmd, need_cp, need_ndf)
# studies with at least one qualifying row
studies_n <- n_distinct(rows$B.Code[row_idx])
# experimental units: per-study max (or sum) across qualifying rows
units_sum <- rows[row_idx, ] %>%
group_by(B.Code) %>%
summarise(
u = if (exp_units == "max")
if (all(is.na(units))) NA_real_ else max(units, na.rm = TRUE)
else
sum(units, na.rm = TRUE),
.groups = "drop"
) %>%
summarise(sum = sum(u, na.rm = TRUE), .groups = "drop") %>%
pull(sum)
tibble(
Combo = Combo,
`Bodyweight` = ifelse(need_bw, "X", ""),
`Milk yield` = ifelse(need_milk, "X", ""),
`DMD` = ifelse(need_dmd, "X", ""),
`CP` = ifelse(need_cp, "X", ""),
`NDF` = ifelse(need_ndf, "X", ""),
Studies = studies_n,
`Experimental units` = units_sum
)
}) %>%
mutate(Combo = paste0("Combo ", row_number()))
}
# ---------- Build tables for each species (strict) --------------------------
fi_cattle <- feed_intake_wide %>% filter(!is.na(Species) & str_detect(tolower(Species), "cattle"))
fi_goats <- feed_intake_wide %>% filter(!is.na(Species) & str_detect(tolower(Species), "goat"))
fi_sheep <- feed_intake_wide %>% filter(!is.na(Species) & str_detect(tolower(Species), "sheep"))
cov_cattle_strict <- build_coverage_strict(fi_cattle, exp_units = "max") # or "sum"
cov_goats_strict <- build_coverage_strict(fi_goats, exp_units = "max")
cov_sheep_strict <- build_coverage_strict(fi_sheep, exp_units = "max")
# ---------- Print -----------------------------------------------------------
kable(cov_cattle_strict, caption = "Coverage (Cattle) — strict, same-row variables")
Combo | Bodyweight | Milk yield | DMD | CP | NDF | Studies | Experimental units |
---|---|---|---|---|---|---|---|
Combo 1 | X | X | 8 | 57 | |||
Combo 2 | X | X | 13 | 70 | |||
Combo 3 | X | X | X | 3 | 13 | ||
Combo 4 | X | X | 27 | 173 | |||
Combo 5 | X | X | 18 | 113 | |||
Combo 6 | X | X | X | 18 | 113 |
kable(cov_goats_strict, caption = "Coverage (Goats) — strict, same-row variables")
Combo | Bodyweight | Milk yield | DMD | CP | NDF | Studies | Experimental units |
---|---|---|---|---|---|---|---|
Combo 1 | X | X | 39 | 199 | |||
Combo 2 | X | X | 8 | 45 | |||
Combo 3 | X | X | X | 2 | 13 | ||
Combo 4 | X | X | 57 | 308 | |||
Combo 5 | X | X | 55 | 299 | |||
Combo 6 | X | X | X | 53 | 292 |
kable(cov_sheep_strict, caption = "Coverage (Sheep) — strict, same-row variables")
Combo | Bodyweight | Milk yield | DMD | CP | NDF | Studies | Experimental units |
---|---|---|---|---|---|---|---|
Combo 1 | X | X | 51 | 258 | |||
Combo 2 | X | X | 2 | 13 | |||
Combo 3 | X | X | X | 0 | 0 | ||
Combo 4 | X | X | 76 | 408 | |||
Combo 5 | X | X | 69 | 374 | |||
Combo 6 | X | X | X | 69 | 374 |
Here, we generate more specific coverage tables evaluating which subsets of variables co-occur in the same records. These stricter filters help refine modeling strategies for distinct livestock types.
# ---------- helper: Table 2 (strict, same-row variables) ----------
build_table2 <- function(df, exp_units = c("max","sum")) {
exp_units <- match.arg(exp_units)
rows <- df %>%
mutate(
units = suppressWarnings(as.numeric(T.Animals)),
has_bw = !is.na(bw_kg), # Bodyweight
has_adg = !is.na(adg_g_day), # Daily gain
has_dmd = !is.na(DM_digest), # DMD
has_cp = !is.na(CP_nutrition), # CP
has_ndf = !is.na(NDF_nutrition) # NDF
)
defs <- tribble(
~Combo, ~need_bw, ~need_adg, ~need_dmd, ~need_cp, ~need_ndf,
"DMD+BW", TRUE, FALSE, TRUE, FALSE, FALSE,
"BW+DG+DMD", TRUE, TRUE, TRUE, FALSE, FALSE,
"BW+CP", TRUE, FALSE, FALSE, TRUE, FALSE,
"BW+CP+ADG", TRUE, TRUE, FALSE, TRUE, FALSE
)
qualifies <- function(x, bw, adg, dmd, cp, ndf) {
ok <- rep(TRUE, nrow(x))
if (bw) ok <- ok & x$has_bw
if (adg) ok <- ok & x$has_adg
if (dmd) ok <- ok & x$has_dmd
if (cp) ok <- ok & x$has_cp
if (ndf) ok <- ok & x$has_ndf
ok
}
purrr::pmap_dfr(defs, function(Combo, need_bw, need_adg, need_dmd, need_cp, need_ndf) {
row_idx <- qualifies(rows, need_bw, need_adg, need_dmd, need_cp, need_ndf)
# studies with at least one qualifying row
studies_n <- n_distinct(rows$B.Code[row_idx])
# experimental units: per-study max (or sum) across qualifying rows
units_sum <- rows[row_idx, ] %>%
group_by(B.Code) %>%
summarise(
u = if (exp_units == "max")
if (all(is.na(units))) NA_real_ else max(units, na.rm = TRUE)
else
sum(units, na.rm = TRUE),
.groups = "drop"
) %>%
summarise(total = sum(u, na.rm = TRUE), .groups = "drop") %>%
pull(total)
tibble(
Combo = Combo,
`Bodyweight` = ifelse(need_bw, "X", ""),
`Daily gain` = ifelse(need_adg, "X", ""),
`DMD` = ifelse(need_dmd, "X", ""),
`CP` = ifelse(need_cp, "X", ""),
`NDF` = ifelse(need_ndf, "X", ""),
Studies = studies_n,
`Experimental units` = units_sum
)
})
}
# ---------- subsets ----------
# Cattle (non‑dairy): keep varieties typically Zebu/Beef (adjust pattern to your data)
fi_cattle_nd <- feed_intake_wide %>%
filter(!is.na(Species) & str_detect(tolower(Species), "cattle")) %>%
filter(str_detect(tolower(Variety),
"zebu|beef|boran|brahman|sanga|nguni|bunaji|fulani|ankole|afrikaner"))
# Goats and Sheep
fi_goats <- feed_intake_wide %>%
filter(!is.na(Species) & str_detect(tolower(Species), "goat"))
fi_sheep <- feed_intake_wide %>%
filter(!is.na(Species) & str_detect(tolower(Species), "sheep"))
# ---------- build tables ----------
tab2_cattle <- build_table2(fi_cattle_nd, exp_units = "max")
tab2_goats <- build_table2(fi_goats, exp_units = "max")
tab2_sheep <- build_table2(fi_sheep, exp_units = "max")
# ---------- print ----------
kable(tab2_cattle, caption = "Table 2 — Non‑dairy cattle (Zebu/Beef): strict, same-row variables")
Combo | Bodyweight | Daily gain | DMD | CP | NDF | Studies | Experimental units |
---|---|---|---|---|---|---|---|
DMD+BW | X | X | 4 | 48 | |||
BW+DG+DMD | X | X | X | 4 | 48 | ||
BW+CP | X | X | 10 | 84 | |||
BW+CP+ADG | X | X | X | 10 | 84 |
kable(tab2_goats, caption = "Table 2 — Goats: strict, same-row variables")
Combo | Bodyweight | Daily gain | DMD | CP | NDF | Studies | Experimental units |
---|---|---|---|---|---|---|---|
DMD+BW | X | X | 39 | 199 | |||
BW+DG+DMD | X | X | X | 39 | 199 | ||
BW+CP | X | X | 57 | 308 | |||
BW+CP+ADG | X | X | X | 57 | 308 |
kable(tab2_sheep, caption = "Table 2 — Sheep: strict, same-row variables")
Combo | Bodyweight | Daily gain | DMD | CP | NDF | Studies | Experimental units |
---|---|---|---|---|---|---|---|
DMD+BW | X | X | 51 | 258 | |||
BW+DG+DMD | X | X | X | 51 | 258 | ||
BW+CP | X | X | 76 | 408 | |||
BW+CP+ADG | X | X | X | 76 | 408 |