This notebook assembles and cleans heterogeneous livestock-nutrition trials into a single, analysis-ready dataset that can be used to quantify enteric-methane (CH₄) emissions and emission-intensity for cattle, sheep and goats. Using R, we use ERA metadata, harmonise units, fill nutritional gaps (five-step cascade: original paper → Feedipedia → ILRI feeds DB → proximate-composition equation of Weiss & Tebbe → ingredient means), and reconstruct diet-level gross energy (GE) where necessary.
Daily GE intake is converted to annual CH₄ with the IPCC Tier 2 equation. Methane-conversion factors (Ym) follow the 2019 IPCC defaults but are adjusted for diet forage level and dairy productivity where the data allow. Absolute emissions are then paired with weight-gain and milk-/meat-yield records to derive emission-intensity (kg CH₄ kg-product⁻¹). A change-detection routine classifies every treatment diet relative to its control (addition, substitution, dosage change) so diet strategies can be linked directly to shifts in EI.
Ruminants in sub-Saharan Africa often operate far below potential productivity; improving feed quality can therefore lower both absolute and per-kilogram emissions. By turning hundred of individual studies into a coherent Tier-2 dataset, this pipeline provides a transparent basis for (i) benchmarking baseline emissions, (ii) identifying high-leverage nutritional interventions, and (iii) stress-testing alternative CH₄ equations or region-specific Ym values.
A detailed methodological rationale is given in Livestock Emission Method.docx.
This section installs and loads required packages, connects to the S3 bucket containing ERA data, and loads both the livestock and camel metadata into memory.
# Install and load 'pacman' package manager if not already installed
library(ERAg)
library(stringr)
library(knitr)
library(dplyr)
library(DT)
library(readxl)
library(tidyr)
library(arrow)
library(ggplot2)
library(data.table)
library(purrr)
library(janitor)
library(ggalt)
# Set up connection to S3 bucket
s3 <- s3fs::S3FileSystem$new(anonymous = TRUE)
era_s3 <- "s3://digital-atlas/era"
# -------------------------
# Load LATEST skinny_cow
# -------------------------
# List the files in the s3 bucket
files <- s3$dir_ls(file.path(era_s3, "data"))
# Identify the latest skinny_cow_2022-YYYY-MM-DD.RData file
files <- tail(grep(".RData$", grep("skinny_cow_2022", files, value = TRUE), value = TRUE), 1)
# Download to local if not already present
save_path <- file.path(getwd(), basename(files))
if (!file.exists(save_path)) {
s3$file_download(files, save_path, overwrite = TRUE)
}
# Load the data (as a list of objects)
livestock_metadata <- miceadds::load.Rdata2(file = basename(save_path), path = dirname(save_path))
# -------------------------
# Build D.Item -> D.Type lookup from 2025-04-11.1 (source-of-truth)
# -------------------------
src_file <- file.path(era_s3, "data/skinny_cow_2022-2025-04-11.1.RData")
src_local <- file.path(getwd(), basename(src_file))
if (!file.exists(src_local)) {
s3$file_download(src_file, src_local, overwrite = TRUE)
}
src_obj <- miceadds::load.Rdata2(file = basename(src_local), path = dirname(src_local))
# Ensure data.table
lookup_dt <- as.data.table(src_obj$Animals.Diet)[
, .(D.Item = str_squish(as.character(D.Item)),
D.Type = str_squish(as.character(D.Type)))
][
!is.na(D.Item) & nzchar(D.Item) & !is.na(D.Type) & nzchar(D.Type)
]
# Resolve any (D.Item, D.Type) duplicates by taking the most frequent D.Type per D.Item
lookup_dt <- lookup_dt[
, .N, by = .(D.Item, D.Type)
][
order(D.Item, -N)
][
, .SD[1], by = D.Item
][
, .(D.Item, D.Type)
]
# -------------------------
# Backfill D.Type into latest Animals.Diet
# -------------------------
diet_latest <- as.data.table(livestock_metadata$Animals.Diet)
# Normalize join key to match lookup normalization
diet_latest[, D.Item := str_squish(as.character(D.Item))]
# Add D.Type column if missing
if (!"D.Type" %in% names(diet_latest)) {
diet_latest[, D.Type := NA_character_]
} else {
diet_latest[, D.Type := as.character(D.Type)]
}
# Join and fill only missing D.Type
diet_latest <- merge(
diet_latest, lookup_dt,
by = "D.Item", all.x = TRUE, suffixes = c("", ".src")
)
diet_latest[is.na(D.Type) & !is.na(D.Type.src), D.Type := D.Type.src]
diet_latest[, D.Type.src := NULL]
# Write back into the loaded object
livestock_metadata$Animals.Diet <- diet_latest
# Set up connection to S3 bucket for camel data
s3 <- s3fs::S3FileSystem$new(anonymous = TRUE)
era_s3 <- "s3://digital-atlas/era"
# List RData files related to courageous_camel_2024
camel_files <- s3$dir_ls(file.path(era_s3, "data"))
camel_files <- grep("courageous_camel_2024_draft_.*\\.RData$", camel_files, value = TRUE)
# Helper function to extract date and version
parse_versions <- function(fnames) {
base <- sub(".*_draft_", "", fnames)
base <- sub("\\.RData$", "", base)
date_part <- sub("\\..*$", "", base)
version_part <- ifelse(grepl("\\.", base),
as.integer(sub("^.*\\.", "", base)),
0L)
data.frame(
file = fnames,
date = as.Date(date_part),
version = version_part,
stringsAsFactors = FALSE
)
}
# Parse file names and determine latest
version_info <- parse_versions(basename(camel_files))
latest <- version_info[order(version_info$date, version_info$version, decreasing = TRUE), ][1, ]
latest_file <- camel_files[basename(camel_files) == latest$file]
# Download the file if not already present
save_path <- file.path(getwd(), basename(latest_file))
if (!file.exists(save_path)) {
s3$file_download(latest_file, save_path, overwrite = TRUE)
}
# Load the data
camel <- miceadds::load.Rdata2(file = basename(save_path), path = dirname(save_path))
missing_unit_feed_intake<- camel$Data.Out %>%
select(B.Code, T.Name, ED.Intake.Item,
ED.Mean.T, ED.Error, Out.Unit, Out.Subind,Out.Code.Joined) %>%
filter(Out.Subind == "Feed Intake") %>%
filter(is.na(Out.Unit))
fill in feed intakes
# #Add units where they are NA for Feed-Intake rows
# camel$Data.Out[
# Out.Subind == "Feed Intake" & is.na(Out.Unit), # target rows
# Out.Unit := sub("^.*?\\.\\.(.*?)\\.\\..*$", "\\1", Out.Code.Joined) # extract middle chunk
# ]
#
# feed_intake<-camel$Data.Out%>%
# filter(Out.Subind=="Feed Intake")
#
# sort(unique(feed_intake$Out.Unit), na.last = TRUE) # A-Z, NAs at the end
##############################################################################
# Canonical column templates (ALL use a D.Item field) ------------------------
##############################################################################
diet_cols <- c("B.Code","A.Level.Name","D.Item",
"D.Type","D.Amount","D.Unit.Amount","D.Unit.Time",
"D.Unit.Animals","DC.Is.Dry","D.Ad.lib","D.Item.Group")
diet_cols_ext <- c(diet_cols, "D.Item.Root.Comp.Proc_Major") # keep root too
mt_cols <- c("B.Code","A.Level.Name","V.Level.Name",
"T.Animals","T.Name","T.Control")
comp_cols <- c("B.Code","D.Item","is_group","is_entire_diet",
"DC.Variable","DC.Value","DC.Unit")
digest_cols <- c("B.Code","D.Item","is_group","is_entire_diet",
"DD.Variable","DD.Value","DD.Unit")
data_cols <- c("B.Code","A.Level.Name","Out.Code.Joined",
"ED.Intake.Item","ED.Intake.Item.Raw",
"ED.Mean.T","ED.Error","Out.Subind","Out.Unit",
"Out.WG.Start","Out.WG.Unit",
"is_entire_diet","is_group","T.Animals","T.Name")
prod_cols <- c("B.Code","P.Product","P.Variety","P.Practice")
##############################################################################
# 1. LIVESTOCK tables --------------------------------------------------------
##############################################################################
## 1·1 Diet ------------------------------------------------------------------
diet_livestock <- copy(livestock_metadata$Animals.Diet)[
, ..diet_cols_ext][, Source := "livestock"]
## 1·2 Lookup table : (B.Code , FeedName) ➜ Canonical root ---------------
lookup_ditem <- rbindlist(list(
diet_livestock[
!is.na(D.Item) &
!is.na(D.Item.Root.Comp.Proc_Major),
.(B.Code,
FeedName = D.Item,
Canonical = D.Item.Root.Comp.Proc_Major)],
diet_livestock[
!is.na(D.Item.Root.Comp.Proc_Major),
.(B.Code,
FeedName = D.Item.Root.Comp.Proc_Major,
Canonical = D.Item.Root.Comp.Proc_Major)]
))[
, .SD[1], by = .(B.Code, FeedName)]
setkey(lookup_ditem, B.Code, FeedName)
## 1·3 MT --------------------------------------------------------------------
mt_cols_ls <- intersect(mt_cols, names(livestock_metadata$MT.Out))
mt_livestock <- livestock_metadata$MT.Out[, ..mt_cols_ls][
, `:=`(Out.WG.Start = NA_real_,
Out.WG.Unit = NA_character_,
Source = "livestock")]
## 1·4 Composition -----------------------------------------------------------
comp_livestock <- merge(
livestock_metadata$Animals.Diet.Comp,
lookup_ditem,
by.x = c("B.Code","D.Item"),
by.y = c("B.Code","FeedName"),
all.x = TRUE, allow.cartesian = TRUE
)[
, D.Item := fcoalesce(Canonical, D.Item)
][
, Canonical := NULL
][
, ..comp_cols
][, Source := "livestock"]
## 1·5 Digestibility ---------------------------------------------------------
digest_livestock <- merge(
livestock_metadata$Animals.Diet.Digest,
lookup_ditem,
by.x = c("B.Code","D.Item"),
by.y = c("B.Code","FeedName"),
all.x = TRUE, allow.cartesian = TRUE
)[
, D.Item := fcoalesce(Canonical, D.Item)
][
, Canonical := NULL
][
, ..digest_cols
][, Source := "livestock"]
## 1·6 Data.Out (canonicalise Intake item) ----------------------------------
# Canonicalise Intake item in Data.Out and keep legacy column names expected by data_cols
dt <- merge(
livestock_metadata$Data.Out,
lookup_ditem,
by.x = c("B.Code","ED.Intake.Item"),
by.y = c("B.Code","FeedName"),
all.x = TRUE, allow.cartesian = TRUE
)
# Use Canonical when present, then drop it
dt[, ED.Intake.Item := fcoalesce(Canonical, ED.Intake.Item)]
dt[, Canonical := NULL]
# Map new column names back to old ones (so downstream ..data_cols still works)
if ("ED.Intake.Item.is_group" %in% names(dt) && !"is_group" %in% names(dt)) {
setnames(dt, "ED.Intake.Item.is_group", "is_group")
}
if ("ED.Intake.Item.is_entire_diet" %in% names(dt) && !"is_entire_diet" %in% names(dt)) {
setnames(dt, "ED.Intake.Item.is_entire_diet", "is_entire_diet")
}
# Now select the standard columns and tag the source
data_livestock <- dt[, ..data_cols][, Source := "livestock"]
## 1·7 Product ---------------------------------------------------------------
var_ls <- livestock_metadata$Var.Out[
, .(P.Variety = first(na.omit(V.Level.Name)),
P.Practice = first(na.omit(V.Animal.Practice))), by = B.Code]
prod_livestock <- livestock_metadata$Prod.Out[
, .(B.Code, P.Product)][var_ls, on = "B.Code"][
, Source := "livestock"]
##############################################################################
# 2. CAMEL helper keys (unchanged) ------------------------------------------
##############################################################################
diet_key <- camel$MT.Out[
, .(B.Code, T.Name, A.Level.Name, T.Control,
Herd.Level.Name, Herd.Sublevel.Name)]
herd_info <- camel$Herd.Out[
, .(B.Code, Herd.Level.Name, Herd.Sublevel.Name,
Herd.Sex, Herd.Stage,
V.Var, V.Animal.Practice,
T.Animals = Herd.N,
Herd.Start.Weight = Herd.Start.Weight,
Herd.Start.Weight.Unit = Herd.Start.Weight.Unit)]
##############################################################################
# 3. CAMEL tables – use root when present, fallback to D.Item --------------
##############################################################################
## 3·1 Diet ------------------------------------------------------------------
diet_camel <- copy(camel$Animal.Diet)[
, D.Item := fcoalesce(D.Item.Root.Comp.Proc_Major, D.Item)
][
, ..diet_cols
][, Source := "camel"]
## 3·2 Composition -----------------------------------------------------------
comp_camel <- copy(camel$Animal.Diet.Comp)[
, D.Item := fcoalesce(D.Item.Root.Comp.Proc_Major, D.Item)
][
, `:=`(DC.Variable = DN.Variable,
DC.Value = DN.Value,
DC.Unit = DN.Unit)
][
, ..comp_cols
][, Source := "camel"]
## 3·3 Digestibility ---------------------------------------------------------
digest_camel <- copy(camel$Animal.Diet.Digest)[
, D.Item := fcoalesce(D.Item.Root.Comp.Proc_Major, D.Item)
][
, ..digest_cols
][, Source := "camel"]
## 3·4 MT.Out (unchanged) ----------------------------------------------------
mt_camel <- diet_key[
herd_info, on = .(B.Code,Herd.Level.Name,Herd.Sublevel.Name), nomatch = 0L][
camel$MT.Out[
, .(B.Code, A.Level.Name, T.Name,
T.Start.Weight,
T.Start.Weight.Unit = `T.Start.Weight. Unit`)],
on = .(B.Code,A.Level.Name,T.Name), nomatch = 0L][
, `:=`(T.Start.Weight = as.numeric(T.Start.Weight),
Herd.Start.Weight = as.numeric(Herd.Start.Weight),
T.Start.Weight.Unit = as.character(T.Start.Weight.Unit),
Herd.Start.Weight.Unit = as.character(Herd.Start.Weight.Unit))][
, `:=`(Out.WG.Start = fcoalesce(T.Start.Weight, Herd.Start.Weight),
Out.WG.Unit = fifelse(!is.na(T.Start.Weight),
T.Start.Weight.Unit,
Herd.Start.Weight.Unit),
Source = "camel")][
, !c("T.Start.Weight","T.Start.Weight.Unit",
"Herd.Start.Weight","Herd.Start.Weight.Unit")]
## 3·5 Data.Out (unchanged) --------------------------------------------------
## 3·5 Data.Out (left-join + boolean-only Entire Diet) ------------------------
# Keep ALL rows from camel$Data.Out; attach A.Level.Name via (B.Code, T.Name),
# then attach herd fields via (B.Code, Herd.Level.Name, Herd.Sublevel.Name).
data_camel <- merge(
camel$Data.Out,
diet_key,
by = c("B.Code","T.Name"),
all.x = TRUE
)
data_camel <- merge(
data_camel,
herd_info,
by = c("B.Code","Herd.Level.Name","Herd.Sublevel.Name"),
all.x = TRUE
)
# Select/rename to your schema and enforce the boolean Entire Diet rule
data_camel <- data_camel[
, .(B.Code,
A.Level.Name,
ED.Intake.Item,
T.Name,
D.Item.Root.Comp.Proc_Major,
ED.Intake.Item.is_entire_diet,
ED.Intake.Item.Raw = ED.Intake.Item,
ED.Mean.T, ED.Error, Out.Unit, Out.Subind,
Out.WG.Start = as.numeric(Herd.Start.Weight),
Out.WG.Unit = as.character(Herd.Start.Weight.Unit),
T.Animals)
][
# Boolean-only tagging: if camel says entire diet, force the token
, ED.Intake.Item := fifelse(ED.Intake.Item.is_entire_diet == TRUE,
"Entire Diet", ED.Intake.Item)
][
, Source := "camel"
]
prod_camel <- camel$Herd.Out[
, .(B.Code,
P.Product = V.Product,
P.Variety = V.Var,
P.Practice = V.Animal.Practice,
Herd.Stage)][, Source := "camel"]
##############################################################################
# 4. Combine camel + livestock ----------------------------------------------
##############################################################################
diet_livestock_final <- diet_livestock[
, `:=`(D.Item = D.Item.Root.Comp.Proc_Major,
D.Item.Root.Comp.Proc_Major = NULL)]
diet_all <- rbindlist(list(diet_livestock_final, diet_camel), fill = TRUE)
mt_all <- rbindlist(list(mt_livestock, mt_camel), fill = TRUE)
comp_all <- rbindlist(list(comp_livestock, comp_camel), fill = TRUE)
digest_all <- rbindlist(list(digest_livestock, digest_camel), fill = TRUE)
data_all <- rbindlist(list(data_livestock, data_camel), fill = TRUE)
prod_all <- rbindlist(list(prod_livestock, prod_camel), fill = TRUE)
##############################################################################
# 5. Master list -------------------------------------------------------------
##############################################################################
merged_metadata <- list(
Animals.Diet = diet_all,
MT.Out = mt_all,
Animals.Diet.Comp = comp_all,
Animals.Diet.Digest = digest_all,
Data.Out = data_all,
Prod.Out = prod_all
)
merged_metadata$Data.Out <- merged_metadata$Data.Out %>%
mutate(
## 0 ────────────────────────────────────────────────────────────────
## normalize any variant spelling to a single "Entire Diet" token
ED.Intake.Item = case_when(
grepl("^\\s*entire\\s*diet\\s*$", tolower(coalesce(ED.Intake.Item, ""))) ~ "Entire Diet",
TRUE ~ ED.Intake.Item
),
## 1 ────────────────────────────────────────────────────────────────
## overwrite with root ONLY for ingredient rows (never for "Entire Diet")
ED.Intake.Item = if_else(
!is.na(D.Item.Root.Comp.Proc_Major) & D.Item.Root.Comp.Proc_Major != "" &
!grepl("^Entire Diet$", coalesce(ED.Intake.Item, "")),
D.Item.Root.Comp.Proc_Major,
ED.Intake.Item
),
## 2 ────────────────────────────────────────────────────────────────
## if still NA/blank, use T.Name
ED.Intake.Item = if_else(
(is.na(ED.Intake.Item) | ED.Intake.Item == "") &
!is.na(T.Name) & T.Name != "",
T.Name,
ED.Intake.Item
),
## 3 ────────────────────────────────────────────────────────────────
## set entire-diet flag from helper OR from the token
is_entire_diet = case_when(
!is.na(ED.Intake.Item.is_entire_diet) & ED.Intake.Item.is_entire_diet != "" ~ ED.Intake.Item.is_entire_diet,
grepl("^Entire Diet$", coalesce(ED.Intake.Item, "")) ~ TRUE,
TRUE ~ is_entire_diet
)
)
# 1) Build a lookup from MT with squished whitespace
mt_key <- as.data.table(camel$MT.Out)[
, .(B.Code,
T.Name_core = str_squish(as.character(T.Name)),
A.Level.Name = str_squish(as.character(A.Level.Name)))
][
# keep one row per (B.Code, T.Name_core), preferring non-NA A.Level.Name
order(!is.na(A.Level.Name))
][
, .SD[.N], by = .(B.Code, T.Name_core)
]
# 2) Prepare Data.Out: derive a "core" T.Name by stripping anything after "||"
dt <- as.data.table(merged_metadata$Data.Out)
dt[Source == "camel",
T.Name_core := str_squish(sub("\\|\\|.*$", "", as.character(T.Name)))]
# 3) Left join to fill A.Level.Name where NA
dt[Source == "camel" & (is.na(A.Level.Name) | A.Level.Name == ""),
A.Level.Name := mt_key[.SD, on = .(B.Code, T.Name_core), A.Level.Name]
]
# 4) Clean up helper
dt[, T.Name_core := NULL]
# 5) Write back
merged_metadata$Data.Out <- dt
## Keep only merged_metadata (add more names to the vector if needed)
rm(list = setdiff(ls(), c("merged_metadata")))
gc() # optional: reclaim memory
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 3738816 199.7 7526624 402.0 4590741 245.2
## Vcells 7354809 56.2 19812323 151.2 19811308 151.2
When t control is yes for one of the diets then automatically enter No for the other diets of the same paper. Also for missing t control if the name of the diet contains control or Con or Zero then automatically deduce this is a control diet and others are treatment diets.
MT <- copy(merged_metadata$MT.Out)
MT[, T.Control := tolower(trimws(as.character(T.Control))) ]
MT[
## diet name suggests a control …
grepl("\\bzero\\b|\\bcon\\b|control", A.Level.Name, ignore.case = TRUE) &
## … but flag is still empty / ambiguous
(is.na(T.Control) | T.Control %in% c("0", "false","na")),
T.Control := "yes"
]
MT[, T.Control := {
has_ctrl <- any(T.Control == "yes", na.rm = TRUE) # does study have a control?
if (has_ctrl) {
# 4·a normalise obvious “no” variants
flag <- fifelse(
T.Control %in% c("false", "0", "n", "no"), "no", T.Control
)
# 4·b ensure exactly one category: yes / no
fifelse(flag == "yes", "yes", "no")
} else {
# keep whatever was there (even "0", "false", NA, …)
T.Control
}
}, by = B.Code]
MT[, T.Control := fifelse(
(is.na(T.Control) | T.Control %in% c("", "na")) & any(T.Control == "yes", na.rm = TRUE),
"no",
T.Control
), by = B.Code]
merged_metadata$MT.Out <- MT
controls_multi <- MT[
T.Control == "yes",
.SD[uniqueN(A.Level.Name) > 1],
by = B.Code
]
controls_multi[, .(B.Code, A.Level.Name, T.Control)]
## B.Code A.Level.Name T.Control
## <char> <char> <char>
## 1: HK0287 Conc Commercial...Concentrate HM yes
## 2: HK0287 Conc Commercial yes
## 3: HK0287 Concentrate HM yes
## 4: JS0340 MSP0 Enzyme...MSP0 Yeast yes
## 5: JS0340 MSP0 Enzyme yes
## ---
## 261: JO1088 100g MOBYS RS yes
## 262: JO1088 200g MOBYS RS yes
## 263: JO1088 300g MOBYS RS yes
## 264: JO1095 Diet 1 yes
## 265: JO1095 Diet 5 yes
rm(controls_multi,MT)
Some feed intake entries are not matched to any ingredient data due to discrepancies in A.Level.Name between the feed intake and ingredient datasets for the same study codes. In this section, we build a table to flag studies where diet names do not align, helping to identify cases that may require manual review or correction.
# --- Diet name alignment across Feed Intake, Diet Comp, and Digest -----------
# Uses Ingredient (Animals.Diet$A.Level.Name) as the canonical diet name.
# In Diet Comp and Digest, the diet name lives in D.Item, and we only consider rows with is.entire.diet == TRUE.
# -------- Helpers -------------------------------------------------------------
clean_name <- function(x) x %>% str_squish() # add tolower() if you want case-insensitive matching
# Build a safe 1–1 mapping per B.Code between Ingredient name and a target source name.
# Expects data.frames with columns: B.Code and Diet (already cleaned).
build_mapping <- function(ingredient_df, target_df, target_label) {
combined <- bind_rows(
ingredient_df %>% mutate(Source = "Ingredient"),
target_df %>% mutate(Source = target_label)
) %>% distinct()
comparison <- combined %>%
group_by(B.Code, Diet) %>%
summarise(Sources = paste(sort(unique(Source)), collapse = " & "), .groups = "drop") %>%
mutate(
Match_Status = dplyr::case_when(
grepl(target_label, Sources) & grepl("Ingredient", Sources) ~ "Match",
Sources == "Ingredient" ~ "Only in Ingredient",
Sources == target_label ~ paste("Only in", target_label),
TRUE ~ "Other"
)
)
comparison %>%
filter(Match_Status %in% c("Only in Ingredient", paste("Only in", target_label))) %>%
group_by(B.Code) %>%
summarise(
n_ing = sum(Match_Status == "Only in Ingredient"),
n_tar = sum(Match_Status == paste("Only in", target_label)),
Ingredient_Name = Diet[Match_Status == "Only in Ingredient"][1],
Target_Name = Diet[Match_Status == paste("Only in", target_label)][1],
.groups = "drop"
) %>%
filter(n_ing == 1, n_tar == 1)
}
# -------- 1) Collect cleaned diet names from each source ----------------------
ingredient_names <- merged_metadata$Animals.Diet %>%
transmute(B.Code, Diet = clean_name(A.Level.Name)) %>%
distinct()
feedintake_names <- merged_metadata$Data.Out %>%
filter(Out.Subind == "Feed Intake", !is.na(A.Level.Name)) %>%
transmute(B.Code, Diet = clean_name(A.Level.Name)) %>%
distinct()
dietcomp_names <- merged_metadata$Animals.Diet.Comp %>%
filter(is_entire_diet == TRUE) %>%
transmute(B.Code, Diet = clean_name(D.Item)) %>%
distinct()
digest_names <- merged_metadata$Animals.Diet.Digest %>%
filter(is_entire_diet == TRUE) %>%
transmute(B.Code, Diet = clean_name(D.Item)) %>%
distinct()
# Presence BEFORE corrections (per (B.Code, Diet))
presence_before <- bind_rows(
ingredient_names %>% mutate(Source = "Ingredient"),
feedintake_names %>% mutate(Source = "FeedIntake"),
dietcomp_names %>% mutate(Source = "DietComp"),
digest_names %>% mutate(Source = "Digest")
) %>%
mutate(val = 1L) %>%
pivot_wider(names_from = Source, values_from = val, values_fill = 0L) %>%
arrange(B.Code, Diet)
# -------- 2) Build 1–1 mappings vs Ingredient --------------------------------
to_correct_FI <- build_mapping(ingredient_names, feedintake_names, "FeedIntake")
to_correct_DC <- build_mapping(ingredient_names, dietcomp_names, "DietComp")
to_correct_DG <- build_mapping(ingredient_names, digest_names, "Digest")
# Optional: preview reports of what WILL change (before applying)
corrections_report_FI <- merged_metadata$Data.Out %>%
filter(Out.Subind == "Feed Intake", !is.na(A.Level.Name)) %>%
mutate(A.Level.Name_clean = clean_name(A.Level.Name)) %>%
left_join(to_correct_FI, by = c("B.Code", "A.Level.Name_clean" = "Target_Name")) %>%
transmute(B.Code, Old = A.Level.Name, New = dplyr::coalesce(Ingredient_Name, A.Level.Name)) %>%
distinct() %>% filter(Old != New)
corrections_report_DC <- merged_metadata$Animals.Diet.Comp %>%
filter(is_entire_diet == TRUE) %>%
mutate(D.Item_clean = clean_name(D.Item)) %>%
left_join(to_correct_DC, by = c("B.Code", "D.Item_clean" = "Target_Name")) %>%
transmute(B.Code, Old = D.Item, New = dplyr::coalesce(Ingredient_Name, D.Item)) %>%
distinct() %>% filter(Old != New)
corrections_report_DG <- merged_metadata$Animals.Diet.Digest %>%
filter(is_entire_diet == TRUE) %>%
mutate(D.Item_clean = clean_name(D.Item)) %>%
left_join(to_correct_DG, by = c("B.Code", "D.Item_clean" = "Target_Name")) %>%
transmute(B.Code, Old = D.Item, New = dplyr::coalesce(Ingredient_Name, D.Item)) %>%
distinct() %>% filter(Old != New)
# -------- 3) Apply corrections (rename targets to Ingredient names) -----------
# FEED INTAKE
merged_metadata$Data.Out <- merged_metadata$Data.Out %>%
mutate(A.Level.Name_clean = clean_name(A.Level.Name)) %>%
left_join(to_correct_FI, by = c("B.Code", "A.Level.Name_clean" = "Target_Name")) %>%
mutate(A.Level.Name = dplyr::coalesce(Ingredient_Name, A.Level.Name)) %>%
select(-A.Level.Name_clean, -Ingredient_Name)
# DIET COMP (only where is.entire.diet == TRUE)
merged_metadata$Animals.Diet.Comp <- merged_metadata$Animals.Diet.Comp %>%
mutate(D.Item_clean = clean_name(D.Item)) %>%
left_join(to_correct_DC, by = c("B.Code", "D.Item_clean" = "Target_Name")) %>%
mutate(
D.Item = if_else(is_entire_diet == TRUE, dplyr::coalesce(Ingredient_Name, D.Item), D.Item)
) %>%
select(-D.Item_clean, -Ingredient_Name)
# DIGEST (only where is.entire.diet == TRUE)
merged_metadata$Animals.Diet.Digest <- merged_metadata$Animals.Diet.Digest %>%
mutate(D.Item_clean = clean_name(D.Item)) %>%
left_join(to_correct_DG, by = c("B.Code", "D.Item_clean" = "Target_Name")) %>%
mutate(
D.Item = if_else(is_entire_diet == TRUE, dplyr::coalesce(Ingredient_Name, D.Item), D.Item)
) %>%
select(-D.Item_clean, -Ingredient_Name)
# -------- 4) Presence AFTER corrections --------------------------------------
ingredient_names_after <- merged_metadata$Animals.Diet %>%
transmute(B.Code, Diet = clean_name(A.Level.Name)) %>% distinct()
feedintake_names_after <- merged_metadata$Data.Out %>%
filter(Out.Subind == "Feed Intake", !is.na(A.Level.Name)) %>%
transmute(B.Code, Diet = clean_name(A.Level.Name)) %>% distinct()
dietcomp_names_after <- merged_metadata$Animals.Diet.Comp %>%
filter(is_entire_diet == TRUE) %>%
transmute(B.Code, Diet = clean_name(D.Item)) %>% distinct()
digest_names_after <- merged_metadata$Animals.Diet.Digest %>%
filter(is_entire_diet == TRUE) %>%
transmute(B.Code, Diet = clean_name(D.Item)) %>% distinct()
presence_after <- bind_rows(
ingredient_names_after %>% mutate(Source = "Ingredient"),
feedintake_names_after %>% mutate(Source = "FeedIntake"),
dietcomp_names_after %>% mutate(Source = "DietComp"),
digest_names_after %>% mutate(Source = "Digest")
) %>%
mutate(val = 1L) %>%
pivot_wider(names_from = Source, values_from = val, values_fill = 0L) %>%
arrange(B.Code, Diet)
# Optional quick summaries
alignment_summary_before <- presence_before %>%
summarise(
n_rows = n(),
full_matches = sum(Ingredient==1 & FeedIntake==1 & DietComp==1 & Digest==1),
any_missing = sum(Ingredient + FeedIntake + DietComp + Digest < 4)
)
alignment_summary_after <- presence_after %>%
summarise(
n_rows = n(),
full_matches = sum(Ingredient==1 & FeedIntake==1 & DietComp==1 & Digest==1),
any_missing = sum(Ingredient + FeedIntake + DietComp + Digest < 4)
)
rm(alignment_summary_after,alignment_summary_before,corrections_report_DC,corrections_report_DG,corrections_report_FI,dietcomp_names,dietcomp_names_after,digest_names,digest_names_after,feedintake_names,feedintake_names_after,ingredient_names,ingredient_names_after,presence_after,presence_before,to_correct_DC,to_correct_DG,to_correct_FI)
Some ingredients appear multiple times with amounts reported in different units. In this section, we prioritize entries that are already in preferred units to minimize the need for conversions.
## keep ingredient that appear twice with different units only once
# Define your unit preferences
preferred_units <- c("g", "kg")
preferred_time <- "day"
preferred_animals <- "individual"
# Prioritize and deduplicate
merged_metadata$Animals.Diet <- merged_metadata$Animals.Diet %>%
# Add a priority score to sort by best rows
mutate(
priority = case_when(
D.Unit.Amount %in% preferred_units &
D.Unit.Time == preferred_time &
D.Unit.Animals == preferred_animals ~ 1,
TRUE ~ 2 # lower priority
)
) %>%
group_by(B.Code, A.Level.Name, D.Item) %>%
slice_min(order_by = priority, with_ties = FALSE) %>%
ungroup() %>%
select(-priority) # optional: drop helper column
rm(preferred_units,preferred_time,preferred_animals)
### correct extraction errors
merged_metadata$Animals.Diet.Comp <- merged_metadata$Animals.Diet.Comp %>%
mutate(
DC.Value = ifelse(
B.Code == "BO1006" &
DC.Variable == "Ash" &
D.Item == "Ration 2" &
as.numeric(DC.Value) == 628,
6.28,
DC.Value
)
)
# Define species of interest
species_list <- c("Cattle", "Sheep", "Goat")
# Initialize result containers
species_summary <- list()
bcode_table <- list()
# Loop over each species
for (species in species_list) {
# All B.Codes for this species
species_bcodes <- merged_metadata$Prod.Out %>%
filter(P.Product == species) %>%
pull(B.Code) %>%
unique()
# Feed intake B.Codes
feed_data <- merged_metadata$Data.Out %>%
filter(B.Code %in% species_bcodes, !is.na(ED.Mean.T)) %>%
filter(Out.Subind=="Feed Intake")%>%
select(B.Code, D.Item = ED.Intake.Item)
feed_bcodes <- unique(feed_data$B.Code)
# GE B.Codes
ge_data <- merged_metadata$Animals.Diet.Comp %>%
filter(B.Code %in% species_bcodes, DC.Variable == "GE", !is.na(DC.Value)) %>%
select(B.Code, D.Item)
ge_bcodes <- unique(ge_data$B.Code)
# Matched B.Codes (join on both B.Code and D.Item)
matched_bcodes <- inner_join(feed_data, ge_data, by = c("B.Code", "D.Item")) %>%
pull(B.Code) %>%
unique()
# Store species-level summary
species_summary[[species]] <- tibble(
Species = species,
Total_BCodes = length(species_bcodes),
With_FEED = length(feed_bcodes),
With_GE = length(ge_bcodes),
With_BOTH = length(matched_bcodes)
)
# Store long-format bcode list
bcode_table[[species]] <- tibble(
B.Code = c(species_bcodes, feed_bcodes, ge_bcodes, matched_bcodes),
Category = rep(c("All", "With_FEED", "With_GE", "With_BOTH"),
times = c(length(species_bcodes), length(feed_bcodes), length(ge_bcodes), length(matched_bcodes))),
Species = species
) %>% distinct()
}
## Warning in inner_join(feed_data, ge_data, by = c("B.Code", "D.Item")): Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 315 of `x` matches multiple rows in `y`.
## ℹ Row 4 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
## "many-to-many"` to silence this warning.
## Warning in inner_join(feed_data, ge_data, by = c("B.Code", "D.Item")): Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 2814 of `x` matches multiple rows in `y`.
## ℹ Row 4 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
## "many-to-many"` to silence this warning.
# Bind results
species_summary_df <- bind_rows(species_summary)
bcode_table_df <- bind_rows(bcode_table)
species_by_bcode <- merged_metadata$Prod.Out %>%
select(B.Code, P.Product) %>%
distinct()
# 2. Papers per species before filtering
# papers_before <- merged_metadata$Animals.Diet %>%
# select(B.Code) %>%
# distinct() %>%
# left_join(species_by_bcode, by = "B.Code") %>%
# count(P.Product, name = "papers_before")
# Step 1: Identify A.Level.Name (diet) that include Grazing
grazing_diets <- merged_metadata$Animals.Diet %>%
filter(D.Type == "Grazing") %>%
distinct(B.Code, A.Level.Name)
length(unique(grazing_diets$B.Code))
## [1] 63
# Step 2: Count how many non-grazing diets exist per B.Code
nongrazing_counts <- merged_metadata$Animals.Diet %>%
filter(!(D.Type == "Grazing")) %>%
distinct(B.Code, A.Level.Name) %>%
group_by(B.Code) %>%
summarise(n_nongrazing = n(), .groups = "drop")
length(unique(nongrazing_counts$B.Code))
## [1] 714
# Step 3: Keep only diets that are not marked as Grazing AND are in B.Codes with at least one non-grazing diet
merged_metadata$Animals.Diet <- merged_metadata$Animals.Diet %>%
anti_join(grazing_diets, by = c("B.Code", "A.Level.Name")) %>% # remove all diets that were grazing
semi_join(nongrazing_counts, by = "B.Code") # keep only papers with at least one non-grazing diet
species_by_bcode <- merged_metadata$Prod.Out %>%
select(B.Code, P.Product) %>%
distinct()
# 2. Papers per species before filtering
# papers_after <- merged_metadata$Animals.Diet %>%
# select(B.Code) %>%
# distinct() %>%
# left_join(species_by_bcode, by = "B.Code") %>%
# count(P.Product, name = "papers_after")
rm(grazing_diets,nongrazing_counts)
to_be_categorized<-merged_metadata$Animals.Diet%>%
filter(is.na(D.Item))%>%
filter(D.Type!="Entire Diet")
# Define species of interest
species_list <- c("Cattle", "Sheep", "Goat")
# Initialize result containers
species_summary <- list()
bcode_table <- list()
# Loop over each species
for (species in species_list) {
# All B.Codes for this species
species_bcodes <- merged_metadata$Prod.Out %>%
filter(P.Product == species) %>%
pull(B.Code) %>%
unique()
# Feed intake B.Codes
feed_data <- merged_metadata$Data.Out %>%
filter(B.Code %in% species_bcodes, !is.na(ED.Mean.T)) %>%
filter(Out.Subind=="Feed Intake")%>%
select(B.Code, D.Item = ED.Intake.Item)
feed_bcodes <- unique(feed_data$B.Code)
# GE B.Codes
ge_data <- merged_metadata$Animals.Diet.Comp %>%
filter(B.Code %in% species_bcodes, DC.Variable == "GE", !is.na(DC.Value)) %>%
select(B.Code, D.Item)
ge_bcodes <- unique(ge_data$B.Code)
# Matched B.Codes (join on both B.Code and D.Item)
matched_bcodes <- inner_join(feed_data, ge_data, by = c("B.Code", "D.Item")) %>%
pull(B.Code) %>%
unique()
# Store species-level summary
species_summary[[species]] <- tibble(
Species = species,
Total_BCodes = length(species_bcodes),
With_FEED = length(feed_bcodes),
With_GE = length(ge_bcodes),
With_BOTH = length(matched_bcodes)
)
# Store long-format bcode list
bcode_table[[species]] <- tibble(
B.Code = c(species_bcodes, feed_bcodes, ge_bcodes, matched_bcodes),
Category = rep(c("All", "With_FEED", "With_GE", "With_BOTH"),
times = c(length(species_bcodes), length(feed_bcodes), length(ge_bcodes), length(matched_bcodes))),
Species = species
) %>% distinct()
}
# Bind results
species_summary_df <- bind_rows(species_summary)
bcode_table_df <- bind_rows(bcode_table)
This step standardizes ingredient amount units (e.g., from mg, kg, %, etc.) to a consistent base (usually grams per individual per day), ensuring reliable downstream calculations and comparisons.
# Diet ingredients: breakdown of all ingredients within each diet
ingredients <- merged_metadata$Animals.Diet %>%
mutate(D.Amount = as.numeric(D.Amount))
# Function to harmonize diet ingredient units (only change: kg/t handling)
harmonize_units <- function(df) {
df %>%
mutate(
# Record original units
Units = paste(D.Unit.Amount, D.Unit.Time, D.Unit.Animals, sep = ";"),
# Harmonize weight and volume units to a consistent base
D.Amount = case_when(
D.Unit.Amount %in% c("kg", "mg", "g/100g", "l", "kg/t", "g/300l",
"mg/kg", "kg/100kg", "kg/100kg body weight",
"kg/kg metabolic weight", "g/kg metabolic weight (0.75)",
"g/kg DMI", "g/L", "g/kg DM") ~ {
case_when(
D.Unit.Amount == "kg" ~ D.Amount * 1000,
D.Unit.Amount == "mg" ~ D.Amount / 1000,
D.Unit.Amount == "g/100g" ~ D.Amount * 10,
D.Unit.Amount == "l" ~ D.Amount * 1000,
D.Unit.Amount == "kg/t" ~ D.Amount,
D.Unit.Amount == "g/300l" ~ D.Amount / 300,
D.Unit.Amount == "mg/kg" ~ D.Amount / 1000,
D.Unit.Amount %in% c("kg/100kg", "kg/100kg body weight") ~ D.Amount * 10,
D.Unit.Amount == "kg/kg metabolic weight" ~ D.Amount * 1000,
D.Unit.Amount %in% c("g/kg body weight (0.75)", "g/kg Body Weight (0.75)") ~ D.Amount,
D.Unit.Amount == "g/kg DMI" ~ D.Amount,
D.Unit.Amount == "g/kg DM" ~ D.Amount,
TRUE ~ D.Amount
)
},
TRUE ~ D.Amount
),
# Update D.Unit.Amount
D.Unit.Amount = case_when(
D.Unit.Amount == "kg" ~ "g",
D.Unit.Amount == "mg" ~ "g",
D.Unit.Amount == "g/100g" ~ "g/kg",
D.Unit.Amount == "l" ~ "ml",
D.Unit.Amount == "kg/t" ~ "g/kg",
D.Unit.Amount == "g/300l" ~ "g/L",
D.Unit.Amount == "mg/kg" ~ "g/kg",
D.Unit.Amount %in% c("kg/100kg", "kg/100kg body weight") ~ "g/kg body weight",
D.Unit.Amount %in% c("g/kg body weight (0.75)", "g/kg Body Weight (0.75)", "g/kg metabolic weight (0.75)") ~ "g/kg metabolic weight",
D.Unit.Amount == "kg/kg metabolic weight" ~ "g/kg metabolic weight",
D.Unit.Amount == "g/kg DMI" ~ "g/kg",
D.Unit.Amount == "g/kg DM" ~ "g/kg",
D.Unit.Amount == "g/L" ~ "g/L",
D.Unit.Amount == "g/kg Body Weight" ~ "g/kg body weight",
TRUE ~ D.Unit.Amount
),
# Harmonize time units to day (excluding "individual", only changing "experiment")
D.Amount = case_when(
D.Unit.Time == "week" ~ D.Amount / 7,
D.Unit.Time == "month" ~ D.Amount / 30,
D.Unit.Time == "2 weeks" ~ D.Amount / 14,
D.Unit.Time == "4 days interval" ~ D.Amount / 4,
D.Unit.Time == "2x/day" ~ D.Amount * 2,
TRUE ~ D.Amount # Leave "experiment" and others untouched
),
D.Unit.Time = case_when(
D.Unit.Time %in% c("week", "month", "2 weeks", "4 days interval", "2x/day") ~ "day",
D.Unit.Time == "experiment" ~ D.Unit.Time,
TRUE ~ D.Unit.Time
),
# Remove units if amount is NA
D.Unit.Amount = ifelse(is.na(D.Amount), NA_character_, D.Unit.Amount),
D.Unit.Time = ifelse(is.na(D.Amount), NA_character_, D.Unit.Time),
D.Unit.Animals = ifelse(is.na(D.Amount), NA_character_, D.Unit.Animals)
)
}
# Apply harmonization
ingredients <- harmonize_units(ingredients)
# Load replicate info
reps <- merged_metadata$MT.Out %>%
mutate(T.Animals = as.numeric(T.Animals)) %>%
distinct(B.Code, A.Level.Name, .keep_all = TRUE)
# Adjust for 'all animals in replicate'
ingredients <- ingredients %>%
left_join(reps %>% dplyr::select(B.Code, A.Level.Name, T.Animals), by = c("B.Code", "A.Level.Name")) %>%
mutate(
D.Amount = case_when(
D.Unit.Animals == "all animals in replicate" & !is.na(T.Animals) ~ D.Amount / T.Animals,
TRUE ~ D.Amount
),
D.Unit.Animals = ifelse(D.Unit.Animals == "all animals in replicate", "individual", D.Unit.Animals)
)
# Refresh Units column
ingredients <- ingredients %>%
mutate(Units = paste(D.Unit.Amount, D.Unit.Time, D.Unit.Animals, sep = ";"))
# Cleanup
rm(harmonize_units,reps,feed_data)
This chunk systematically fills in missing “DC.Is.Dry” flags by combining three complementary sources—analytical moisture/DM thresholds, name‐based cues (e.g. “dried,” “meal”), and broad feed/supplement rules—and then, where every measurable ingredient in a treatment is already known to be dry, assumes the lab used the same dry‐prep protocol for all feeds in that trial. By layering data‑driven thresholds, text mining, and a diet‑level consistency override, we ensure that no ingredient in a truly dried regimen slips through unflagged, while still preserving fresh‐vs.‐dry distinctions in mixed rations.
# ------------------------------------------------------------
# DRY flag with strict rule:
# 1) Use Moisture/DM thresholds when available (authoritative).
# 2) If NO Moisture/DM for an item, assume DRY ("Yes").
# 3) Diet-arm propagation: if all flagged items in the arm are "Yes",
# fill any remaining NA in that arm with "Yes".
# 4) Drop "water" rows.
# No name/blanket/majority heuristics.
# ------------------------------------------------------------
# --- 1) Moisture/DM-based evidence per (B.Code, D.Item) ---------------------
dry_evidence <- bind_rows(
# a) Moisture rules
merged_metadata$Animals.Diet.Comp %>%
filter(str_to_lower(DC.Variable) %in% c("moisture","moist")) %>%
transmute(
B.Code, D.Item,
Predictor = "Moisture",
Value = suppressWarnings(as.numeric(DC.Value)),
Unit = str_to_lower(str_squish(DC.Unit)),
Dry_Prediction = dplyr::case_when(
Unit %in% c("%","% dm","g/100g") ~ dplyr::case_when(
Value < 15 ~ TRUE, # low moisture ⇒ dry
Value > 50 ~ FALSE, # very wet ⇒ not dry
TRUE ~ NA
),
Unit == "g/kg" ~ dplyr::case_when(
Value < 150 ~ TRUE,
Value > 500 ~ FALSE,
TRUE ~ NA
),
TRUE ~ NA
)
),
# b) DM rules
merged_metadata$Animals.Diet.Comp %>%
filter(str_to_lower(DC.Variable) == "dm") %>%
transmute(
B.Code, D.Item,
Predictor = "DM",
Value = suppressWarnings(as.numeric(DC.Value)),
Unit = str_to_lower(str_squish(DC.Unit)),
Dry_Prediction = dplyr::case_when(
Unit %in% c("%","% dm","g/100g") ~ dplyr::case_when(
Value > 85 ~ TRUE, # high DM ⇒ dry
Value < 50 ~ FALSE, # low DM ⇒ not dry
TRUE ~ NA
),
Unit == "g/kg" ~ dplyr::case_when(
Value > 850 ~ TRUE,
Value < 500 ~ FALSE,
TRUE ~ NA
),
TRUE ~ NA
)
)
) %>%
# keep one prediction per predictor per key (first non-NA)
distinct(B.Code, D.Item, Predictor, Dry_Prediction) %>%
group_by(B.Code, D.Item, Predictor) %>%
summarise(Dry_Prediction = dplyr::first(na.omit(Dry_Prediction)), .groups = "drop") %>%
# wide: Dry_Prediction_Moisture, Dry_Prediction_DM
tidyr::pivot_wider(
names_from = Predictor,
values_from = Dry_Prediction,
names_prefix = "Dry_Prediction_"
) %>%
mutate(
has_evidence = !is.na(Dry_Prediction_Moisture) | !is.na(Dry_Prediction_DM),
# precedence: DM beats Moisture when both exist
Dry_from_evidence = dplyr::case_when(
!is.na(Dry_Prediction_DM) ~ Dry_Prediction_DM,
is.na(Dry_Prediction_DM) &
!is.na(Dry_Prediction_Moisture) ~ Dry_Prediction_Moisture,
TRUE ~ NA
)
) %>%
select(B.Code, D.Item, has_evidence, Dry_from_evidence)
# --- 2) Apply evidence to ingredients ---------------------------------------
ingredients <- ingredients %>%
left_join(dry_evidence, by = c("B.Code","D.Item")) %>%
mutate(
# Evidence (DM/Moisture) is authoritative when present
DC.Is.Dry = dplyr::case_when(
Dry_from_evidence == TRUE ~ "Yes",
Dry_from_evidence == FALSE ~ "No",
TRUE ~ DC.Is.Dry
)
)
# --- 3) Diet-level propagation (only fills NA when all flagged in arm are Yes)
ingredients <- ingredients %>%
group_by(B.Code, A.Level.Name) %>%
mutate(
any_flag = any(!is.na(DC.Is.Dry), na.rm = TRUE),
all_dry = all(DC.Is.Dry[!is.na(DC.Is.Dry)] == "Yes")
) %>%
ungroup() %>%
mutate(
DC.Is.Dry = dplyr::case_when(
any_flag & all_dry & is.na(DC.Is.Dry) ~ "Yes",
TRUE ~ DC.Is.Dry
)
) %>%
select(-any_flag, -all_dry)
# --- 4) Fallback: assume DRY only when NO Moisture/DM evidence --------------
# (i.e., we cannot decide because there is no proximate data)
ingredients <- ingredients %>%
mutate(
DC.Is.Dry = dplyr::case_when(
is.na(DC.Is.Dry) & (has_evidence %in% c(FALSE, NA)) ~ "Yes",
TRUE ~ DC.Is.Dry
)
) %>%
select(-has_evidence, -Dry_from_evidence)
# --- 5) Drop pure "water" rows ----------------------------------------------
ingredients <- ingredients %>%
filter(!(str_to_lower(D.Item) == "water"))
Subset for Dry data
When feed intake data is missing and we rely on ingredient-level amounts, we need to ensure that the quantities used refer to the dry form of the ingredient. To do this, we subset the data to include only ingredients identified as dried—based on either the declared DC.Is.Dry value or our earlier flagging using ingredient names and moisture content predictions. This ensures consistency and accuracy in downstream calculations.
#take out unspecified ingredients
species_by_bcode <- merged_metadata$Prod.Out %>%
select(B.Code, P.Product) %>%
distinct()
# 2. Papers per species before filtering
# papers_before <- ingredients %>%
# select(B.Code) %>%
# distinct() %>%
# left_join(species_by_bcode, by = "B.Code") %>%
# count(P.Product, name = "papers_before")
# length(unique(ingredients$B.Code))
# Step 1: Identify A.Level.Name groups with at least one ingredient NOT dry
non_dry_diets <- ingredients %>%
filter(is.na(DC.Is.Dry) | str_to_lower(DC.Is.Dry) %in% c("no", "n", "unspecified")) %>%
distinct(B.Code, A.Level.Name)
# Step 2: Filter out these diets entirely from ingredients
ingredients <- ingredients %>%
anti_join(non_dry_diets, by = c("B.Code", "A.Level.Name"))
length(unique(ingredients$A.Level.Name))
## [1] 1903
rm(non_dry_diets)
length(unique(ingredients$B.Code))
## [1] 610
species_by_bcode <- merged_metadata$Prod.Out %>%
select(B.Code, P.Product) %>%
distinct()
#2. Papers per species before filtering
papers_after <- ingredients %>%
select(B.Code) %>%
distinct() %>%
left_join(species_by_bcode, by = "B.Code") %>%
count(P.Product, name = "papers_after")
We filter the dataset to retain only ingredients with acceptable units. Uncommon or unexpected unit formats are excluded to ensure consistency and simplify downstream processing
# Define a list of acceptable units
acceptable_units <- c(
"g/kg", "g", "%", "ml", "L", "kg", "g/L",
"g/kg metabolic weight", "% Diet",
"g/kg body weight", "ml/kg", "% Body Mass",
"% Concentrate", "g/kg body weight" # leave only one if repeated
)
# Filter rows to keep only acceptable units
ingredients <- ingredients %>%
filter(D.Unit.Amount %in% acceptable_units | is.na(D.Unit.Amount)) %>%
mutate(D.Item = ifelse(is.na(D.Item), "Unspecified", D.Item))
#tracking paper loss because of harmnozied units
#total_studies_harmonized<- length(unique(ingredients_harmonized$B.Code))
#total_ingredients_harmonized<- length(unique(ingredients_harmonized$D.Item))
#total_diets_harmonized<- length(unique(ingredients_harmonized$A.Level.Name))
rm(acceptable_units)
# Pre-filter Data.Out to keep only the first non-NA weight per group
# Step 1: From Data.Out — compute unique weights
weights_data <- merged_metadata$Data.Out %>%
filter(Out.Subind == "Weight Gain") %>%
mutate(
Out.WG.Unit = tolower(Out.WG.Unit),
Out.WG.Start = case_when(
Out.WG.Unit == "g" ~ Out.WG.Start / 1000,
Out.WG.Unit == "mg" ~ Out.WG.Start / 1e6,
Out.WG.Unit %in% c("kg", "unspecified", NA) ~ Out.WG.Start,
TRUE ~ NA_real_
),
Out.WG.Unit = ifelse(!is.na(Out.WG.Start), "kg", NA_character_)
) %>%
filter(!is.na(Out.WG.Start)) %>%
group_by(B.Code, A.Level.Name) %>%
summarise(
Out.WG.Start = mean(Out.WG.Start, na.rm = TRUE),
Out.WG.Unit = "kg",
.groups = "drop"
)
# Step 2: From MT.Out — extract matching weight data
weights_mt <- merged_metadata$MT.Out %>%
select(B.Code, A.Level.Name, Out.WG.Start, Out.WG.Unit) %>%
filter(!is.na(Out.WG.Start)) %>%
mutate(
Out.WG.Start = as.numeric(Out.WG.Start), # <- force numeric
Out.WG.Unit = tolower(as.character(Out.WG.Unit)), # <- force character and lowercase
Out.WG.Start = case_when(
Out.WG.Unit == "g" ~ Out.WG.Start / 1000,
Out.WG.Unit == "mg" ~ Out.WG.Start / 1e6,
Out.WG.Unit %in% c("kg", "unspecified", NA) ~ Out.WG.Start,
TRUE ~ NA_real_
),
Out.WG.Unit = ifelse(!is.na(Out.WG.Start), "kg", NA_character_)
) %>%
distinct(B.Code, A.Level.Name, .keep_all = TRUE)
# Step 3: Combine both sources, then de-duplicate (keep priority if desired)
weights_unique <- bind_rows(weights_data, weights_mt) %>%
arrange(B.Code, A.Level.Name) %>%
distinct(B.Code, A.Level.Name, .keep_all = TRUE)
# Step 3: Join body weight and keep only ingredients with weight
ingredients <- ingredients %>%
left_join(
weights_unique,
by = c("B.Code", "A.Level.Name"))
# Step 4: Convert D.Amount to grams based on body weight and update unit accordingly
ingredients <- ingredients %>%
mutate(
# Recalculate D.Amount using body weight or metabolic weight, result in kg
D.Amount = case_when(
D.Unit.Amount == "g/kg body weight" & !is.na(Out.WG.Start) ~ (D.Amount * Out.WG.Start) / 1000,
D.Unit.Amount == "g/kg metabolic weight" & !is.na(Out.WG.Start) ~ (D.Amount * Out.WG.Start^0.75) / 1000,
D.Unit.Amount == "% Body Mass" & !is.na(Out.WG.Start) ~ ((D.Amount / 100) * Out.WG.Start),
TRUE ~ D.Amount # unchanged if not convertible
),
# Update unit to "kg" if recalculated
D.Unit.Amount = case_when(
D.Unit.Amount %in% c("g/kg body weight", "g/kg metabolic weight", "% Body Mass") &
!is.na(Out.WG.Start) ~ "kg",
TRUE ~ D.Unit.Amount
)
)
For ingredients reported in relative units such as %, % Diet, or g/kg, we can estimate their absolute amounts when total diet feed intake is available. This allows us to convert proportions into standardized quantities.
missing_unit_feed_intake<- merged_metadata$Data.Out %>%
select(B.Code, A.Level.Name, ED.Intake.Item, ED.Intake.Item.Raw,
ED.Mean.T, ED.Error, Out.Unit, Out.Subind,is_entire_diet,Source,Out.Code.Joined) %>%
filter(Out.Subind == "Feed Intake") %>%
filter(is.na(Out.Unit))
feed_intake <- merged_metadata$Data.Out %>%
select(B.Code, A.Level.Name, ED.Intake.Item, ED.Intake.Item.Raw,
ED.Mean.T, ED.Error, Out.Unit, Out.Subind,
is_entire_diet, is_group, T.Animals) %>%
filter(Out.Subind == "Feed Intake") %>%
filter(!is.na(ED.Mean.T)) %>%
## ───────────────────────────────────────────────────────────────────────
mutate(
is_entire = is.na(ED.Intake.Item.Raw) | ED.Intake.Item.Raw == "",
ED.Intake.Item.Raw = if_else(is_entire, "Entire Diet", ED.Intake.Item.Raw),
ED.Intake.Item = if_else(is_entire, "Entire Diet", ED.Intake.Item)
) %>%
## (optionally) drop the helper flag
select(-is_entire) %>%
## ───────────────────────────────────────────────────────────────────────
mutate(
Out.Unit = tolower(trimws(Out.Unit)),
Out.Unit.Animals = str_extract(Out.Unit,
"\\b(individual|replicate|animal replicate)\\b"),
Out.Unit.Time = str_extract(Out.Unit,
"\\b(day|d|yr|year|experiment)\\b"),
Out.Unit.Amount = Out.Unit %>%
str_remove_all("\\b(individual|replicate|animal replicate)\\b") %>%
str_remove_all("\\b(day|d|yr|year)\\b") %>%
str_replace_all("/+", "/") %>%
str_trim() %>%
str_remove("/$")
)
# Step 3: Convert replicate-level intake to individual-level when possible
feed_intake <- feed_intake %>%
mutate(
# Ensure numeric type
ED.Mean.T = as.numeric(ED.Mean.T),
T.Animals = as.numeric(T.Animals),
# Convert intake and update unit
ED.Mean.T = case_when(
Out.Unit.Animals == "replicate" & !is.na(T.Animals) & T.Animals > 0 ~ ED.Mean.T / T.Animals,
TRUE ~ ED.Mean.T
),
Out.Unit.Animals = case_when(
Out.Unit.Animals == "replicate" & !is.na(T.Animals) & T.Animals > 0 ~ "individual",
TRUE ~ Out.Unit.Animals
)
)
## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `T.Animals = as.numeric(T.Animals)`.
## Caused by warning:
## ! NAs introduced by coercion
# Define preferred units
preferred_units <- c("kg dm", "g dm")
preferred_time <- "day"
preferred_animals <- "individual"
# Create filtered table with prioritization
feed_intake <- feed_intake %>%
mutate(
priority = case_when(
Out.Unit.Amount %in% preferred_units &
Out.Unit.Time == preferred_time &
Out.Unit.Animals == preferred_animals &
!is.na(ED.Error) ~ 1,
Out.Unit.Amount %in% preferred_units &
Out.Unit.Time == preferred_time &
Out.Unit.Animals == preferred_animals ~ 2,
TRUE ~ 3
)
) %>%
group_by(B.Code, A.Level.Name, ED.Intake.Item) %>%
slice_min(order_by = priority, with_ties = FALSE) %>%
ungroup() %>%
select(-priority) # optional
### keep unique units of ingredients
# Step 3: Join body weight
feed_intake <- feed_intake %>%
left_join(
weights_unique %>%
select(B.Code, A.Level.Name, Out.WG.Start, Out.WG.Unit),
by = c("B.Code", "A.Level.Name")
) %>%
group_by(A.Level.Name) %>%
mutate(
Out.WG.Start = ifelse(is.na(Out.WG.Start),
first(na.omit(Out.WG.Start)),
Out.WG.Start)
) %>%
ungroup() %>%
distinct()
# Step 4: Convert ED.Mean.T to kg/day
feed_intake <- feed_intake %>%
mutate(
Out.Unit = trimws(tolower(Out.Unit)),
Out.Unit.Amount = trimws(str_remove(Out.Unit.Amount, "/$")),
# Convert ED.Mean.T to kg/day where possible
ED.Mean.T = case_when(
Out.Unit.Amount %in% c("g", "g dm") ~ ED.Mean.T / 1000,
Out.Unit.Amount %in% c("kg", "kg dm") ~ ED.Mean.T,
Out.Unit.Amount == "g dm/100 g" ~ ED.Mean.T * 10 / 1000,
Out.Unit.Amount == "g/week" ~ ED.Mean.T / 7 / 1000,
Out.Unit.Amount == "mg" ~ ED.Mean.T / 1e6,
# Body weight–based conversions only if Out.WG.Start is not NA
Out.Unit.Amount %in% c("g dm/kg metabolic weight", "g/kg metabolic weight", "g dm/kg metabolic weight") & !is.na(Out.WG.Start) ~
(ED.Mean.T * Out.WG.Start^0.75) / 1000,
Out.Unit.Amount == "kg dm/kg metabolic weight" & !is.na(Out.WG.Start) ~
ED.Mean.T * Out.WG.Start^0.75,
Out.Unit.Amount %in% c("g dm/kg body weight", "g/kg body weight", "g/g body weight") & !is.na(Out.WG.Start) ~
(ED.Mean.T * Out.WG.Start) / 1000,
Out.Unit.Amount %in% c("g/kg body weight (0.75)", "g dm/kg body weight (0.75)") & !is.na(Out.WG.Start) ~
(ED.Mean.T * Out.WG.Start^0.75) / 1000,
Out.Unit.Amount %in% c("% body weight", "dm % body weight") & !is.na(Out.WG.Start) ~
(ED.Mean.T / 100) * Out.WG.Start,
TRUE ~ ED.Mean.T # fallback: leave unchanged
),
# Only update unit if we were able to convert to kg
Out.Unit.Amount = case_when(
Out.Unit.Amount %in% c("g", "g dm", "g dm/100 g", "g/week", "mg") ~ "kg",
Out.Unit.Amount %in% c("g/kg body weight", "g dm/kg body weight", "g/g body weight") & !is.na(Out.WG.Start) ~ "kg",
Out.Unit.Amount %in% c("g/kg metabolic weight", "g dm/kg metabolic weight", "g dm/kg metabolic weight") & !is.na(Out.WG.Start) ~ "kg",
Out.Unit.Amount %in% c("kg dm/kg metabolic weight") & !is.na(Out.WG.Start) ~ "kg",
Out.Unit.Amount %in% c("g/kg body weight (0.75)", "g dm/kg body weight (0.75)") & !is.na(Out.WG.Start) ~ "kg",
Out.Unit.Amount %in% c("% body weight", "dm % body weight") & !is.na(Out.WG.Start) ~ "kg",
TRUE ~ Out.Unit.Amount # keep the original if no conversion
)
) %>%
select(-Out.Unit)
# Retain most interpretable row per intake item
feed_intake <- feed_intake %>%
group_by(B.Code, A.Level.Name, ED.Intake.Item.Raw) %>%
arrange(
desc(!is.na(ED.Mean.T)),
desc(Out.Unit.Amount == "kg"),
desc(Out.Unit.Animals == "individual"),
desc(Out.Unit.Time == "day")
) %>%
slice(1) %>%
ungroup()
Performs basic outlier checks on feed intake data, identifying suspiciously high or low intake values based on predefined unit thresholds.
# Assuming your data is stored in a data frame named `feed_intake_data`
# Filter for units that imply kg/day/individual and check if ED.Mean.T > 50
# Identify suspicious feed intake values
## ---- detect_suspicious_feed_intake -----------------------------------------
# Thresholds
high_kg <- 50 # > 50 kg d-1 animal-1 ⟹ unrealistically high
low_kg <- 0.1 # < 0.1 kg d-1 animal-1 ⟹ unrealistically low
low_g_min <- 1 # 1–20 g range unlikely for daily intake
low_g_max <- 20
suspicious_feed_intake <- feed_intake %>%
mutate(
flag = case_when(
Out.Unit.Amount %in% c("kg", "kg dm") &
Out.Unit.Time == "day" &
Out.Unit.Animals == "individual" & ED.Mean.T > high_kg ~ "too_high_kg",
Out.Unit.Amount %in% c("kg", "kg dm") &
Out.Unit.Time == "day" &
Out.Unit.Animals == "individual" & ED.Mean.T < low_kg ~ "too_low_kg",
Out.Unit.Amount %in% c("g", "g dm") &
ED.Mean.T >= low_g_min & ED.Mean.T <= low_g_max ~ "implausibly_low_g",
TRUE ~ NA_character_
)
) %>%
filter(!is.na(flag)) %>%
select(B.Code, A.Level.Name, ED.Mean.T,
Out.Unit.Amount, Out.Unit.Time, Out.Unit.Animals, flag)
print(suspicious_feed_intake)
## # A tibble: 606 × 7
## B.Code A.Level.Name ED.Mean.T Out.Unit.Amount Out.Unit.Time Out.Unit.Animals
## <chr> <chr> <dbl> <chr> <chr> <chr>
## 1 AG0051 NG 0.0083 kg day individual
## 2 AG0051 NGD 0.0082 kg day individual
## 3 AG0051 PG 0.0091 kg day individual
## 4 AG0051 PGD 0.0092 kg day individual
## 5 AN0007 T1 0.0696 kg day individual
## 6 AN0007 T2 0.0735 kg day individual
## 7 AN0007 T3 0.0808 kg day individual
## 8 AN0007 T4 0.0769 kg day individual
## 9 AN0007 T5 0.0802 kg day individual
## 10 AN0020 Control T1 0.0503 kg day individual
## # ℹ 596 more rows
## # ℹ 1 more variable: flag <chr>
low_intake_codes <- feed_intake %>%
filter(
# (a) < 0.1 kg / day / individual
(Out.Unit.Amount %in% c("kg", "kg dm") &
Out.Unit.Time == "day" &
Out.Unit.Animals == "individual" &
ED.Mean.T < 0.1) |
# (b) 1–20 g reported in gram units
(Out.Unit.Amount %in% c("g", "g dm") &
ED.Mean.T >= 1 & ED.Mean.T <= 20)
) %>%
distinct(B.Code) %>% # keep each code only once
pull() # return as a plain character vector
low_intake_codes
## [1] "AG0051" "AN0007" "AN0020" "B01063" "BO1012"
## [6] "BO1015" "BO1017" "BO1030" "BO1042" "BO1048"
## [11] "BO1061" "BO1062" "BO1064" "BO1075" "BO1095"
## [16] "BO1099" "CJ1001" "CJ1006" "DK0037" "EM1000"
## [21] "EM1049" "EM1060" "EM1062" "EM1065" "EM1066"
## [26] "EM1068" "EM1070" "EM1077" "EM1081" "EM1092"
## [31] "EM1097" "EM1100" "EO0032" "EO0047" "EO0086"
## [36] "EO0102" "HK0005" "HK0007" "HK0011" "HK0014"
## [41] "HK0017" "HK0037" "HK0045" "HK0066.2" "HK0067"
## [46] "HK0102.2" "HK0118" "HK0228" "HK0260" "HK0319"
## [51] "HK0331" "HK0333" "HK0337" "JO1001" "JO1010"
## [56] "JO1015" "JO1020" "JO1022" "JO1023" "JO1025"
## [61] "JO1026" "JO1027" "JO1033" "JO1034" "JO1036"
## [66] "JO1038" "JO1040" "JO1041" "JO1042" "JO1044"
## [71] "JO1057" "JO1068" "JO1069" "JO1070" "JO1071"
## [76] "JO1074" "JO1075" "JO1077" "JO1078" "JS0047"
## [81] "JS0048" "JS0076" "JS0079.1" "JS0093" "JS0095"
## [86] "JS0123" "JS0124" "JS0125" "JS0134" "JS0135"
## [91] "JS0139" "JS0177" "JS0192" "JS0193" "JS0206"
## [96] "JS0207" "JS0208" "JS0209" "JS0281" "JS0291"
## [101] "JS0295" "JS0296" "JS0309" "JS0342" "JS0370"
## [106] "JS0371" "JS0393" "LM0120" "LM0266" "LM0277.1"
## [111] "LM0289" "LM0310" "LM0312" "LM0334" "LM0335"
## [116] "LM0354" "NJ0003" "NJ0047" "NN0021" "NN0083"
## [121] "NN0085" "NN0098" "NN0108" "NN0114" "NN0115"
## [126] "NN0118" "NN0119.2" "NN0189" "NN0198" "NN0201"
## [131] "NN0205" "NN0211" "NN0212" "NN0214" "NN0230"
## [136] "NN0251" "NN0272.1" "NN0272.1.2" "NN0272.2" "NN0321"
## [141] "NN0325" "NN0326" "NN0358" "NN0359" "NN0367"
## [146] "NN0377" "NN0396"
Some papers were identified with errors. This section corrects those error if they are still present before the next update of the data.
# Step: Convert suspiciously high kg values that are likely mislabelled g
feed_intake <- feed_intake %>%
mutate(
Out.Unit.Amount = case_when(
Out.Unit.Amount == "kg" & Out.Unit.Animals == "individual" & Out.Unit.Time == "day" & ED.Mean.T > 50 ~ "g", # correct mislabeling
TRUE ~ Out.Unit.Amount
)
)
If feed intake data is there for the entire diet and that the ingredients in ingredients dataset are in % or % of diet or g/kg we can convert
When we translate relative inclusion rates (“% of diet”, “g kg⁻¹
diet”) into absolute grams of dry matter, the correct denominator
depends on how feed‑intake data are structured. The table below
summarises every layout we encounter, what it looks like in the two raw
data frames (feed_intake and ingredients), how
the percentages should be interpreted, and therefore which intake
value—total diet or group‑level mix—must be used for the conversion.
| Scenario | What you see in feed_intake | What you see in ingredients | What the % or g kg⁻¹ actually mean | Correct denominator for conversion |
|---|---|---|---|---|
| A — No diet groups | One row per diet level marked Entire Diet | Every ingredient row has D.Item.Group = NA |
“%” = % of whole diet “g/kg” = g per kg whole diet |
Total‑diet intake (intake_total) |
| B — Mix is part of diet | Two + rows per diet level: • is_group = TRUE for
each mix (with its own intake)• is_entire_diet = TRUE |
Rows inside the mix have D.Item.Group = mix name;
stand‑alone ingredients have NA |
Inside‑mix “%” = % of mix Stand‑alone “%” = % of whole diet |
Inside‑mix → group intake
(intake_group)Stand‑alone → total‑diet intake |
| C — Mix is the whole diet | Only one is_group = TRUE row and that diet level has
no stand‑alone ingredients |
Every ingredient has the same D.Item.Group |
“%” = % of whole diet, even though recorded under a group label | Total‑diet intake (because group = total) |
| D — Group intake duplicated in ingredients | — | A row like D.Item == "Unspecified" in kg day⁻¹ that
repeats the group’s intake |
Not a nutrient source → should be removed | Drop the row entirely |
| E — Ingredient in feed_intake but not in ingredients | A stand‑alone is_group = FALSE row |
Missing ingredient row | We still need this ingredient and its intake | Create a synthetic ingredient row with
grams = ED.Mean.T × 1000 |
# Flag any row whose description is literally “Entire Diet”
feed_intake <- feed_intake %>%
mutate(
is_entire_diet = ifelse(
grepl("^\\s*entire\\s*diet\\s*$",
tolower(coalesce(ED.Intake.Item, ED.Intake.Item.Raw, "")),
ignore.case = TRUE),
TRUE,
is_entire_diet
)
)
# ─────────────────────────────────────────────────────────────────────────
# 0 ▸ Ensure we have a Source column (so we can tag “extracted” vs feed_intake)
# ─────────────────────────────────────────────────────────────────────────
ingredients <- ingredients %>%
{ if (!"Source" %in% names(.)) mutate(., Source = NA_character_) else . }
# ─────────────────────────────────────────────────────────────────────────
# 1 ▸ Pull diet‑level intakes
# • total_intake = “Entire Diet” rows
# • mix_intake = rows flagged is_group
# ─────────────────────────────────────────────────────────────────────────
intake_total <- feed_intake %>%
filter(is_entire_diet) %>%
transmute(B.Code, A.Level.Name, total_intake = ED.Mean.T)
intake_group <- feed_intake %>%
filter(is_group) %>%
transmute(B.Code, A.Level.Name,
D.Item.Group = ED.Intake.Item,
mix_intake = ED.Mean.T)
# ─────────────────────────────────────────────────────────────────────────
# 2 ▸ Drop “container” rows (scenario D)
# ─────────────────────────────────────────────────────────────────────────
group_lookup <- ingredients %>%
filter(!is.na(D.Item.Group)) %>%
distinct(B.Code, A.Level.Name, grp_name = str_trim(D.Item.Group))
ingredients <- ingredients %>%
left_join(group_lookup, by = c("B.Code","A.Level.Name")) %>%
filter(
!(D.Item=="Unspecified" &
D.Unit.Amount=="kg" &
D.Unit.Time=="day" &
!is.na(grp_name) &
str_trim(D.Type)==grp_name)
) %>%
select(-grp_name)
## Warning in left_join(., group_lookup, by = c("B.Code", "A.Level.Name")): Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 5 of `x` matches multiple rows in `y`.
## ℹ Row 1 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
## "many-to-many"` to silence this warning.
# ─────────────────────────────────────────────────────────────────────────
# 3 ▸ Attach intakes & flag pure‑mix diets (scenarios A–C)
# ─────────────────────────────────────────────────────────────────────────
group_flag <- ingredients %>%
left_join(intake_total, by = c("B.Code","A.Level.Name")) %>%
left_join(intake_group, by = c("B.Code","A.Level.Name","D.Item.Group")) %>%
group_by(B.Code,A.Level.Name) %>%
summarise(one_mix = all(!is.na(D.Item.Group)) &&
n_distinct(D.Item.Group)==1,
.groups="drop")
## Warning in left_join(., intake_total, by = c("B.Code", "A.Level.Name")): Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 8702 of `x` matches multiple rows in `y`.
## ℹ Row 1 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
## "many-to-many"` to silence this warning.
ingredients <- ingredients %>%
left_join(group_flag, by = c("B.Code","A.Level.Name")) %>%
left_join(intake_total, by = c("B.Code","A.Level.Name")) %>%
left_join(intake_group, by = c("B.Code","A.Level.Name","D.Item.Group"))
## Warning in left_join(., intake_total, by = c("B.Code", "A.Level.Name")): Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 8702 of `x` matches multiple rows in `y`.
## ℹ Row 1 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
## "many-to-many"` to silence this warning.
# ─────────────────────────────────────────────────────────────────────────
# 4 ▸ Convert “%” / “g/kg” → grams DM/day
# A: no group → total_intake
# B: group part‑of‑diet → mix_intake for that ingredient
# C: pure‑mix → still use total_intake
# ─────────────────────────────────────────────────────────────────────────
ingredients <- ingredients %>%
mutate(
base_intake = case_when(
D.Unit.Amount %in% c("%","% Diet") &
!is.na(mix_intake) & !one_mix ~ mix_intake,
D.Unit.Amount %in% c("%","% Diet","g/kg") ~ total_intake,
TRUE ~ NA_real_
),
D.Amount = case_when(
D.Unit.Amount %in% c("%","% Diet") & !is.na(base_intake) ~
(D.Amount/100) * base_intake * 1000,
D.Unit.Amount=="g/kg" & !is.na(base_intake) ~
D.Amount * base_intake,
TRUE ~ D.Amount
),
D.Unit.Amount = if_else(
D.Unit.Amount %in% c("%","% Diet","g/kg") & !is.na(base_intake),
"g", D.Unit.Amount
),
Source = coalesce(Source,"extracted"),
basal_intake_status = NA_character_
) %>%
select(-base_intake, -total_intake, -mix_intake, -one_mix)
# ─────────────────────────────────────────────────────────────────────────
# 5 ▸ Import truly missing items as basal diets (scenario E)
# ─────────────────────────────────────────────────────────────────────────
missing_basals <- feed_intake %>%
filter(!is_group, !is_entire_diet) %>%
anti_join(ingredients,
by = c("B.Code","A.Level.Name","ED.Intake.Item"="D.Item"))
basal_ingredients <- missing_basals %>%
transmute(
B.Code,
A.Level.Name = paste0("Base_",A.Level.Name),
D.Item = ED.Intake.Item,
D.Type = "Imported_from_feed_intake",
D.Amount = case_when(
Out.Unit.Amount=="kg" ~ ED.Mean.T*1000,
Out.Unit.Amount=="g" ~ ED.Mean.T,
TRUE ~ NA_real_
),
D.Unit.Amount = "g",
D.Unit.Time = Out.Unit.Time,
D.Unit.Animals = Out.Unit.Animals,
DC.Is.Dry = NA,
D.Ad.lib = NA,
D.Item.Group = NA_character_,
Units = NA,
T.Animals = T.Animals,
Prediction_Source = NA,
Prediction_Source.from_AName = NA,
Out.WG.Start = Out.WG.Start,
Out.WG.Unit = Out.WG.Unit,
Source = "feed_intake",
basal_intake_status = "imported"
)
# ─────────────────────────────────────────────────────────────────────────
# 6 ▸ Combine & drop only imported basals that still lack an amount
# ─────────────────────────────────────────────────────────────────────────
ingredients <- bind_rows(ingredients, basal_ingredients)
We excluded the basal diets because, in most of the livestock trials in our database, the basal ration is offered ad libitum and therefore lacks two pieces of information that are essential for a fair, productivity-based comparison: (i) actual feed-intake measurements and (ii) corresponding enteric-methane estimates. Without intake data we cannot convert emissions—or, in many cases, even estimate them—on a per-kilogram-of-product basis, and retaining these records would artificially inflate the number of “treatments” while contributing no usable information to the efficiency analyses. By focusing on the control diet (with measured intake) and the formulated treatment diets, we keep a coherent set of observations in which every data point can be expressed as both absolute emissions and emission intensity, allowing robust, like-for-like comparisons across studies.
ingredients <- ingredients %>%
filter(!str_starts(A.Level.Name, "Base"))
ingredients <- ingredients %>%
filter(!str_starts(D.Item.Group, "Base"))
merged_metadata$Animals.Diet.Comp <- merged_metadata$Animals.Diet.Comp %>%
filter(!str_starts(D.Item, "Base"))
We prioritize feed intake data over the amounts reported in the methods section, as feed intake is the key variable needed for emission calculations. When feed intake data is unavailable, we fall back on the harmonized ingredient amounts extracted earlier.
# Step 5: Remove feed intake entries that are actually full-diet labels
feed_intake_ingredients <- feed_intake %>%
filter(
str_trim(ED.Intake.Item) != str_trim(A.Level.Name) &
str_trim(ED.Intake.Item) != "Entire Diet" &
str_trim(ED.Intake.Item.Raw) != "Entire Diet"
) %>%
filter(!is.na(ED.Mean.T))
# Step 6: Merge with ingredients and override with ED.Mean.T when available
# FIXED VERSION
# Merge with ingredients and override with ED.Mean.T when available
ingredients <- ingredients %>%
left_join(
feed_intake_ingredients %>%
select(B.Code, A.Level.Name, ED.Intake.Item, ED.Mean.T, Out.Unit.Amount, Out.Unit.Animals, Out.Unit.Time),
by = c("B.Code", "A.Level.Name", "D.Item" = "ED.Intake.Item")
) %>%
mutate(
from_feed_intake = !is.na(ED.Mean.T),
D.Amount = if_else(from_feed_intake, ED.Mean.T, D.Amount),
D.Unit.Amount = if_else(from_feed_intake & !is.na(Out.Unit.Amount), Out.Unit.Amount, D.Unit.Amount),
D.Unit.Animals = if_else(from_feed_intake & !is.na(Out.Unit.Animals), Out.Unit.Animals, D.Unit.Animals),
D.Unit.Time = if_else(from_feed_intake & !is.na(Out.Unit.Time), Out.Unit.Time, D.Unit.Time)
) %>%
select(-ED.Mean.T, -Out.Unit.Amount, -Out.Unit.Animals, -Out.Unit.Time) %>%
distinct()
# Step 8: Final conversion of unit to grams (g)
ingredients <- ingredients %>%
mutate(
D.Amount = case_when(
D.Unit.Amount == "kg" ~ D.Amount * 1000,
TRUE ~ D.Amount
),
D.Unit.Amount = case_when(
D.Unit.Amount == "kg" ~ "g",
TRUE ~ D.Unit.Amount
)
)
TO do : See BO1070 we want the full diet to be included from feed intake which is not the case now see feed intake data the goal would be to add rows in ingredient to add info at the entire diet level have a close look to make it the same format as entire diets already in ingredients
#
# # --- 0 ▸ helper --------------------------------------------------------------
# # Convert relative (% / g kg⁻¹) units to grams given a denominator vector
# rel_to_abs <- function(df, denom) {
# df %>% mutate(
# D.Amount = case_when(
# D.Unit.Amount %in% c("%", "% Diet") ~ (D.Amount / 100) * denom * 1000,
#
# TRUE ~ D.Amount
# ),
# D.Unit.Amount = if_else(D.Unit.Amount %in% c("%", "% Diet"),
# "g", D.Unit.Amount)
# )
# }
#
# # ---------------------------------------------------------------------------
# # 1 ▸ build diet‑ and mix‑level intake look‑ups -----------------------------
# intake_total <- feed_intake %>%
# filter(is_entire_diet) %>%
# transmute(B.Code, A.Level.Name, diet_intake_kg = ED.Mean.T)
#
# intake_group <- feed_intake %>%
# filter(is_group) %>%
# transmute(B.Code, A.Level.Name, D.Item.Group = ED.Intake.Item, mix_intake_kg = ED.Mean.T)
#
# # ---------------------------------------------------------------------------
# # 2 ▸ organise diet structure ----------------------------------------------
# group_flag <- ingredients %>%
# group_by(B.Code, A.Level.Name) %>%
# summarise(first_group = first(na.omit(D.Item.Group)),
# all_one_group = all(!is.na(D.Item.Group)) & n_distinct(D.Item.Group[!is.na(D.Item.Group)]) == 1,
# .groups = "drop")
#
# intake_denoms <- group_flag %>%
# left_join(intake_total, by = c("B.Code", "A.Level.Name")) %>%
# left_join(intake_group, by = c("B.Code", "A.Level.Name", "first_group" = "D.Item.Group"))
#
# # ---------------------------------------------------------------------------
# # 3 ▸ convert % / g kg⁻¹ rows -----------------------------------------------
# relative_rows <- ingredients %>% filter(D.Unit.Amount %in% c("%", "% Diet"))
#
# ingredients_relative_fixed <- relative_rows %>%
# left_join(intake_denoms, by = c("B.Code", "A.Level.Name")) %>%
# mutate(den = coalesce(mix_intake_kg, diet_intake_kg)) %>%
# rel_to_abs(denom = .$den) %>%
# select(-den) %>%
# mutate(Source = "converted") # tag for precedence
#
# # ---------------------------------------------------------------------------
# # 4 ▸ clone pure <Base> diets into each treatment ---------------------------
# basal_diets <- ingredients %>% filter(grepl("^Base$", A.Level.Name, ignore.case = TRUE))
# expand_basals <- function(code) {
# trts <- ingredients %>% filter(B.Code == code & !grepl("^Base", A.Level.Name, ignore.case = TRUE)) %>% pull(A.Level.Name) %>% unique()
# map_dfr(trts, \(trt) basal_diets %>% filter(B.Code == code) %>% mutate(A.Level.Name = trt))
# }
# basal_expanded <- basal_diets %>% pull(B.Code) %>% unique() %>% map_dfr(expand_basals)
#
# # ---------------------------------------------------------------------------
# # 5 ▸ rows coming from feed_intake with prefixed diet name ------------------
# imported_match <- ingredients %>%
# filter(Source == "feed_intake" & grepl("^Base_", A.Level.Name)) %>%
# mutate(A.Level.Name = sub("^Base_", "", A.Level.Name))
#
# # ---------------------------------------------------------------------------
# # 6 ▸ assemble with precedence ---------------------------------------------
# combine_tbl <- bind_rows(
# imported_match, # 1 author‑measured grams
# ingredients_relative_fixed, # 2 converted grams (from originals)
# ingredients %>% anti_join(relative_rows, by = c("B.Code","A.Level.Name","D.Item")), # 3 originals
# basal_expanded # 4 clones
# ) %>%
# filter(!grepl("^Base_", A.Level.Name, ignore.case = TRUE))
#
# # ---------------------------------------------------------------------------
# # 6b ▸ SECOND‑PASS conversion for cloned % rows --------------------------------
# # ---------------------------------------------------------------------------
# leftover_rel <- combine_tbl %>%
# filter(D.Unit.Amount %in% c("%", "% Diet")) %>%
# # drop any pre‑existing diet_intake columns to avoid .x/.y suffixes
# select(-matches("^diet_intake_kg$")) %>%
# left_join(intake_total, by = c("B.Code", "A.Level.Name")) %>%
# mutate(
# D.Amount = case_when(
# D.Unit.Amount %in% c("%", "% Diet") ~ (D.Amount / 100) * diet_intake_kg * 1000,
# TRUE ~ D.Amount
# ),
# D.Unit.Amount = "g",
# Source = ifelse(Source == "extracted", "converted", Source)
# ) %>%
# select(-diet_intake_kg)
#
# combine_tbl <- combine_tbl %>%
# anti_join(leftover_rel, by = c("B.Code", "A.Level.Name", "D.Item")) %>%
# bind_rows(leftover_rel)
#
# # precedence ---------------------------------------------------------------
# src_levels <- c("feed_intake", "converted", "extracted", "basal_expanded")
#
# ingredients <- combine_tbl %>%
# mutate(Source = factor(Source, levels = src_levels, ordered = TRUE)) %>%
# arrange(B.Code, A.Level.Name, D.Item, Source) %>%
# distinct(B.Code, A.Level.Name, D.Item, .keep_all = TRUE) %>%
# ungroup() %>%
# mutate(Source = as.character(Source),
# Source = ifelse(is.na(Source), "basal_expanded", Source))
#
# # ---------------------------------------------------------------------------
# # 7 ▸ single‑NA remainder fill — **disabled at user request**
# # ---------------------------------------------------------------------------
# # The block that back‑calculates a missing basal amount has been removed for
# # now, so any ingredient still lacking `D.Amount` will remain NA.
# # ---------------------------------------------------------------------------
#
# # ---------------------------------------------------------------------------
# # 8 ▸ integrity check -------------------------------------------------------
# qc_totals <- ingredients %>%
# group_by(B.Code, A.Level.Name) %>%
# summarise(sum_g = sum(D.Amount, na.rm = TRUE), .groups = "drop") %>%
# left_join(intake_total, by = c("B.Code", "A.Level.Name")) %>%
# mutate(rel_diff = abs(sum_g/1000 - diet_intake_kg) / diet_intake_kg) %>%
# filter(rel_diff > 0.10)
#
# if(nrow(qc_totals) > 0) warning("Diets with >10 % mass‑balance error present; see qc_totals.")
#
# # ingredients_final is now harmonised (g DM d⁻¹) and ready for GE → CH₄
# ─────────────────────────────────────────────────────────────────────────────
# 1 ▸ diet-level intake DIRECTLY from feed_intake -----------------------------
feed_intake_diet <- feed_intake %>%
filter(is_entire_diet) %>% # just the whole-diet rows
transmute(B.Code, A.Level.Name,
diet_intake_kg = ED.Mean.T, # already in kg DM d-1
Source = "feed_intake")
# ─────────────────────────────────────────────────────────────────────────────
# 2 ▸ diet-level intake CALCULATED from ingredients_final --------------------
diet_intake_calc <- ingredients %>%
group_by(B.Code, A.Level.Name) %>%
summarise(diet_intake_kg = sum(D.Amount, na.rm = TRUE) / 1000, # g → kg
.groups = "drop") %>%
mutate(Source = "sum_ingredients")
# ─────────────────────────────────────────────────────────────────────────────
# 3 ▸ combine, giving precedence to the direct value -------------------------
feed_intake_diet <- feed_intake_diet %>%
full_join(diet_intake_calc,
by = c("B.Code", "A.Level.Name"),
suffix = c("_feed", "_calc")) %>%
transmute(
B.Code, A.Level.Name,
diet_intake_kg = coalesce(diet_intake_kg_feed, diet_intake_kg_calc),
Source = ifelse(!is.na(diet_intake_kg_feed),
"feed_intake", "sum_ingredients")
)
# diet_intake_final is ready to use
expand basal diets and see in feed intake if we have the basal diet for each treatment
library(dplyr)
library(stringr)
library(tidyr)
library(DT)
library(readr)
# --- Source data shortcuts ----------------------------------------------------
fi_raw <- merged_metadata$Data.Out %>%
filter(Out.Subind == "Feed Intake")
species_info <- merged_metadata$Prod.Out %>%
distinct(B.Code, Species = P.Product)
diets_per_paper <- merged_metadata$Animals.Diet %>%
distinct(B.Code, A.Level.Name) %>%
count(B.Code, name = "n_diets")
# --- Helper: detect "Entire Diet" rows in feed intake ------------------------
fi_marked <- fi_raw %>%
mutate(
ED.Intake.Item = str_squish(coalesce(ED.Intake.Item, "")),
ED.Intake.Item.Raw = str_squish(coalesce(ED.Intake.Item.Raw, "")),
is_entire_diet_row =
ED.Intake.Item == "" |
str_detect(ED.Intake.Item, regex("^entire\\s*diet$", ignore_case = TRUE)) |
str_detect(ED.Intake.Item.Raw, regex("^entire\\s*diet$", ignore_case = TRUE))
)
# --- Summaries per paper (B.Code) --------------------------------------------
fi_by_paper <- fi_marked %>%
group_by(B.Code) %>%
summarise(
n_feed_rows = n(),
n_with_value = sum(!is.na(ED.Mean.T)),
n_with_unit = sum(!is.na(Out.Unit)),
n_entire_diet_rows = sum(is_entire_diet_row, na.rm = TRUE),
has_feed_intake = any(!is.na(ED.Mean.T)),
has_entire_diet_intake= any(is_entire_diet_row & !is.na(ED.Mean.T), na.rm = TRUE),
.groups = "drop"
)
all_papers <- merged_metadata$Animals.Diet %>%
distinct(B.Code) %>%
left_join(species_info, by = "B.Code") %>%
left_join(diets_per_paper, by = "B.Code") %>%
left_join(fi_by_paper, by = "B.Code") %>%
mutate(
n_diets = replace_na(n_diets, 0L),
n_feed_rows = replace_na(n_feed_rows, 0L),
n_with_value = replace_na(n_with_value, 0L),
n_with_unit = replace_na(n_with_unit, 0L),
n_entire_diet_rows = replace_na(n_entire_diet_rows, 0L),
has_feed_intake = replace_na(has_feed_intake, FALSE),
has_entire_diet_intake = replace_na(has_entire_diet_intake, FALSE),
Status = if_else(has_feed_intake, "Has feed-intake data", "No feed-intake data")
) %>%
select(
B.Code, Species, n_diets,
Status,
n_feed_rows, n_with_value, n_with_unit, n_entire_diet_rows,
has_entire_diet_intake
) %>%
arrange(Species, desc(Status), B.Code)
# --- Handy vectors if you need them elsewhere --------------------------------
with_feed_intake <- all_papers %>% filter(Status == "Has feed-intake data") %>% pull(B.Code)
without_feed_intake <- all_papers %>% filter(Status == "No feed-intake data") %>% pull(B.Code)
# --- Save CSV ----------------------------------------------------------------
write_csv(all_papers, "paper_feed_intake_summary.csv")
# --- Show interactive table --------------------------------------------------
datatable(
all_papers,
rownames = FALSE,
filter = "top",
extensions = c("Buttons"),
options = list(
pageLength = 25,
dom = "Bfrtip",
buttons = list("copy", "csv", "excel"),
autoWidth = TRUE
)
)
Not all ingredients and diets are currently included in the digestibility and composition tables—only those for which data is available. We aim to add the missing ones to help complete the dataset and enable more comprehensive analysis. when basal diet amount is expressed in % deduce first from the total and then see the feed intake vfalue
## 1 · ingredient‑level rows (what you already had)
items_tbl <- merged_metadata$Animals.Diet %>%
distinct(B.Code, A.Level.Name, D.Item, Source) %>%
mutate(
D.Item = coalesce(D.Item, A.Level.Name), # fall back to level name
kind = "Item",
is_group = FALSE
) %>%
select(B.Code, D.Item, kind,is_group, Source)
## 2 · group‑level rows (PP, “Cactus‑Acacia …”, etc.)
groups_tbl <- merged_metadata$Animals.Diet %>%
filter(!is.na(D.Item.Group)) %>% # keep only lines that have a group tag
distinct(B.Code, D.Item.Group, Source) %>%
mutate(
kind = "Group",
is_group = TRUE) %>%
rename(D.Item = D.Item.Group) %>%
select(B.Code, D.Item, kind,is_group, Source)
## 3 · bind the two together and drop duplicates
all_diet_items <- bind_rows(items_tbl, groups_tbl) %>% distinct()
# --- Step 2: Get unique variables in the nutrition and digestibility datasets ---
nutrition_variables <- unique(na.omit(merged_metadata$Animals.Diet.Comp$DC.Variable))
digestibility_variables <- unique(na.omit(merged_metadata$Animals.Diet.Digest$DD.Variable))
# --- Step 3: Expand to full grid of all D.Item × variables (by B.Code and Source) ---
# Nutrition (Comp)
comp_missing <- all_diet_items %>%
crossing(DC.Variable = nutrition_variables) %>%
anti_join(
merged_metadata$Animals.Diet.Comp %>% select(B.Code, D.Item,is_group, DC.Variable),
by = c("B.Code", "D.Item", "DC.Variable","is_group")
) %>%
mutate(
DC.Value = NA_real_,
DC.Unit = NA_character_
)
# Digestibility (Digest)
digest_missing <- all_diet_items %>%
crossing(DD.Variable = digestibility_variables) %>%
anti_join(
merged_metadata$Animals.Diet.Digest %>% select(B.Code, D.Item,is_group, DD.Variable),
by = c("B.Code", "D.Item", "DD.Variable","is_group")
) %>%
mutate(
DD.Value = NA_real_,
DD.Unit = NA_character_
)
# --- Step 4: Bind the missing values into the existing datasets ---
merged_metadata$Animals.Diet.Comp <- bind_rows(merged_metadata$Animals.Diet.Comp, comp_missing)
merged_metadata$Animals.Diet.Digest <- bind_rows(merged_metadata$Animals.Diet.Digest, digest_missing)
# --- Step 5: Ensure each ingredient has a row for each nutrition variable ---
comp_completion <- merged_metadata$Animals.Diet.Comp %>%
distinct(B.Code, D.Item, is_entire_diet, Source) %>%
crossing(DC.Variable = nutrition_variables) %>%
anti_join(
merged_metadata$Animals.Diet.Comp %>% select(B.Code, D.Item, DC.Variable),
by = c("B.Code", "D.Item", "DC.Variable")
) %>%
mutate(
DC.Value = NA_real_,
DC.Unit = NA_character_
)
# --- Step 6: Ensure each ingredient has a row for each digestibility variable ---
digest_completion <- merged_metadata$Animals.Diet.Digest %>%
distinct(B.Code, D.Item, is_entire_diet, Source) %>%
crossing(DD.Variable = digestibility_variables) %>%
anti_join(
merged_metadata$Animals.Diet.Digest %>% select(B.Code, D.Item, DD.Variable),
by = c("B.Code", "D.Item", "DD.Variable")
) %>%
mutate(
DD.Value = NA_real_,
DD.Unit = NA_character_
)
# --- Step 7: Bind additional completion rows ---
merged_metadata$Animals.Diet.Comp <- bind_rows(merged_metadata$Animals.Diet.Comp, comp_completion)
merged_metadata$Animals.Diet.Digest <- bind_rows(merged_metadata$Animals.Diet.Digest, digest_completion)
rm(comp_completion,comp_missing,digest_completion,digest_missing)
##fill is entire diet column
library(dplyr)
library(stringr)
# Build lookups from Animals.Diet (ingredients and diets)
ingredient_lookup <- merged_metadata$Animals.Diet %>%
transmute(B.Code,
token = str_squish(str_to_lower(as.character(D.Item)))) %>%
filter(token != "") %>%
distinct() %>%
mutate(is_ingredient_token = TRUE)
diet_lookup <- merged_metadata$Animals.Diet %>%
transmute(B.Code,
token = str_squish(str_to_lower(as.character(A.Level.Name)))) %>%
filter(token != "") %>%
distinct() %>%
mutate(is_diet_token = TRUE)
# Helper to fill is_entire_diet in a given table (Comp or Digest)
fill_is_entire <- function(df) {
df %>%
mutate(
D.Item_norm = str_squish(str_to_lower(as.character(D.Item))),
is_entire_literal = str_detect(D.Item_norm, "^(entire|whole)\\s*diet$")
) %>%
left_join(ingredient_lookup, by = c("B.Code", "D.Item_norm" = "token")) %>%
left_join(diet_lookup, by = c("B.Code", "D.Item_norm" = "token")) %>%
mutate(
is_entire_diet = case_when(
is_entire_literal ~ TRUE, # literal Entire/Whole Diet
!is.na(is_ingredient_token) ~ FALSE, # matches ingredient
!is.na(is_diet_token) ~ TRUE, # matches diet name
TRUE ~ FALSE
),
# Optional: if you keep is_group in these tables and want groups never entire:
is_entire_diet = if ("is_group" %in% names(.)) if_else(is_group, FALSE, is_entire_diet) else is_entire_diet
) %>%
select(-D.Item_norm, -is_entire_literal, -is_ingredient_token, -is_diet_token)
}
# Apply to both tables AFTER your completion/bind steps
merged_metadata$Animals.Diet.Comp <- fill_is_entire(merged_metadata$Animals.Diet.Comp)
merged_metadata$Animals.Diet.Digest <- fill_is_entire(merged_metadata$Animals.Diet.Digest)
# Step 1: Get species for each B.Code
species_by_bcode <- merged_metadata$Prod.Out %>%
distinct(B.Code, P.Product)
# Step 2: Filter Comp table for GE values
# ge_papers <- merged_metadata$Animals.Diet.Comp %>%
# filter(DC.Variable == "GE", !is.na(DC.Value)) %>%
# distinct(B.Code) %>%
# left_join(species_by_bcode, by = "B.Code") %>%
# count(P.Product, name = "papers_with_GE")
#
# # View result
# ge_papers
# ── 2. Build a “variable → units” summary table ───────────────────────────────
table_units <- merged_metadata$Animals.Diet.Comp %>%
# keep rows where a unit is actually recorded
filter(!is.na(DC.Unit) & DC.Unit != "NA" & str_trim(DC.Unit) != "") %>%
# one row per unique (variable, unit) pair
distinct(DC.Variable, DC.Unit) %>%
# collapse multiple units for the same variable into a single comma-separated
group_by(DC.Variable) %>%
summarise(units = paste(sort(unique(DC.Unit)), collapse = ", "),
.groups = "drop") %>%
arrange(DC.Variable)
# ── 3. Inspect / export ───────────────────────────────────────────────────────
print(table_units)
## # A tibble: 62 × 2
## DC.Variable units
## <chr> <chr>
## 1 ADF %, % DM, g, g/100g, g/100g DM, g/kg, g/kg DM
## 2 ADL %, % DM, g/100g, g/100g DM, g/kg, g/kg DM
## 3 Ash %, % DM, g/100g, g/100g DM, g/kg, g/kg DM, Unspecified
## 4 CF %, % DM, g/kg, g/kg DM
## 5 CFbr %, % DM, g/100g DM, g/kg, g/kg DM
## 6 CFibre %, g/100g, g/100g DM, g/kg, g/kg(Prosopis)/ % (diet), Unspecified
## 7 CH %, % DM, g/kg DM
## 8 CP %, % DM, %/(g/100g for pumpkin leaves), g, g/100g, g/100g DM, g/…
## 9 CT %, % DM, g/kg, g/kg DM, mg/g
## 10 Ca %, % DM, g/100g, g/kg, g/kg DM
## # ℹ 52 more rows
# write_csv(table_units, "variable_units_lookup.csv") # if you want a file
###############################################################################
## Fix the mis-labelled GE unit for four specific ingredients
## *and* only in the paper NN0106
###############################################################################
bad_items <- c(
"Biscuit Waste Meal Dried(Sun)",
"Leucaena leucocephala Leaves Dried",
"Maize",
"Wheat Offal"
)
merged_metadata$Animals.Diet.Comp <- merged_metadata$Animals.Diet.Comp %>%
mutate(
DC.Unit = if_else(
D.Item %in% bad_items & # right ingredient name
B.Code == "NN0106" & # right paper / source
DC.Variable == "GE" & # only Gross Energy rows
DC.Unit == "kcal/g", # only if label is wrong
"kcal/kg", # corrected label
DC.Unit
)
)
animals <- merged_metadata$Animals.Diet.Comp %>% mutate(value_num = suppressWarnings(as.numeric(DC.Value)))
# 1. nutrient (% / g-based) ➜ g / kg DM ========================================
vars_gkg <- c("NDF", "EE", "CP", "Ash")
conv_gkg <- tribble(
~DC.Unit, ~factor,
"%", 10,
"% DM", 10,
"g/100g", 10,
"g/100g DM", 10,
"g/kg", 1,
"g/kg DM", 1
)
animals <- animals %>%
left_join(conv_gkg, by = "DC.Unit") %>%
mutate(
value_std_gkg = if_else(DC.Variable %in% vars_gkg & !is.na(factor),
value_num * factor, NA_real_),
unit_std_gkg = if_else(!is.na(value_std_gkg), "g/kg", NA_character_)
) %>%
select(-factor)
# 2. energy ➜ MJ / kg (DM) =====================================================
vars_MJ <- c("GE", "ME", "NE")
conv_MJ <- tribble(
~DC.Unit, ~factorMJ,
"MJ/kg", 1,
"MJ/kg DM", 1,
"MJ/100g", 10,
"MJ/100g DM", 10,
"MJ/g", 1e3,
"kJ/kg", 0.001,
"kJ/kg DM", 0.001,
"kJ/100g", 0.01,
"kJ/100g DM", 0.01,
"kJ/g", 1,
"J/mg", 1, # J per mg ×1e-6 MJ/J ×1e6 mg/kg
"kcal/kg", 0.004184,
"kcal/kg DM", 0.004184,
"kcal/100g", 0.04184,
"kcal/g", 4.184,
"Mcal/kg", 4.184,
"Mcal/kg DM", 4.184
)
animals <- animals %>%
left_join(conv_MJ, by = "DC.Unit") %>%
mutate(
value_std_MJ = if_else(DC.Variable %in% vars_MJ & !is.na(factorMJ),
value_num * factorMJ, NA_real_),
unit_std_MJ = if_else(!is.na(value_std_MJ), "MJ/kg", NA_character_)
) %>%
select(-factorMJ)
# 3. overwrite DC.Value / DC.Unit where a conversion succeeded -----------------
animals <- animals %>%
mutate(
DC.Value = coalesce(value_std_gkg, value_std_MJ, value_num) |> format(scientific = FALSE),
DC.Unit = coalesce(unit_std_gkg, unit_std_MJ, DC.Unit)
) %>%
select(-value_num, -value_std_gkg, -value_std_MJ,
-unit_std_gkg, -unit_std_MJ)
# 4. save back into the metadata list ------------------------------------------
merged_metadata$Animals.Diet.Comp <- animals
# ── END CHUNK ──────────────────────────────────────────────────────────────────
coverage_raw <- merged_metadata$Animals.Diet.Comp %>%
filter(!is.na(DC.Variable)) %>%
mutate(
# Clean and detect actual missing values
DC.Value_clean = str_trim(as.character(DC.Value)),
DC.Value_clean = na_if(DC.Value_clean, ""),
DC.Value_clean = na_if(DC.Value_clean, "NA"),
DC.Value_clean = na_if(DC.Value_clean, "na")
) %>%
group_by(DC.Variable) %>%
summarise(
n_total = n(),
n_non_missing = sum(!is.na(DC.Value_clean)),
coverage_percent = round(100 * n_non_missing / n_total, 1),
.groups = "drop"
) %>%
arrange(desc(coverage_percent))
Feedipedia is an online database that provides comprehensive information on feed ingredients used in livestock nutrition. It aggregates data on the nutritional composition, digestibility, and other relevant properties of various feeds.We will use it to infer data when info is missing from our data extraction. Links each ingredient to its corresponding entry in Feedipedia, enabling supplementation of missing nutritional and digestibility data such as GE, CP, EE, and Ash.
## download data from feedapedia (generated with the "feedapedia scrapping script available on github)
# Use the raw URL of the file on GitHub
url <- "https://github.com/ERAgriculture/ERL/raw/main/downloaded_data/feedipedia.parquet"
# Create a temporary file location with a .parquet extension
temp_file <- tempfile(fileext = ".parquet")
# Download the file in binary mode
download.file(url, destfile = temp_file, mode = "wb")
# Read the Parquet file
feedipedia <- read_parquet(temp_file)
## download AOM with the correspondance between feedipedia codes and diet items
# Define the raw URL for the Excel file
url <- "https://github.com/Mlolita26/ERA_Docs/raw/main/era_master_sheet(14).xlsx"
temp_file <- tempfile(fileext = ".xlsx")
download.file(url, destfile = temp_file, mode = "wb")
mastersheet <- read_excel(temp_file, sheet = "AOM") %>%
filter(L5 == "Feed Ingredient")
mastersheet$Feedipedia <- as.numeric(sub(".*node/", "", mastersheet$Feedipedia))
feedipedia_unit_audit <- feedipedia %>%
mutate(
Variable = as.character(Variable),
Unit = trimws(Unit)
) %>%
group_by(Variable, Unit) %>%
summarise(n_rows = n(), .groups = "drop") %>%
arrange(Variable, desc(n_rows)) %>%
group_by(Variable) %>%
mutate(
n_units = n(),
flag = if_else(n_units > 1, "⚠︎ multiple units", "")
) %>%
ungroup()
# ── wide human-friendly table ---------------------------------------------
unit_summary <- feedipedia_unit_audit %>%
unite(unit_n, Unit, n_rows, sep = " (n = ") %>%
mutate(unit_n = paste0(unit_n, ")")) %>%
group_by(Variable, flag) %>%
summarise(units = paste(unit_n, collapse = "; "), .groups = "drop") %>%
arrange(Variable)
###7.1) Harmonize feedipedia units
# helper -------------------------------------------------------------
to_canonical <- function(val, unit, vclass){
u <- str_trim(tolower(unit))
nu <- unit # keep the original unit by default
v <- val # keep the original value by default
if (vclass == "energy"){ # ---------- ENERGY ----------
if (u %in% c("mcal/kg dm","mcal/kg","mcal/kg d.m.")){
v <- val * 4.184 ; nu <- "MJ/kg"
} else if (u %in% c("kcal/kg dm","kcal/kg")){
v <- val * 0.004184 ; nu <- "MJ/kg"
} else if (u %in% c("mj/kg","mj/kg dm")){
nu <- "MJ/kg"
}
} else { # -------- NUTRIENT ----------
if (u %in% c("% dm","%","% as fed")){
v <- val * 10 ; nu <- "g/kg"
} else if (u %in% c("mg/kg dm","mg/kg")){
v <- val / 1000 ; nu <- "g/kg"
} else if (u %in% c("g/100g dm","g/100g")){
v <- val * 10 ; nu <- "g/kg"
} else if (u == "g dm"){
nu <- "g/kg"
}
}
list(val = v, unit = nu)
}
# ── 0 · variable lists ──────────────────────────────────────────────────────
nutrition_vars <- str_trim(c(
"ADF", "Crude protein", "Ether extract", "Ash","NDF","CP"
))
energy_vars <- c( # ✱ NEW: long names added here ✱
"GE", "Gross energy",
"ME", "Metabolizable energy",
"NE",
"AME", "AMEn", "TME", "TMEn",
"ME ruminants", "ME ruminants (FAO, 1982)", "ME ruminants (gas production)",
"MEn growing pig", "MEn rabbit",
"NE growing pig"
)
# ── 1 · conversion ─────────────────────────────────────────────────────────
feedipedia <- feedipedia %>%
rowwise() %>%
mutate(
conv = list(
if (Variable %in% c(nutrition_vars, energy_vars)) # ← look in BOTH lists
to_canonical(
Avg, Unit,
if_else(Variable %in% energy_vars, "energy", "nutrient")
)
else list(val = Avg, unit = Unit) # leave untouched
)
) %>%
tidyr::unnest_wider(conv) %>%
mutate(Avg = val, Unit = unit) %>%
select(-val, -unit) %>%
ungroup()
# --------------------------------------------------------------------------- #
# A) Attach Feedipedia node to Comp & Digest #
# --------------------------------------------------------------------------- #
add_feedipedia_node <- function(df, mastersheet) {
df %>%
left_join(mastersheet %>% select(Edge_Value, Feedipedia),
by = c("D.Item" = "Edge_Value")) %>%
distinct()
}
merged_metadata$Animals.Diet.Comp <- add_feedipedia_node(merged_metadata$Animals.Diet.Comp, mastersheet)
merged_metadata$Animals.Diet.Digest <- add_feedipedia_node(merged_metadata$Animals.Diet.Digest, mastersheet)
# --------------------------------------------------------------------------- #
# B) Clean Feedipedia dump #
# --------------------------------------------------------------------------- #
digestibility_keywords <- c(
"digestibility","degradability","energy digestibility","DE","ME","NE","AMEn",
"OM digestibility","DM digestibility","ruminants","rabbit","pig","poultry",
"horse","salmonids","hydrolysis"
)
feedipedia <- feedipedia %>%
mutate(
Variable = case_when(
tolower(Variable) == "gross energy" ~ "GE",
tolower(Variable) == "crude protein" ~ "CP",
tolower(Variable) == "ether extract" ~ "EE",
tolower(Variable) == "ash" ~ "Ash",
tolower(Variable) == "neutral detergent fibre" ~ "NDF",
TRUE ~ Variable
),
category = case_when(
Variable %in% c("GE","CP","EE","Ash","NDF") ~ "nutrition",
str_detect(Variable, regex(paste(digestibility_keywords, collapse="|"), TRUE)) ~ "digestibility",
TRUE ~ "nutrition"
),
feedipedia_node = as.numeric(feedipedia_node),
page_node = as.numeric(page_node)
)
# --------------------------------------------------------------------------- #
# C) Node-level means + cross-walk (node ↔ page) #
# --------------------------------------------------------------------------- #
nutrition_vars <- c("GE","CP","EE","Ash","NDF")
## C1 – mean values per *true* node
feedipedia_nutri_node <- feedipedia %>%
filter(Variable %in% nutrition_vars) %>%
group_by(feedipedia_node, Variable, Unit) %>%
summarise(Value = mean(Avg, na.rm = TRUE), .groups = "drop") %>%
rename(DC.Variable = Variable, DC.Value.feedipedia = Value)
feedipedia_dig_node <- feedipedia %>%
filter(str_detect(tolower(Variable), "energy digestibility")) %>%
mutate(DD.Variable = "DE") %>%
group_by(feedipedia_node, DD.Variable, Unit) %>%
summarise(DD.Value.feedipedia = mean(Avg, na.rm = TRUE), .groups = "drop")
## C2 – cross-walk: duplicate every node–page pair into two lookup IDs
id_pairs <- feedipedia %>% distinct(feedipedia_node, page_node)
id_xwalk <- bind_rows(
id_pairs %>% mutate(lookup_id = feedipedia_node),
id_pairs %>% mutate(lookup_id = page_node)
) %>%
filter(!is.na(lookup_id)) %>%
distinct(feedipedia_node, lookup_id)
## C3 – attach node means to BOTH lookup IDs
feedipedia_nutri_ids <- id_xwalk %>%
left_join(feedipedia_nutri_node, by = "feedipedia_node")
feedipedia_dig_ids <- id_xwalk %>%
left_join(feedipedia_dig_node, by = "feedipedia_node")
# --------------------------------------------------------------------------- #
# D) Source-tracking columns (create if missing) #
# --------------------------------------------------------------------------- #
if (!"nutrition_source" %in% names(merged_metadata$Animals.Diet.Comp))
merged_metadata$Animals.Diet.Comp$nutrition_source <- NA_character_
if (!"digestibility_source" %in% names(merged_metadata$Animals.Diet.Digest))
merged_metadata$Animals.Diet.Digest$digestibility_source <- NA_character_
# --------------------------------------------------------------------------- #
# E) Fill proximate-analysis gaps in Comp #
# --------------------------------------------------------------------------- #
merged_metadata$Animals.Diet.Comp <- merged_metadata$Animals.Diet.Comp %>%
mutate(DC.Value = suppressWarnings(as.numeric(DC.Value))) %>%
left_join(feedipedia_nutri_ids,
by = c("Feedipedia" = "lookup_id", "DC.Variable")) %>%
mutate(
use_fp = is.na(DC.Value) & !is.na(DC.Value.feedipedia),
DC.Unit = if_else(use_fp, Unit, DC.Unit),
DC.Value = coalesce(DC.Value, DC.Value.feedipedia),
nutrition_source = if_else(use_fp, "feedipedia", nutrition_source)
) %>%
select(-DC.Value.feedipedia, -Unit, -use_fp)
# --------------------------------------------------------------------------- #
# F) Fill DE digestibility gaps in Digest #
# --------------------------------------------------------------------------- #
merged_metadata$Animals.Diet.Digest <- merged_metadata$Animals.Diet.Digest %>%
mutate(Feedipedia = as.numeric(Feedipedia)) %>%
left_join(feedipedia_dig_ids,
by = c("Feedipedia" = "lookup_id", "DD.Variable")) %>%
mutate(
use_fp = is.na(DD.Value) & !is.na(DD.Value.feedipedia),
DD.Unit = if_else(use_fp, Unit, DD.Unit),
DD.Value = coalesce(DD.Value, DD.Value.feedipedia),
digestibility_source = if_else(use_fp, "feedipedia", digestibility_source)
) %>%
select(-DD.Value.feedipedia, -Unit, -use_fp)
# --------------------------------------------------------------------------- #
# Step 1: Get species for each B.Code
species_by_bcode <- merged_metadata$Prod.Out %>%
distinct(B.Code, P.Product)
# Step 2: Filter Comp table for GE values
ge_papers <- merged_metadata$Animals.Diet.Comp %>%
filter(DC.Variable == "GE", !is.na(DC.Value)) %>%
distinct(B.Code) %>%
left_join(species_by_bcode, by = "B.Code") %>%
count(P.Product, name = "papers_with_GE")
# View result
ge_papers
## P.Product papers_with_GE
## <char> <int>
## 1: Buffalo 4
## 2: Cattle 114
## 3: Chicken 59
## 4: Crustacean 1
## 5: Fish 58
## 6: Goat 171
## 7: Guinea Pig 1
## 8: Japanese Quail 4
## 9: Not Found 17
## 10: Pig 1
## 11: Pigs 29
## 12: Quail 2
## 13: Rabbit 8
## 14: Sheep 214
## 15: Snail 1
## 16: Turkey 1
## 17: Zebu Cattle 1
# 0 ▸ make sure the Feedipedia key exists and is numeric ---------------------
for (tbl in c("Animals.Diet.Comp", "Animals.Diet.Digest")) {
x <- merged_metadata[[tbl]]
# rename lower-case column if present
if (!"Feedipedia" %in% names(x) && "feedipedia" %in% names(x)) {
x <- dplyr::rename(x, Feedipedia = feedipedia)
}
# stop early if the key is still missing
if (!"Feedipedia" %in% names(x)) {
stop(sprintf("Column 'Feedipedia' is missing from %s.", tbl))
}
merged_metadata[[tbl]] <- mutate(x, Feedipedia = as.numeric(Feedipedia))
}
# 1 ▸ build reference tables (mean per node × variable × unit) ---------------
ref_nutri <- feedipedia %>%
filter(Variable %in% c("GE", "CP", "EE", "Ash", "NDF")) %>%
group_by(feedipedia_node, Variable, Unit) %>%
summarise(ref_val = mean(Avg, na.rm = TRUE), .groups = "drop") %>%
mutate(Unit = str_trim(Unit))
ref_dig <- feedipedia %>%
filter(str_detect(tolower(Variable), "energy digestibility")) %>%
mutate(Variable = "DE") %>%
group_by(feedipedia_node, Variable, Unit) %>%
summarise(ref_val = mean(Avg, na.rm = TRUE), .groups = "drop") %>%
mutate(Unit = str_trim(Unit))
# 2 ▸ nutrition QC -----------------------------------------------------------
qc_nutrition <- merged_metadata$Animals.Diet.Comp %>%
filter(!is.na(DC.Value)) %>% # raw value exists
mutate(DC.Unit = str_trim(DC.Unit)) %>%
left_join(ref_nutri,
by = c("Feedipedia" = "feedipedia_node",
"DC.Variable" = "Variable",
"DC.Unit" = "Unit")) %>%
filter(!is.na(ref_val)) %>% # have reference
mutate(rel_diff = abs(DC.Value - ref_val) / ref_val) %>%
filter(rel_diff > 0.10) %>% # > 10 % difference
arrange(desc(rel_diff))
# 3 ▸ digestibility QC -------------------------------------------------------
qc_digestibility <- merged_metadata$Animals.Diet.Digest %>%
filter(!is.na(DD.Value)) %>%
mutate(DD.Unit = str_trim(DD.Unit)) %>%
left_join(ref_dig,
by = c("Feedipedia" = "feedipedia_node",
"DD.Variable" = "Variable",
"DD.Unit" = "Unit")) %>%
filter(!is.na(ref_val)) %>%
mutate(rel_diff = abs(DD.Value - ref_val) / ref_val) %>%
filter(rel_diff > 0.10) %>%
arrange(desc(rel_diff))
# 4 ▸ quick report -----------------------------------------------------------
message(sprintf("⚠︎ Nutrition rows with >10 %% deviation: %s",
(nrow(qc_nutrition))))
## ⚠︎ Nutrition rows with >10 % deviation: 1601
message(sprintf("⚠︎ Digestibility rows with >10 %% deviation: %s",
(nrow(qc_digestibility))))
## ⚠︎ Digestibility rows with >10 % deviation: 1
# 5 ▸ optional interactive view ---------------------------------------------
# DT::datatable(qc_nutrition, caption = "Nutrition >10 % difference")
# DT::datatable(qc_digestibility, caption = "Digestibility >10 % difference")
# 1) Build node–page crosswalk from Feedipedia
id_pairs <- feedipedia %>%
distinct(feedipedia_node, page_node) %>%
filter(!is.na(feedipedia_node) & !is.na(page_node))
id_xwalk <- bind_rows(
id_pairs %>% mutate(type = "feedipedia_node", lookup_id = feedipedia_node),
id_pairs %>% mutate(type = "page_node", lookup_id = page_node)
) %>%
distinct(lookup_id, feedipedia_node, type)
# 2) Compare mastersheet codes with Feedipedia IDs
audit_fp <- mastersheet %>%
filter(!is.na(Feedipedia)) %>%
mutate(Feedipedia = as.numeric(Feedipedia)) %>%
left_join(id_xwalk, by = c("Feedipedia" = "lookup_id")) %>%
mutate(
status = case_when(
type == "feedipedia_node" ~ "OK (node match)",
type == "page_node" ~ "Wrong (page match → should use node)",
is.na(type) ~ "Unresolved (no match in feedipedia)"
)
) %>%
select(Edge_Value, Feedipedia, feedipedia_node, status) %>%
distinct()
## Warning in left_join(., id_xwalk, by = c(Feedipedia = "lookup_id")): Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 272 of `x` matches multiple rows in `y`.
## ℹ Row 949 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
## "many-to-many"` to silence this warning.
# 3) Inspect mis-matches
audit_fp %>%
arrange(status, Edge_Value) %>%
print(n = 50)
## # A tibble: 576 × 4
## Edge_Value Feedipedia feedipedia_node status
## <chr> <dbl> <dbl> <chr>
## 1 Acacia Albida 11539 11539 OK (node mat…
## 2 Acacia Albida Leaves 11538 11538 OK (node mat…
## 3 Acacia brevispica 11707 11707 OK (node mat…
## 4 Acacia brevispica Leaves 11706 11706 OK (node mat…
## 5 Acacia brevispica Pods Dried 11707 11707 OK (node mat…
## 6 Acacia karroo 12684 12684 OK (node mat…
## 7 Acacia nilotica 11794 11794 OK (node mat…
## 8 Acacia nilotica Leaves 11793 11793 OK (node mat…
## 9 Acacia nilotica Pods 11795 11795 OK (node mat…
## 10 Acacia nilotica Pods Dried 11794 11794 OK (node mat…
## 11 Acacia senegal 12174 12174 OK (node mat…
## 12 Acacia senegal Leaves 12173 12173 OK (node mat…
## 13 Acacia senegal Pods Dried 12174 12174 OK (node mat…
## 14 Acacia tortilis 12810 12810 OK (node mat…
## 15 Acacia tortilis Leaves 11512 11512 OK (node mat…
## 16 Acacia tortilis Leaves Dried Ground 12810 12810 OK (node mat…
## 17 Acacia tortilis Pods 12809 12809 OK (node mat…
## 18 Acacia tortilis Pods Dried 11513 11513 OK (node mat…
## 19 Acacia tortilis Pods Ground 11513 11513 OK (node mat…
## 20 Adansonia digitata 11821 11821 OK (node mat…
## 21 Adansonia digitata Seed 11824 11824 OK (node mat…
## 22 Albizia lebbeck 12243 12243 OK (node mat…
## 23 Albizia lebbeck Leaves 12243 12243 OK (node mat…
## 24 Albizia lebbeck Leaves Ground 12242 12242 OK (node mat…
## 25 Alfalfa 11745 11745 OK (node mat…
## 26 Alfalfa Dried 11744 11744 OK (node mat…
## 27 Alfalfa Dried Ground 11744 11744 OK (node mat…
## 28 Andropogon gayanus 12114 12114 OK (node mat…
## 29 Andropogon gayanus Dried 12116 12116 OK (node mat…
## 30 Andropogon gayanus Ensiled 12117 12117 OK (node mat…
## 31 Andropogon guianensis 12114 12114 OK (node mat…
## 32 Argan Pulp 11775 11775 OK (node mat…
## 33 Argan Seed Cake 11774 11774 OK (node mat…
## 34 Aristida adscensionis 12011 12011 OK (node mat…
## 35 Atella 22524 22524 OK (node mat…
## 36 Atella Dried 22524 22524 OK (node mat…
## 37 Atriplex halimus 25283 25283 OK (node mat…
## 38 Atriplex halimus Leaves and Twigs 22432 22432 OK (node mat…
## 39 Atriplex nummularia 22432 22432 OK (node mat…
## 40 Atriplex nummularia Leaves and Twigs 22432 22432 OK (node mat…
## 41 Azadirachta indica 12312 12312 OK (node mat…
## 42 Azadirachta indica Leaves 12312 12312 OK (node mat…
## 43 Balanites aegyptiaca 11591 11591 OK (node mat…
## 44 Balanites aegyptiaca Leaves 11591 11591 OK (node mat…
## 45 Bambara Groundnut 11555 11555 OK (node mat…
## 46 Bambara Groundnut Offal 11559 11559 OK (node mat…
## 47 Bambara Groundnut Offal Ground 11559 11559 OK (node mat…
## 48 Bamboo Leaves 12081 12081 OK (node mat…
## 49 Bambusa balcooa 11799 11799 OK (node mat…
## 50 Bambusa bambos 12081 12081 OK (node mat…
## # ℹ 526 more rows
Now that we have both raw and Feedipedia data, we can use them to infer missing values. If some instances of an ingredient have nutritional or digestibility data while others do not, we can impute the missing values using the average of the available ones.
# --- Standardize Units in Animals.Diet.Comp ---
merged_metadata$Animals.Diet.Comp <- merged_metadata$Animals.Diet.Comp %>%
mutate(
# 1) normalise text --------------------------------------------------------
DC.Unit = tolower(trimws(DC.Unit)),
DC.Value = as.numeric(DC.Value)
) %>%
# 2) convert values ----------------------------------------------------------
mutate(
DC.Value = case_when(
# --- gross energy → MJ kg-1 -------------------------------------------
DC.Variable == "GE" ~ case_when(
DC.Unit %in% c("mj/kg", "mj/kg dm") ~ DC.Value,
DC.Unit %in% c("kcal/kg", "kcal/kg dm") ~ DC.Value * 0.004184,
DC.Unit == "kcal/g" ~ DC.Value * 4.184,
DC.Unit == "kcal/100g" ~ DC.Value * 0.04184, # fixed
DC.Unit == "cal/g" ~ DC.Value * 0.004184, # fixed
DC.Unit == "mcal/kg" ~ DC.Value * 4.184,
DC.Unit %in% c("kj/kg", "kj/kg dm") ~ DC.Value / 1000,
DC.Unit == "kj/g" ~ DC.Value, # already MJ/kg
DC.Unit %in% c("kj/100g", "kj/100g dm", "kj/100 g dm") ~ DC.Value * 0.01,
DC.Unit == "mj/100g" ~ DC.Value * 10,
DC.Unit == "j/mg" ~ DC.Value, # fixed
TRUE ~ NA_real_
),
# --- everything else → g kg-1 -----------------------------------------
TRUE ~ case_when(
DC.Unit %in% c("%", "% dm", "% as fed", "g/100g", "g/100g dm") ~ DC.Value * 10,
DC.Unit %in% c("g", "g/kg", "g/kg dm") ~ DC.Value,
DC.Unit %in% c("kg", "kg/day") ~ DC.Value * 1000,
TRUE ~ NA_real_
)
),
# 3) harmonised unit labels ----------------------------------------------
DC.Unit = if_else(DC.Variable == "GE" & !is.na(DC.Value), "MJ/kg", "g/kg")
)
## ── 7.3 · override large raw-vs-Feedipedia discrepancies ────────────────────
library(dplyr)
deviation_threshold <- 0.10 # >10 % deviation → treat raw value as unreliable
override_if_far <- function(data,
ref_tbl,
value_col,
unit_col,
var_col,
node_col,
src_col,
threshold = deviation_threshold) {
# 0 ▸ add a copy of Unit that won’t be used as a key (so it survives the join)
ref_tbl <- ref_tbl %>% mutate(ref_unit = Unit)
# 1 ▸ build the join map (x = data, y = ref_tbl)
join_cols <- setNames(
c("feedipedia_node", "Variable", "Unit"), # cols in ref_tbl (y)
c(node_col, var_col, unit_col) # cols in data (x)
)
data %>%
# 2 ▸ attach the reference mean (with the extra `ref_unit`)
left_join(ref_tbl, by = join_cols) %>%
# 3 ▸ flag rows whose raw value deviates by more than `threshold`
mutate(
rel_diff = abs(.data[[value_col]] - ref_val) / ref_val,
override = !is.na(rel_diff) & rel_diff > threshold
) %>%
# 4 ▸ overwrite value/unit and record provenance when override is TRUE
mutate(
!!value_col := if_else(override, ref_val, .data[[value_col]]),
!!unit_col := if_else(override, ref_unit, .data[[unit_col]]),
!!src_col := case_when(
override ~ "feedipedia_override_large_diff",
TRUE ~ .data[[src_col]]
)
) %>%
# 5 ▸ clean up helpers
select(-ref_val, -rel_diff, -override, -ref_unit)
}
# ── Apply to nutrition -------------------------------------------------------
merged_metadata$Animals.Diet.Comp <- override_if_far(
data = merged_metadata$Animals.Diet.Comp,
ref_tbl = ref_nutri, # built earlier
value_col = "DC.Value",
unit_col = "DC.Unit",
var_col = "DC.Variable",
node_col = "Feedipedia",
src_col = "nutrition_source"
)
# ── Apply to digestibility ---------------------------------------------------
merged_metadata$Animals.Diet.Digest <- override_if_far(
data = merged_metadata$Animals.Diet.Digest,
ref_tbl = ref_dig, # built earlier
value_col = "DD.Value",
unit_col = "DD.Unit",
var_col = "DD.Variable",
node_col = "Feedipedia",
src_col = "digestibility_source"
)
# ── Quick log ----------------------------------------------------------------
message(sprintf(
"✅ Nutrition rows corrected: %s",
scales::comma(sum(merged_metadata$Animals.Diet.Comp$nutrition_source ==
"feedipedia_override_large_diff", na.rm = TRUE))
))
## ✅ Nutrition rows corrected: 1,610
message(sprintf(
"✅ Digestibility rows corrected: %s",
scales::comma(sum(merged_metadata$Animals.Diet.Digest$digestibility_source ==
"feedipedia_override_large_diff", na.rm = TRUE))
))
## ✅ Digestibility rows corrected: 1
# --- Compute Mean Value Per Ingredient and Variable ---
ingredient_means <- merged_metadata$Animals.Diet.Comp %>%
filter(is_entire_diet == FALSE, !is.na(DC.Value)) %>%
group_by(D.Item, DC.Variable) %>%
summarise(mean_value = mean(DC.Value, na.rm = TRUE), .groups = "drop")
merged_metadata$Animals.Diet.Comp <- merged_metadata$Animals.Diet.Comp %>%
left_join(ingredient_means, by = c("D.Item", "DC.Variable")) %>%
mutate(
nutrition_source = case_when(
nutrition_source == "feedipedia" ~ "feedipedia", # preserve Feedipedia
is.na(DC.Value) & !is_entire_diet & !is.na(mean_value) ~ "inferred_from_same_ingredient", # imputed
is.na(nutrition_source) & !is.na(DC.Value) ~ "raw data", # only if there's a value
TRUE ~ nutrition_source # keep whatever is there
),
DC.Value = if_else(
is.na(DC.Value) & !is_entire_diet & !is.na(mean_value),
mean_value,
DC.Value
)
) %>%
select(-mean_value)
# --- B) Compute Mean Digestibility Per Ingredient and Variable ---
digest_means <- merged_metadata$Animals.Diet.Digest %>%
filter(DD.Variable == "DE", is_entire_diet == FALSE, !is.na(DD.Value)) %>%
group_by(D.Item, DD.Variable, DD.Unit) %>%
summarise(mean_digest = mean(DD.Value, na.rm = TRUE), .groups = "drop")
# --- C) Fill Missing Digestibility Values with Ingredient Mean ---
merged_metadata$Animals.Diet.Digest <- merged_metadata$Animals.Diet.Digest %>%
left_join(digest_means, by = c("D.Item", "DD.Variable")) %>%
mutate(
digestibility_source = case_when(
digestibility_source == "feedipedia" ~ "feedipedia",
is.na(DD.Value) & !is_entire_diet & !is.na(mean_digest) ~ "inferred_from_same_ingredient",
is.na(digestibility_source) ~ "raw data",
TRUE ~ digestibility_source
),
DD.Value = if_else(
is.na(DD.Value) & !is_entire_diet & !is.na(mean_digest),
mean_digest,
DD.Value
)
) %>%
select(-mean_digest)
## Warning in left_join(., digest_means, by = c("D.Item", "DD.Variable")): Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 4490 of `x` matches multiple rows in `y`.
## ℹ Row 179 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
## "many-to-many"` to silence this warning.
since there is only very few digestibility in the raw data logic that we dont gain any data by doing this because feedipedia is infered for all the ingredients we have the info for.
# ✅ Count how many papers had GE filled by Feedipedia, and how many of those are Cattle / Sheep / Goat
# Step 1: Filter for Feedipedia-filled GE entries
ge_filled_by_feedipedia <- merged_metadata$Animals.Diet.Comp %>%
filter(DC.Variable == "GE", nutrition_source == "feedipedia") %>%
distinct(B.Code)
# Step 2: Total number of studies
n_ge_feedipedia <- nrow(ge_filled_by_feedipedia)
# Step 3: Add species info and filter to ruminants
ge_species_feedipedia <- ge_filled_by_feedipedia %>%
left_join(merged_metadata$`Prod.Out` %>% select(B.Code, P.Product), by = "B.Code") %>%
filter(P.Product %in% c("Cattle", "Sheep", "Goat")) %>%
distinct(B.Code, P.Product)
# Step 4: Count per species
# ge_species_counts <- ge_species_feedipedia %>%
# count(P.Product, name = "Papers_with_GE_from_Feedipedia")
#
# # Display summary
# cat("📌 Total papers with GE filled from Feedipedia:", n_ge_feedipedia, "\n")
# cat("📌 Of those, species breakdown:\n")
# print(ge_species_counts)
# Step 1: Get species for each B.Code
species_by_bcode <- merged_metadata$Prod.Out %>%
distinct(B.Code, P.Product)
# # Step 2: Filter Comp table for GE values
# ge_papers <- merged_metadata$Animals.Diet.Comp %>%
# filter(DC.Variable == "GE", !is.na(DC.Value)) %>%
# distinct(B.Code) %>%
# left_join(species_by_bcode, by = "B.Code") %>%
# count(P.Product, name = "papers_with_GE")
#
# # View result
# ge_papers
# Define species of interest
species_list <- c("Cattle", "Sheep", "Goat")
# Initialize result containers
species_summary <- list()
bcode_table <- list()
# Loop over each species
for (species in species_list) {
# All B.Codes for this species
species_bcodes <- merged_metadata$Prod.Out %>%
filter(P.Product == species) %>%
pull(B.Code) %>%
unique()
# Feed intake B.Codes
feed_data <- merged_metadata$Data.Out %>%
filter(B.Code %in% species_bcodes, !is.na(ED.Mean.T)) %>%
select(B.Code, D.Item = ED.Intake.Item)
feed_bcodes <- unique(feed_data$B.Code)
# GE B.Codes
ge_data <- merged_metadata$Animals.Diet.Comp %>%
filter(B.Code %in% species_bcodes, DC.Variable == "GE", !is.na(DC.Value)) %>%
select(B.Code, D.Item)
ge_bcodes <- unique(ge_data$B.Code)
# Matched B.Codes (join on both B.Code and D.Item)
matched_bcodes <- inner_join(feed_data, ge_data, by = c("B.Code", "D.Item")) %>%
pull(B.Code) %>%
unique()
# Store species-level summary
species_summary[[species]] <- tibble(
Species = species,
Total_BCodes = length(species_bcodes),
With_FEED = length(feed_bcodes),
With_GE = length(ge_bcodes),
With_BOTH = length(matched_bcodes)
)
# Store long-format bcode list
bcode_table[[species]] <- tibble(
B.Code = c(species_bcodes, feed_bcodes, ge_bcodes, matched_bcodes),
Category = rep(c("All", "With_FEED", "With_GE", "With_BOTH"),
times = c(length(species_bcodes), length(feed_bcodes), length(ge_bcodes), length(matched_bcodes))),
Species = species
) %>% distinct()
}
# Bind results
species_summary_df <- bind_rows(species_summary)
bcode_table_df <- bind_rows(bcode_table)
# Show results
print(species_summary_df)
## # A tibble: 3 × 5
## Species Total_BCodes With_FEED With_GE With_BOTH
## <chr> <int> <int> <int> <int>
## 1 Cattle 126 126 115 38
## 2 Sheep 223 223 214 106
## 3 Goat 186 185 173 72
head(bcode_table_df)
## # A tibble: 6 × 3
## B.Code Category Species
## <chr> <chr> <chr>
## 1 HK0263 All Cattle
## 2 AG0051 All Cattle
## 3 NN0356 All Cattle
## 4 NN0399 All Cattle
## 5 JS0269.2 All Cattle
## 6 JS0270 All Cattle
###7.3 Coverage after Feedipedia
coverage_feedipedia <- merged_metadata$Animals.Diet.Comp %>%
filter(!is.na(DC.Variable)) %>%
mutate(
# Clean and detect actual missing values
DC.Value_clean = str_trim(as.character(DC.Value)),
DC.Value_clean = na_if(DC.Value_clean, ""),
DC.Value_clean = na_if(DC.Value_clean, "NA"),
DC.Value_clean = na_if(DC.Value_clean, "na")
) %>%
group_by(DC.Variable) %>%
summarise(
n_total = n(),
n_non_missing = sum(!is.na(DC.Value_clean)),
coverage_percent = round(100 * n_non_missing / n_total, 1),
.groups = "drop"
) %>%
arrange(desc(coverage_percent))
## Link data to ILRI database
download.file(url, destfile = temp_file, mode = "wb")
# ---- Load ILRI nutrition data ----
ILRI <- read_excel(temp_file, sheet = "ssa_feedsdb", col_types = "text") %>%
janitor::clean_names()
## New names:
## • `ilri_feedname` -> `ilri_feedname...4`
## • `ilri_feedtype` -> `ilri_feedtype...5`
## • `ilri_feedcode` -> `ilri_feedcode...6`
## • `` -> `...27`
## • `ilri_feedcode` -> `ilri_feedcode...28`
## • `ilri_feedname` -> `ilri_feedname...29`
## • `ilri_feedtype` -> `ilri_feedtype...30`
names(ILRI) <- gsub("\\.\\.\\.[0-9]+", "", names(ILRI))
# Reshape ILRI data and standardize units
ilri_long <- ILRI %>%
select(ilri_feedcode = ilri_feedcode_6, cp, om, ndf, adf, adl, dm, me, n_el) %>%
pivot_longer(
cols = -ilri_feedcode,
names_to = "DC.Variable",
values_to = "ILRI_Value"
) %>%
mutate(
DC.Variable = toupper(DC.Variable),
ILRI_Value = case_when(
DC.Variable %in% c("ME", "NEL", "NEM", "NEG") ~ as.numeric(ILRI_Value),
TRUE ~ as.numeric(ILRI_Value) * 10 # convert % DM to g/kg DM
),
ILRI_Unit = case_when(
DC.Variable %in% c("ME", "NEL", "NEM", "NEG") ~ "MJ/kg DM",
TRUE ~ "g/kg DM"
)
)
# ---- Load and clean AOM mapping sheet ----
aom_map <- read_excel(temp_file, sheet = "AOM", col_types = "text") %>%
janitor::clean_names() %>%
filter(l5 == "Feed Ingredient") %>%
select(edge_value, ilri_code) %>%
mutate(ilri_code = as.character(ilri_code))
# Find feeds present in both ILRI and AOM based on ILRI code
common_feeds <- inner_join(
ilri_long %>% distinct(ilri_feedcode),
aom_map %>% distinct(ilri_code),
by = c("ilri_feedcode" = "ilri_code")
)
# ---- Merge ILRI feed codes into diet composition ----
merged_metadata$Animals.Diet.Comp <- merged_metadata$Animals.Diet.Comp %>%
left_join(aom_map, by = c("D.Item" = "edge_value"))%>%
distinct()
## Warning in left_join(., aom_map, by = c(D.Item = "edge_value")): Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 1285 of `x` matches multiple rows in `y`.
## ℹ Row 491 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
## "many-to-many"` to silence this warning.
join nutritional values
# ---- Join nutritional values from ILRI ----
merged_metadata$Animals.Diet.Comp <- merged_metadata$Animals.Diet.Comp %>%
left_join(ilri_long, by = c("ilri_code" = "ilri_feedcode", "DC.Variable"))%>%
distinct()
## Warning in left_join(., ilri_long, by = c(ilri_code = "ilri_feedcode", "DC.Variable")): Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 1 of `x` matches multiple rows in `y`.
## ℹ Row 5718 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
## "many-to-many"` to silence this warning.
fill missing values from ILRI database
merged_metadata$Animals.Diet.Comp <- merged_metadata$Animals.Diet.Comp %>%
mutate(
to_fill = is.na(DC.Value) & !is.na(ILRI_Value),
DC.Value = if_else(to_fill, ILRI_Value, DC.Value),
DC.Unit = if_else(to_fill, ILRI_Unit, DC.Unit),
nutrition_source = if_else(to_fill, "ilri", nutrition_source)
) %>%
select(-ILRI_Value, -ILRI_Unit, -to_fill)
merged_metadata$Animals.Diet.Comp <- merged_metadata$Animals.Diet.Comp %>%
mutate(
DC.Unit = case_when(
DC.Unit == "g/kg DM" ~ "g/kg",
DC.Unit == "MJ/kg DM" ~ "MJ/kg",
TRUE ~ DC.Unit
)
)%>%
distinct()
coverage_ILRI <- merged_metadata$Animals.Diet.Comp %>%
filter(!is.na(DC.Variable)) %>%
mutate(
# Clean and detect actual missing values
DC.Value_clean = str_trim(as.character(DC.Value)),
DC.Value_clean = na_if(DC.Value_clean, ""),
DC.Value_clean = na_if(DC.Value_clean, "NA"),
DC.Value_clean = na_if(DC.Value_clean, "na")
) %>%
group_by(DC.Variable) %>%
summarise(
n_total = n(),
n_non_missing = sum(!is.na(DC.Value_clean)),
coverage_percent = round(100 * n_non_missing / n_total, 1),
.groups = "drop"
) %>%
arrange(desc(coverage_percent))
coverage_summary <- bind_rows(
coverage_raw %>% mutate(Source = "raw"),
coverage_feedipedia %>% mutate(Source = "feedipedia"),
coverage_ILRI %>% mutate(Source = "ILRI")
)
# Pivot to wide format: one row per variable, columns for each source
coverage_summary_wide <- coverage_summary %>%
select(DC.Variable, Source, coverage_percent) %>%
pivot_wider(names_from = Source, values_from = coverage_percent)
# Optional: sort by raw coverage
coverage_summary_wide <- coverage_summary_wide %>%
arrange(desc(raw))
ingredients<-ingredients%>%
filter(D.Item!="Salt")%>%
filter(D.Item!="Water")
##— build the control key from your merged MT.Out
ctrl_key <- merged_metadata$MT.Out[, .(
B.Code,
A.Level.Name,
T.Name,
T.Control
)]
##— example join back onto your ingredient table
# (assuming your ingredients DT has B.Code + A.Level.Name)
ingredients <- merge(
x = ingredients,
y = ctrl_key,
by = c("B.Code","A.Level.Name"),
all.x = TRUE,
sort = FALSE
)
This table presents the relative percentage of each ingredient within a diet. To ensure accurate proportion calculations, we first filtered the dataset to include only diets with consistent units. Diets with incompatible or non-harmonizable units were excluded, as their proportions could not be reliably determined.
# ────────────────────────────────────────────────────────────────────────────
# (a) Ingredient-based totals ----------------------------------------------
consistent_diets <- ingredients %>% # ignore “Entire Diet” rows
filter(D.Type != "Entire Diet") %>%
group_by(B.Code, A.Level.Name) %>%
mutate(Unit_Count = n_distinct(D.Unit.Amount)) %>% # always 1 after conversion
filter(Unit_Count == 1) %>% # keep only single-unit diets
ungroup()
diet_completeness <- consistent_diets %>%
group_by(B.Code, A.Level.Name) %>%
summarise(
n_total = n(),
n_nonmissing = sum(!is.na(D.Amount)),
.groups = "drop"
) %>%
filter(n_total == n_nonmissing)
total_ingr <- consistent_diets %>%
semi_join(diet_completeness, by = c("B.Code", "A.Level.Name")) %>%
group_by(B.Code, A.Level.Name, D.Unit.Amount) %>%
summarise(Total_Amount_ingr = sum(D.Amount), .groups = "drop")
# ────────────────────────────────────────────────────────────────────────────
# (b) Whole-diet feed-intake rows ------------------------------------------
# Use the ORIGINAL ‘feed_intake’ table, because that still carries
# ED.Mean.T and Out.Unit.Amount.
feed_intake_diet_g <- feed_intake %>%
filter(is_entire_diet) %>% # only full-diet rows
mutate(
Intake_Amount_g = case_when( # kg → g
Out.Unit.Amount == "kg" ~ ED.Mean.T * 1000,
Out.Unit.Amount == "g" ~ ED.Mean.T,
TRUE ~ NA_real_
),
D.Unit.Amount = "g" # unify the unit
) %>%
select(B.Code, A.Level.Name, D.Unit.Amount, Intake_Amount_g)
# ────────────────────────────────────────────────────────────────────────────
# (c) Merge the two sources, prefer feed-intake -----------------------------
total_amounts <- full_join(
feed_intake_diet_g,
total_ingr,
by = c("B.Code", "A.Level.Name", "D.Unit.Amount")
) %>%
mutate(
Total_Amount = coalesce(Intake_Amount_g, Total_Amount_ingr)
) %>%
select(B.Code, A.Level.Name, D.Unit.Amount, Total_Amount)
# ────────────────────────────────────────────────────────────────────────────
# 2. Check unit correspondence ---------------------------------------------
unit_check <- ingredients %>%
select(B.Code, A.Level.Name, Ingredient_Unit = D.Unit.Amount) %>%
distinct() %>%
left_join(
total_amounts %>%
select(B.Code, A.Level.Name, Total_Unit = D.Unit.Amount),
by = c("B.Code", "A.Level.Name")
) %>%
mutate(Consistent_Units = Ingredient_Unit == Total_Unit)
# Keep only diets whose units match
consistent_unit_diets <- unit_check %>%
filter(Consistent_Units) %>%
select(B.Code, A.Level.Name)
ingredients_clean <- ingredients %>%
semi_join(consistent_unit_diets, by = c("B.Code", "A.Level.Name"))
totals_clean <- total_amounts %>%
semi_join(consistent_unit_diets, by = c("B.Code", "A.Level.Name"))
# ────────────────────────────────────────────────────────────────────────────
# 3. Percentage contribution of each ingredient -----------------------------
diet_percentages <- ingredients_clean %>%
filter(D.Type != "Entire Diet") %>%
left_join(totals_clean,
by = c("B.Code", "A.Level.Name", "D.Unit.Amount")) %>%
mutate(
Percentage_of_Diet = ifelse(Total_Amount > 0,
(D.Amount / Total_Amount) * 100,
NA_real_)
) %>%
select(
B.Code, A.Level.Name, D.Item, D.Type,
D.Amount, D.Unit.Amount, Percentage_of_Diet
)
# (optional) remove helpers --------------------------------------------------
rm(consistent_diets, diet_completeness, total_ingr,
feed_intake_diet_g, unit_check, consistent_unit_diets)
# Check if each diet sums close to 100%
validation <- diet_percentages %>%
group_by(B.Code, A.Level.Name) %>%
summarise(Total_Percentage = sum(Percentage_of_Diet, na.rm=TRUE)) %>%
ungroup()
# Display diets and their total percentages
# datatable(
# validation,
# options = list(
# pageLength = 10, # Show 10 rows per page by default
# scrollX = TRUE, # Enable horizontal scrolling
# autoWidth = TRUE, # Adjust column width automatically
# searchHighlight = TRUE # Highlight search terms
# ),
# caption = "Validation table"
# )
The lines that have 0 as validation are the ones that were feeding ad libitum for which we do not have a precise amount so it’s impossible to calculate the relative proportion of the ingredients.
Enteric methane (CH₄) emissions were estimated using the IPCC Tier 2 methodology, which incorporates animal-specific data such as gross energy intake (GE) and methane conversion factors (Ym). This approach improves accuracy over Tier 1 by reflecting diet composition, feed intake, and species-specific differences.
For GE values reported at the entire diet level, daily gross energy intake per animal was calculated as:
\[ \text{GE}_{\text{daily}} = \text{GE}_{\text{diet}} \times \text{Diet Dry Amount} \]
Where:
To do that we will have ton convert all feed intake data into kg/day/animal.
When GE values were reported at the ingredient level, and information on diet composition (ingredient proportions) was available, the diet-level GE was reconstructed as:
\[ \text{GE}_{\text{diet}} = \sum_{i=1}^{n} (\text{GE}_i \times p_i) \]
Where:
Then, daily intake was computed using the same formula above.
Annual methane emissions were calculated using the IPCC Tier 2 equation:
\[ \text{CH}_4 = \frac{\text{GE}_{\text{daily}} \times \text{Ym} \times 365}{55.65} \]
Where:
1. Table of Diet Composition Proportions
This table defines the proportion of each ingredient in the overall diet:
# Create an interactive table for "total_amounts"
datatable(
diet_percentages,
options = list(
pageLength = 10, # Show 10 rows per page by default
scrollX = TRUE, # Enable horizontal scrolling
autoWidth = TRUE, # Adjust column width automatically
searchHighlight = TRUE # Highlight search terms
),
caption = "Interactive Table: Relative % of Ingredients"
)
Used to compute reconstructed GE_diet from
ingredient-level GE values:
\[ \text{GE}_{\text{diet}} = \sum_{i=1}^{n} (\text{GE}_i \times p_i) \]
2. Table of Ingredient Dry Amounts
This table stores the amount of each ingredient fed per animal per day, already in or converted to dry matter:
# This 'ingredients' now contains all diets + basal integrated.
# Table 1: Final diet amounts with harmonized units (including basal)
datatable(
ingredients,
options = list(
pageLength = 10, # Show 10 rows per page by default
scrollX = TRUE, # Enable horizontal scrolling
autoWidth = TRUE, # Adjust column width automatically
searchHighlight = TRUE # Highlight search terms
),
caption = "Interactive Table: Diet Ingredients"
)
Useful when GE is per ingredient and diet is not proportionally described.
3. Table of Total Diet Dry Amounts
This is the total daily dry matter offered or consumed per animal, either directly from intake or inferred from ingredient sums:
datatable(
total_amounts,
options = list(
pageLength = 10, # Show 10 rows per page by default
scrollX = TRUE, # Enable horizontal scrolling
autoWidth = TRUE, # Adjust column width automatically
searchHighlight = TRUE # Highlight search terms
),
caption = "Interactive Table: Total Amounts of Diet Ingredients"
)
\[ \text{GE}_{\text{daily}} = \text{GE}_{\text{diet}} \times \text{Diet Dry Amount} \]
# 1. Total papers per species
species_counts <- merged_metadata$`Prod.Out` %>%
filter(P.Product %in% c("Cattle", "Sheep", "Goat")) %>%
distinct(B.Code, P.Product) %>%
count(P.Product, name = "Total_Studies")
# 2. All papers with GE (non-NA)
GE_data <- merged_metadata$Animals.Diet.Comp %>%
filter(DC.Variable == "GE", !is.na(DC.Value) & DC.Value != "") %>%
distinct(B.Code)
# 3. Papers with GE and species (intersection)
GE_species <- merged_metadata$`Prod.Out` %>%
filter(P.Product %in% c("Cattle", "Sheep", "Goat")) %>%
semi_join(GE_data, by = "B.Code") %>%
distinct(B.Code, P.Product) %>%
count(P.Product, name = "Studies_with_GE")
# 4. Combine the two
summary_table <- species_counts %>%
full_join(GE_species, by = "P.Product") %>%
mutate(across(everything(), ~replace_na(., 0)))
# Display table
kable(summary_table, caption = "Summary of Data Availability for Enteric CH₄ Emissions (Tier 2)")
| P.Product | Total_Studies | Studies_with_GE |
|---|---|---|
| Cattle | 126 | 115 |
| Goat | 186 | 173 |
| Sheep | 223 | 214 |
# 1. Total papers per species
species_counts <- merged_metadata$`Prod.Out` %>%
filter(P.Product %in% c("Cattle", "Sheep", "Goat")) %>%
distinct(B.Code, P.Product) %>%
count(P.Product, name = "Total_Studies")
# 2. All papers with GE (non-NA)
GE_data <- merged_metadata$Animals.Diet.Comp %>%
filter(DC.Variable == "GE") %>%
filter(DC.Variable == "GE", !is.na(DC.Value) & DC.Value != "") %>%
distinct(B.Code)
# 3. Papers with GE and species (intersection)
GE_species <- merged_metadata$`Prod.Out` %>%
filter(P.Product %in% c("Cattle", "Sheep", "Goat")) %>%
semi_join(GE_data, by = "B.Code") %>%
distinct(B.Code, P.Product) %>%
count(P.Product, name = "Studies_with_GE")
# 4. Combine the two
summary_table <- species_counts %>%
full_join(GE_species, by = "P.Product") %>%
mutate(across(everything(), ~replace_na(., 0)))
# Display table
kable(summary_table, caption = "Summary of Data Availability for Enteric CH₄ Emissions (Tier 2)")
| P.Product | Total_Studies | Studies_with_GE |
|---|---|---|
| Cattle | 126 | 115 |
| Goat | 186 | 173 |
| Sheep | 223 | 214 |
# Inspect the units used for GE
Gross_energy <- merged_metadata$Animals.Diet.Comp %>%
filter(DC.Variable == "GE")
Gross_energy$DC.Value<-as.numeric(Gross_energy$DC.Value)
unique(Gross_energy$DC.Unit)
## [1] "g/kg" "MJ/kg"
Gross_energy <- Gross_energy %>%
# STEP 1: Create GE_source from nutrition_source
mutate(
GE_source = case_when(
nutrition_source == "feedipedia" ~ "feedipedia",
nutrition_source == "inferred_from_same_ingredient" ~ "inferred_from_same_ingredient",
nutrition_source == "raw data" ~ "raw data",
TRUE ~ NA_character_
)
)%>%
select(-"nutrition_source")
g/kg ❌ Not an energy unit Likely a mistake, investigate or exclude % / % DM / % as fed ❌ Not energy units Same as above Unspecified ❌ Ambiguous Investigate or exclude
# 1. Total papers per species
species_counts <- merged_metadata$`Prod.Out` %>%
filter(P.Product %in% c("Cattle", "Sheep", "Goat")) %>%
distinct(B.Code, P.Product) %>%
count(P.Product, name = "Total_Studies")
# 2. All papers with GE (non-NA)
GE_data <- Gross_energy %>%
filter(DC.Variable == "GE", !is.na(DC.Value) & DC.Value != "") %>%
distinct(B.Code)
# 3. Papers with GE and species (intersection)
GE_species <- merged_metadata$`Prod.Out` %>%
filter(P.Product %in% c("Cattle", "Sheep", "Goat")) %>%
semi_join(GE_data, by = "B.Code") %>%
distinct(B.Code, P.Product) %>%
count(P.Product, name = "Studies_with_GE")
# 4. Combine the two
summary_table <- species_counts %>%
full_join(GE_species, by = "P.Product") %>%
mutate(across(everything(), ~replace_na(., 0)))
# Display table
kable(summary_table, caption = "Summary of Data Availability for Enteric CH₄ Emissions (Tier 2)")
| P.Product | Total_Studies | Studies_with_GE |
|---|---|---|
| Cattle | 126 | 115 |
| Goat | 186 | 173 |
| Sheep | 223 | 214 |
Gross_energy_check <- Gross_energy %>%
filter(!is.na(DC.Value))%>%
filter(GE_source=="raw data")
library(dplyr)
library(stringr)
# ── A · Define patterns for name-based matching ───────────────────────────────
salt_pat <- "(?i)\\b(salt|sodium\\s*chloride|nacl|mineral\\s*(lick|block))\\b"
water_pat <- "(?i)\\bwater\\b"
# false positives to exclude (extend if you see more)
exclude_pat <- "(?i)\\b(water\\s*hyacinth|saltbush)\\b"
# ── B · Type-based lookup (as you had) ────────────────────────────────────────
nonenergy_types <- c("Non-ERA Feed", "Supplement") # add more types if you have them
type_lookup <- merged_metadata$Animals.Diet %>%
filter(D.Type %in% nonenergy_types) %>%
distinct(B.Code, D.Item) %>%
mutate(flag_nonenergy = TRUE)
# ── C · Name-based lookup for salt/water (avoids false positives) ─────────────
name_lookup <- merged_metadata$Animals.Diet %>%
mutate(item_lc = str_squish(str_to_lower(D.Item))) %>%
filter(
(str_detect(item_lc, salt_pat) | str_detect(item_lc, water_pat)) &
!str_detect(item_lc, exclude_pat)
) %>%
distinct(B.Code, D.Item) %>%
mutate(flag_nonenergy = TRUE)
# ── D · Union both lookups (type + name) ──────────────────────────────────────
nonenergy_lookup <- bind_rows(type_lookup, name_lookup) %>%
distinct(B.Code, D.Item, .keep_all = TRUE)
# ── E · Ensure every (B.Code, D.Item) in lookup has a GE row (fill 0 if missing)
missing_nonenergy_ge <- nonenergy_lookup %>%
anti_join(
Gross_energy %>% filter(DC.Variable == "GE") %>% select(B.Code, D.Item),
by = c("B.Code", "D.Item")
) %>%
transmute(
B.Code, D.Item,
DC.Variable = "GE",
DC.Value = 0,
DC.Unit = "MJ/kg",
GE_source = "assumed_zero"
)
# ── F · Bind, tag, and overwrite all matching rows to zero GE ─────────────────
Gross_energy <- Gross_energy %>%
bind_rows(missing_nonenergy_ge) %>%
left_join(nonenergy_lookup, by = c("B.Code", "D.Item")) %>%
mutate(
DC.Value = if_else(DC.Variable == "GE" & flag_nonenergy, 0, DC.Value),
DC.Unit = if_else(DC.Variable == "GE" & flag_nonenergy, "MJ/kg", DC.Unit),
GE_source = if_else(DC.Variable == "GE" & flag_nonenergy, "assumed_zero", GE_source)
) %>%
select(-flag_nonenergy)
# tidy up
To estimate Gross Energy (GE) when direct measurements are not available, we use a predictive equation based on proximate nutritional components: crude protein (CP), ether extract (EE), and ash. This method provides an accurate approximation of the energy content of feed ingredients based on their macronutrient composition.
The equation used is adapted from Weiss et al. (2019) and is expressed as:
GE (Mcal/kg) = (0.056 × CP) + (0.094 × EE) + [0.042 × (100 − CP − EE − Ash)]
GE (MJ/kg) = GE (Mcal/kg) × 4.184
# --- STEP 1: Prepare predictors (CP, EE, Ash) as % of DM ---
# Keep relevant metadata for later
metadata_cols <- merged_metadata$Animals.Diet.Comp %>%
select(B.Code, D.Item, is_entire_diet, Feedipedia) %>%
distinct()
ge_predictors <- merged_metadata$Animals.Diet.Comp %>%
filter(DC.Variable %in% c("CP", "EE", "Ash")) %>%
mutate(
DC.Unit = trimws(tolower(DC.Unit)),
DC.Value = as.numeric(DC.Value),
# Harmonize to % of DM
DC.Value = case_when(
DC.Unit %in% c("%", "% dm", "% as fed", "g/100g", "g/100g dm") ~ DC.Value,
DC.Unit %in% c("g", "g/kg", "g/kg dm") ~ DC.Value / 10,
DC.Unit %in% c("kg", "kg/day") ~ DC.Value * 100,
TRUE ~ NA_real_
),
DC.Unit = "%"
) %>%
group_by(B.Code, D.Item, DC.Variable) %>%
summarise(DC.Value = mean(DC.Value, na.rm = TRUE), .groups = "drop") %>%
pivot_wider(names_from = DC.Variable, values_from = DC.Value) %>%
left_join(metadata_cols, by = c("B.Code", "D.Item"))
# --- STEP 2: Apply Weiss GE equation only where CP, EE, Ash are all present ---
ge_predicted <- ge_predictors %>%
mutate(
GE_Mcal_per_kg = ifelse(
!is.na(CP) & !is.na(EE) & !is.na(Ash),
(CP * 0.056) + (EE * 0.094) + ((100 - CP - EE - Ash) * 0.042),
NA_real_
),
GE_MJ_per_kg = GE_Mcal_per_kg * 4.184
) %>%
select(B.Code, D.Item, GE_MJ_per_kg, is_entire_diet, Feedipedia)
# --- STEP 3: Merge and only impute where GE is missing ---
Gross_energy <- Gross_energy %>%
# STEP 2: Join GE predictions (e.g. from Weiss)
full_join(ge_predicted, by = c("B.Code", "D.Item","is_entire_diet","Feedipedia")) %>%
# STEP 3: Only update GE_source if it is still missing
mutate(
GE_source = case_when(
is.na(GE_source) & !is.na(GE_MJ_per_kg) & is.na(DC.Value) ~ "predicted_from_Weiss_GE",
TRUE ~ GE_source
),
DC.Value = if_else(is.na(DC.Value), GE_MJ_per_kg, DC.Value),
DC.Unit = "MJ/kg",
DC.Variable = "GE"
) %>%
select(-GE_MJ_per_kg)
# 1. Total papers per species
species_counts <- merged_metadata$`Prod.Out` %>%
filter(P.Product %in% c("Cattle", "Sheep", "Goat")) %>%
distinct(B.Code, P.Product) %>%
count(P.Product, name = "Total_Studies")
# 2. All papers with GE (non-NA)
GE_data <- Gross_energy %>%
filter(DC.Variable == "GE", !is.na(DC.Value) & DC.Value != "") %>%
distinct(B.Code)
# 3. Papers with GE and species (intersection)
GE_species <- merged_metadata$`Prod.Out` %>%
filter(P.Product %in% c("Cattle", "Sheep", "Goat")) %>%
semi_join(GE_data, by = "B.Code") %>%
distinct(B.Code, P.Product) %>%
count(P.Product, name = "Studies_with_GE")
# 4. Combine the two
summary_table <- species_counts %>%
full_join(GE_species, by = "P.Product") %>%
mutate(across(everything(), ~replace_na(., 0)))
# Display table
kable(summary_table, caption = "Summary of Data Availability for Enteric CH₄ Emissions (Tier 2)")
| P.Product | Total_Studies | Studies_with_GE |
|---|---|---|
| Cattle | 126 | 119 |
| Goat | 186 | 182 |
| Sheep | 223 | 221 |
As we did in 9) we are inferring again GE data after estimation by proximal nutritional composition.
library(dplyr)
# 1) Per-ingredient mean GE (exclude entire diets; treat NA as FALSE just for this calc)
ingredient_means <- Gross_energy %>%
mutate(`_is_entire_diet` = coalesce(is_entire_diet, FALSE)) %>% # NA -> FALSE
filter(DC.Variable == "GE", `_is_entire_diet` == FALSE, !is.na(DC.Value)) %>%
group_by(D.Item) %>%
summarise(mean_value = mean(DC.Value, na.rm = TRUE), .groups = "drop")
# 2) Join and fill missing GE for ingredient rows only
Gross_energy <- Gross_energy %>%
left_join(ingredient_means, by = "D.Item") %>%
mutate(`_is_entire_diet` = coalesce(is_entire_diet, FALSE)) %>% # NA -> FALSE (decision only)
mutate(
GE_source = case_when(
DC.Variable == "GE" & is.na(DC.Value) & `_is_entire_diet` == FALSE & !is.na(mean_value)
~ "inferred_from_same_ingredient-Step2",
TRUE ~ GE_source
),
DC.Value = if_else(
DC.Variable == "GE" & is.na(DC.Value) & `_is_entire_diet` == FALSE & !is.na(mean_value),
mean_value,
DC.Value
),
# normalize unit whenever GE has a value (original or imputed)
DC.Unit = if_else(DC.Variable == "GE" & !is.na(DC.Value), "MJ/kg", DC.Unit)
) %>%
select(-mean_value, -`_is_entire_diet`)
# 1. Total papers per species
species_counts <- merged_metadata$`Prod.Out` %>%
filter(P.Product %in% c("Cattle", "Sheep", "Goat")) %>%
distinct(B.Code, P.Product) %>%
count(P.Product, name = "Total_Studies")
# 2. All papers with GE (non-NA)
GE_data <- Gross_energy %>%
filter(DC.Variable == "GE", !is.na(DC.Value) & DC.Value != "") %>%
distinct(B.Code)
# 3. Papers with GE and species (intersection)
GE_species <- merged_metadata$`Prod.Out` %>%
filter(P.Product %in% c("Cattle", "Sheep", "Goat")) %>%
semi_join(GE_data, by = "B.Code") %>%
distinct(B.Code, P.Product) %>%
count(P.Product, name = "Studies_with_GE")
# 4. Combine the two
summary_table <- species_counts %>%
full_join(GE_species, by = "P.Product") %>%
mutate(across(everything(), ~replace_na(., 0)))
# Display table
kable(summary_table, caption = "Summary of Data Availability for Enteric CH₄ Emissions (Tier 2)")
| P.Product | Total_Studies | Studies_with_GE |
|---|---|---|
| Cattle | 126 | 120 |
| Goat | 186 | 182 |
| Sheep | 223 | 221 |
# --- STEP 4: Prepare predictors for Weiss DE equation ---
# --- STEP 1: Store relevant metadata before pivot ---
de_metadata_cols <- merged_metadata$Animals.Diet.Comp %>%
select(B.Code, D.Item, is_entire_diet, Feedipedia) %>%
distinct()
# --- STEP 2: Prepare DE predictors and harmonize units ---
de_predictors <- merged_metadata$Animals.Diet.Comp %>%
filter(DC.Variable %in% c("CP", "EE", "Ash", "NDF")) %>%
mutate(
DC.Unit = trimws(tolower(DC.Unit)),
DC.Value = as.numeric(DC.Value),
# Harmonize to g/kg DM
DC.Value = case_when(
DC.Unit %in% c("%", "% dm", "% as fed", "g/100g", "g/100g dm") ~ DC.Value * 10,
DC.Unit %in% c("g", "g/kg", "g/kg dm") ~ DC.Value,
DC.Unit %in% c("kg", "kg/day") ~ DC.Value * 1000,
TRUE ~ NA_real_
),
DC.Unit = "g/kg"
) %>%
group_by(B.Code, D.Item, DC.Variable) %>%
summarise(DC.Value = mean(DC.Value, na.rm = TRUE), .groups = "drop") %>%
pivot_wider(names_from = DC.Variable, values_from = DC.Value) %>%
left_join(de_metadata_cols, by = c("B.Code", "D.Item"))
# --- STEP 3: Apply Weiss DE equation and convert to GE ---
de_predicted <- de_predictors %>%
mutate(
DE_Mcal_per_kg = ifelse(
!is.na(CP) & !is.na(EE) & !is.na(Ash) & !is.na(NDF),
1.044 + 0.0101 * CP + 0.0161 * EE - 0.0037 * Ash - 0.0023 * NDF,
NA_real_
),
DE_MJ_per_kg = DE_Mcal_per_kg * 4.184,
GE_MJ_per_kg = ifelse(!is.na(DE_MJ_per_kg), DE_MJ_per_kg / 0.54, NA_real_)
) %>%
select(B.Code, D.Item, GE_MJ_per_kg, is_entire_diet, Feedipedia)
# --- STEP 6: Fill remaining missing GE using DE-based predictions only if GE is still missing ---
Gross_energy <- full_join(Gross_energy, de_predicted, by = c("B.Code", "D.Item", "is_entire_diet", "Feedipedia")) %>%
mutate(
DC.Value = if_else(is.na(DC.Value) & !is.na(GE_MJ_per_kg), GE_MJ_per_kg, DC.Value),
GE_source = case_when(
is.na(GE_source) & is.na(DC.Value) & !is.na(GE_MJ_per_kg) ~ "predicted_from_Weiss_DE",
TRUE ~ GE_source # keep existing label (e.g., feedipedia, raw data, inferred, or missing)
),
DC.Unit = "MJ/kg",
DC.Variable = "GE"
) %>%
select(-GE_MJ_per_kg)
GE_data <- Gross_energy %>%
filter(DC.Variable == "GE", !is.na(DC.Value) & DC.Value != "") %>%
distinct(B.Code)
# 3. Papers with GE and species (intersection)
GE_species <- merged_metadata$`Prod.Out` %>%
filter(P.Product %in% c("Cattle", "Sheep", "Goat")) %>%
semi_join(GE_data, by = "B.Code") %>%
distinct(B.Code, P.Product) %>%
count(P.Product, name = "Studies_with_GE")
summary_table <- species_counts %>%
full_join(GE_species, by = "P.Product") %>%
mutate(across(everything(), ~replace_na(., 0)))
kable(summary_table, caption = "Summary of Data Availability for Enteric CH₄ Emissions (Tier 2)")
| P.Product | Total_Studies | Studies_with_GE |
|---|---|---|
| Cattle | 126 | 120 |
| Goat | 186 | 182 |
| Sheep | 223 | 221 |
We dont get more papers seems logical because probably the same ingredient as the equation for GE.
Converts digestible energy (DE) to gross energy (GE) using a default digestibility factor (54%) when DE is reported but GE is not.
# 1. Total papers per species
species_counts <- merged_metadata$`Prod.Out` %>%
filter(P.Product %in% c("Cattle", "Sheep", "Goat")) %>%
distinct(B.Code, P.Product) %>%
count(P.Product, name = "Total_Studies")
# 2. All papers with GE (non-NA)
DE_data <- merged_metadata$Animals.Diet.Comp %>%
filter(DC.Variable == "DE", !is.na(DC.Value) & DC.Value != "") %>%
distinct(B.Code)
# 3. Papers with GE and species (intersection)
DE_species <- merged_metadata$`Prod.Out` %>%
filter(P.Product %in% c("Cattle", "Sheep", "Goat")) %>%
semi_join(DE_data, by = "B.Code") %>%
distinct(B.Code, P.Product) %>%
count(P.Product, name = "Studies_with_DE")
# 4. Combine the two
summary_table <- species_counts %>%
full_join(DE_species, by = "P.Product") %>%
mutate(across(everything(), ~replace_na(., 0)))
# Display table
kable(summary_table, caption = "Summary of Data Availability for Enteric CH₄ Emissions (Tier 2)")
| P.Product | Total_Studies | Studies_with_DE |
|---|---|---|
| Cattle | 126 | 0 |
| Goat | 186 | 2 |
| Sheep | 223 | 2 |
Most of the DE data is coming from feediepdia which is expressed as a % of DE so we can’t convert that data back using the IPCC method.
# # Define digestibility conversion factor (Song et al., 2025)
# default_DE_fraction <- 0.54 # 54%
#
# # 1. Extract DE rows from merged_metadata
# de_data <- merged_metadata$Animals.Diet.Digest %>%
# filter(DD.Variable == "DE", !is.na(DD.Value) & DD.Value != "") %>%
# mutate(
# DD.Value = as.numeric(DD.Value),
# DD.Unit = trimws(tolower(DD.Unit)),
# DD.Value = case_when(
# DD.Unit %in% c("mj/kg", "mj/kg dm") ~ DD.Value,
# DD.Unit == "mcal/kg" ~ DD.Value * 4.184,
# DD.Unit == "kcal/kg" ~ DD.Value / 239,
# DD.Unit == "kcal/kg dm" ~ DD.Value / 239,
# DD.Unit == "kcal/100g" ~ (DD.Value * 10) / 239,
# DD.Unit == "kj/g" ~ DD.Value / 1000,
# DD.Unit == "kj/100g" ~ DD.Value / 100,
# TRUE ~ NA_real_
# ),
# DD.Unit = "MJ/kg"
# ) %>%
# mutate(
# GE_from_DE = DD.Value / default_DE_fraction
# ) %>%
# select(B.Code, D.Item, GE_from_DE)
#
# # 2. Merge GE_from_DE into your existing Gross_energy dataset
# Gross_energy <- Gross_energy %>%
# left_join(de_data, by = c("B.Code", "D.Item")) %>%
# mutate(
# # Flag only when GE was previously missing and DE is available
# GE_source = case_when(
# is.na(GE_source) & is.na(DC.Value) & !is.na(GE_from_DE) ~ "inferred_from_DE",
# TRUE ~ GE_source
# ),
# # Fill in missing GE values with GE_from_DE
# DC.Value = if_else(is.na(DC.Value), GE_from_DE, DC.Value),
# DC.Unit = "MJ/kg",
# DC.Variable = "GE"
# ) %>%
# select(-GE_from_DE)
# 1. Total studies per species
species_total <- merged_metadata$`Prod.Out` %>%
filter(P.Product %in% c("Cattle", "Sheep", "Goat")) %>%
distinct(B.Code, P.Product)
# 2. GE: Only B.Codes where DC.Variable == GE and DC.Value is NOT NA
ge_bcodes <- Gross_energy %>%
filter(DC.Variable == "GE", !is.na(DC.Value)) %>%
distinct(B.Code) %>%
pull()
# 3. Feed: Only B.Codes where D.Amount is NOT NA
feed_amount_bcodes <- ingredients %>%
filter(!is.na(D.Amount)) %>%
distinct(B.Code) %>%
pull()
# 4. B.Codes with BOTH GE value AND feed amount
both_bcodes <- intersect(ge_bcodes, feed_amount_bcodes)
# 5. Build final summary table by species
final_summary <- species_total %>%
mutate(
Has_GE = B.Code %in% ge_bcodes,
Has_FEED_or_Amount = B.Code %in% feed_amount_bcodes,
Has_BOTH = B.Code %in% both_bcodes
) %>%
group_by(P.Product) %>%
summarise(
Total_Studies = n_distinct(B.Code),
Studies_with_GE = sum(Has_GE),
Studies_with_FEED_or_Amount = sum(Has_FEED_or_Amount),
Studies_with_BOTH = sum(Has_BOTH)
) %>%
arrange(P.Product)
# Show summary
knitr::kable(final_summary, caption = "Final Summary (with actual GE and Feed Amount values)")
| P.Product | Total_Studies | Studies_with_GE | Studies_with_FEED_or_Amount | Studies_with_BOTH |
|---|---|---|---|---|
| Cattle | 126 | 120 | 29 | 29 |
| Goat | 186 | 182 | 34 | 34 |
| Sheep | 223 | 221 | 55 | 55 |
# Optional: List the actual B.Codes with both
bcode_list_both <- species_total %>%
filter(B.Code %in% both_bcodes) %>%
arrange(P.Product, B.Code)
# Show the B.Codes table
knitr::kable(bcode_list_both, caption = "B.Codes with BOTH GE (DC.Value) and Feed Intake (D.Amount)")
| B.Code | P.Product |
|---|---|
| BO1023 | Cattle |
| BO1028 | Cattle |
| BO1033 | Cattle |
| BO1038 | Cattle |
| CJ1006 | Cattle |
| EM1050 | Cattle |
| EM1093 | Cattle |
| EO0041 | Cattle |
| HK0086 | Cattle |
| HK0138 | Cattle |
| JS0201.1 | Cattle |
| JS0201.2 | Cattle |
| JS0205 | Cattle |
| JS0215.1 | Cattle |
| JS0215.2 | Cattle |
| JS0324.1 | Cattle |
| JS0324.2 | Cattle |
| JS0345 | Cattle |
| LM0101 | Cattle |
| LM0124 | Cattle |
| LM0127 | Cattle |
| LM0227.1 | Cattle |
| LM0227.2 | Cattle |
| LM0228.2 | Cattle |
| NJ0016 | Cattle |
| NJ0025 | Cattle |
| NN0207.2 | Cattle |
| NN0227 | Cattle |
| NN0280.2 | Cattle |
| AG0042 | Goat |
| BO1015 | Goat |
| BO1042 | Goat |
| BO1076 | Goat |
| BO1098 | Goat |
| BO1099 | Goat |
| DK0037 | Goat |
| EM1011 | Goat |
| EM1035 | Goat |
| EO0032 | Goat |
| EO0057 | Goat |
| HK0107 | Goat |
| HK0171 | Goat |
| HK0182 | Goat |
| HK0337 | Goat |
| HK0350 | Goat |
| JS0172 | Goat |
| JS0192 | Goat |
| JS0193 | Goat |
| JS0196 | Goat |
| JS0197 | Goat |
| JS0208 | Goat |
| JS0211 | Goat |
| JS0231 | Goat |
| JS0261 | Goat |
| LM0248 | Goat |
| LM0280 | Goat |
| LM0351 | Goat |
| NJ0013 | Goat |
| NN0083 | Goat |
| NN0092.2 | Goat |
| NN0229 | Goat |
| NN0374 | Goat |
| NN0375 | Goat |
| 0 | Sheep |
| AN0007 | Sheep |
| AN0021 | Sheep |
| AN0029 | Sheep |
| BO1008 | Sheep |
| BO1025 | Sheep |
| BO1034 | Sheep |
| BO1041 | Sheep |
| BO1048 | Sheep |
| BO1050 | Sheep |
| BO1051 | Sheep |
| BO1059 | Sheep |
| CJ1004 | Sheep |
| CJ1015 | Sheep |
| DK0007 | Sheep |
| EM1023 | Sheep |
| EM1031 | Sheep |
| EM1033 | Sheep |
| EM1034 | Sheep |
| EM1082 | Sheep |
| EM1085 | Sheep |
| EM1090 | Sheep |
| EO0046 | Sheep |
| HK0033 | Sheep |
| HK0082 | Sheep |
| JO1028 | Sheep |
| JO1036 | Sheep |
| JO1038 | Sheep |
| JO1082 | Sheep |
| JS0210.1 | Sheep |
| JS0210.2 | Sheep |
| JS0249 | Sheep |
| JS0262 | Sheep |
| JS0273 | Sheep |
| JS0276 | Sheep |
| JS0285 | Sheep |
| JS0291 | Sheep |
| JS0301 | Sheep |
| JS0316 | Sheep |
| JS0344 | Sheep |
| JS0350 | Sheep |
| JS0419 | Sheep |
| LM0298 | Sheep |
| NJ0008 | Sheep |
| NJ0032 | Sheep |
| NJ0044 | Sheep |
| NN0081 | Sheep |
| NN0083 | Sheep |
| NN0180.2 | Sheep |
| NN0200 | Sheep |
| NN0207.1 | Sheep |
| NN0212 | Sheep |
| NN0234 | Sheep |
| NN0329 | Sheep |
| NN0370 | Sheep |
Gross_energy <- Gross_energy %>%
filter(!str_starts(D.Item, "Base"))
missing_GE_items <- Gross_energy %>% # replace with your dataframe name
filter(DC.Variable == "GE", is.na(DC.Value)) %>%
count(D.Item, sort = TRUE, name = "n_missing")
To estimate enteric methane emissions, a methane conversion factor (Ym) is applied to the gross energy (GE) intake of livestock. Ym represents the proportion of the animal’s gross energy intake that is converted to methane and varies based on the species, productivity level, and diet composition. The IPCC (2019) provides updated Ym values based on feed digestibility and forage content. Below is the logic applied:
kable(
data.frame(
Category = c("Dairy Cattle (High)", "Dairy Cattle (Medium)", "Dairy Cattle (Low)", "Dairy Cattle (Unknown)",
"Meat Cattle (>75% forage)", "Meat Cattle (15–75% forage)", "Meat Cattle (<15% forage)", "Meat Cattle (Unknown)",
"Sheep", "Lambs", "Goats"),
Ym_Percent = c(5.7, 6.3, 6.5, 6.0, 7.0, 6.3, 4.0, 6.5, 6.7, 4.5, 5.5),
Condition = c("> 8500 kg milk/year", "5000–8500 kg milk/year", "< 5000 kg milk/year", "Unknown milk yield",
"Forage > 75%", "Forage 15–75%", "Forage < 15%", "Unknown forage %",
"Default for Sheep", "If 'lamb' in Herd.Stage", "Default for Goats")
),
caption = "Assigned Ym Values Based on IPCC 2019 Guidelines and Available Data"
)
| Category | Ym_Percent | Condition |
|---|---|---|
| Dairy Cattle (High) | 5.7 | > 8500 kg milk/year |
| Dairy Cattle (Medium) | 6.3 | 5000–8500 kg milk/year |
| Dairy Cattle (Low) | 6.5 | < 5000 kg milk/year |
| Dairy Cattle (Unknown) | 6.0 | Unknown milk yield |
| Meat Cattle (>75% forage) | 7.0 | Forage > 75% |
| Meat Cattle (15–75% forage) | 6.3 | Forage 15–75% |
| Meat Cattle (<15% forage) | 4.0 | Forage < 15% |
| Meat Cattle (Unknown) | 6.5 | Unknown forage % |
| Sheep | 6.7 | Default for Sheep |
| Lambs | 4.5 | If ‘lamb’ in Herd.Stage |
| Goats | 5.5 | Default for Goats |
library(dplyr)
library(stringr)
# ── Catch ALL meat-yield outcomes (final BW, carcass, slaughter body, etc.) ──
meat_regex <- paste0(
"(?i)", # case-insensitive
"\\bmeat\\s*yield\\b", # "Meat Yield-..."
"|\\bfinal\\s*body\\s*weight\\s*meat\\s*yield\\b", # reversed wording
"|\\bslaughter\\s*body\\b",
"|\\bcarcass\\b" # hot/cold/empty carcass
)
# 1) Filter yield rows: Milk + ALL meat outcomes
yield_data <- merged_metadata$Data.Out %>%
filter(
Out.Subind == "Milk Yield" |
str_detect(Out.Subind, meat_regex)
)
# 2) Normalize units to a small canonical set (same behavior as before)
yield_data <- yield_data %>%
mutate(
Out.Unit = tolower(trimws(Out.Unit)),
Out.Unit = case_when(
Out.Unit %in% c("kg","kg/d","kg/day","kg/day/individual","kg/individual/day",
"kg/d/individual","kg/individual") ~ "kg/individual/day",
Out.Unit %in% c("g","g/d","g/day","g/day/individual","g/d/individual",
"g/individual/day","g/individual") ~ "g/individual/day",
Out.Unit %in% c("mg/d/individual") ~ "mg/individual/day",
Out.Unit %in% c("l/d","l/day","l/d/individual","l/individual/day","l/individual",
"l/individual/day","l/individual") ~ "l/individual/day",
Out.Unit %in% c("ml/d/individual","ml/individual","ml/individual/day") ~ "ml/individual/day",
Out.Unit == "g/kg/individual" ~ "g/kg/individual",
Out.Unit == "kg/replicate/experiment" ~ "kg/replicate/experiment",
TRUE ~ Out.Unit
)
) %>%
mutate(
Out.Unit = tolower(trimws(Out.Unit)),
ED.Mean.T = case_when(
Out.Unit == "g/individual/day" ~ ED.Mean.T / 1000, # g → kg
Out.Unit == "mg/individual/day" ~ ED.Mean.T / 1e6, # mg → kg
Out.Unit == "ml/individual/day" ~ ED.Mean.T / 1000, # mL → L
TRUE ~ ED.Mean.T
),
Out.Unit = case_when(
Out.Unit %in% c("kg/individual/day","g/individual/day","mg/individual/day") ~ "kg/individual/day",
Out.Unit %in% c("l/individual/day","ml/individual/day") ~ "l/individual/day",
TRUE ~ NA_character_
)
) %>%
# Keep only acceptable units (same as before)
filter(Out.Unit %in% c("kg/individual/day","l/individual/day"))
# 3) Annual milk yield per B.Code (unchanged)
milk_yield <- yield_data %>%
filter(Out.Subind == "Milk Yield", Out.Unit == "kg/individual/day") %>%
group_by(B.Code) %>%
summarise(milk_kg_year = mean(ED.Mean.T, na.rm = TRUE) * 365, .groups = "drop")
# 4) Purpose per B.Code, now recognizing ALL meat outcomes; Dairy still wins
purpose_by_code <- merged_metadata$Data.Out %>%
mutate(
is_milk = Out.Subind == "Milk Yield",
is_meat = str_detect(Out.Subind, meat_regex)
) %>%
filter(is_milk | is_meat) %>%
mutate(
Purpose = case_when(
is_milk ~ "Dairy",
is_meat ~ "Meat",
TRUE ~ NA_character_
)
) %>%
group_by(B.Code) %>%
summarise(Purpose = if ("Dairy" %in% Purpose) "Dairy" else first(Purpose),
.groups = "drop")
# 5) (unchanged) Harmonize ingredient amounts to kg and proceed with your pipeline
ingredients <- ingredients %>%
mutate(
D.Amount = ifelse(D.Unit.Amount == "g", D.Amount / 1000, D.Amount),
D.Unit.Amount = ifelse(D.Unit.Amount == "g", "kg", D.Unit.Amount)
)
diet_valid_units <- ingredients %>%
group_by(B.Code, A.Level.Name) %>%
filter(n_distinct(D.Unit.Amount, na.rm = TRUE) == 1) %>%
ungroup()
forage_types <- c("Forage Trees","Crop Byproduct","Herbaceous Fodders","Agroforestry Fodders")
forage_proportions <- diet_valid_units %>%
filter(D.Type != "Entire Diet") %>%
group_by(B.Code, A.Level.Name) %>%
summarise(
any_na = any(is.na(D.Amount)),
total_diet = ifelse(any_na, NA_real_, sum(D.Amount, na.rm = TRUE)),
total_forage = ifelse(any_na, NA_real_, sum(D.Amount[D.Type %in% forage_types], na.rm = TRUE)),
n_ingredients = n(),
is_forage = all(D.Type[1] %in% forage_types),
D.Type.first = D.Type[1],
D.Unit.Amount = first(D.Unit.Amount),
forage_prop = case_when(
n_ingredients == 1 & D.Type.first %in% forage_types ~ 1,
n_ingredients == 1 & !(D.Type.first %in% forage_types) ~ 0,
total_diet > 0 ~ total_forage / total_diet,
TRUE ~ NA_real_
),
.groups = "drop"
)
#
# ingredients <- ingredients %>% # your table
# # 1 ── throw away the old “source‑based” label -----------------------------
# select(-D.Type) %>% # drop the column
#
# # 2 ── remove any diet that still has an NA in D.Type_Nutri ----------------
# group_by(B.Code, T.Name) %>% # <<– keys that identify a single diet
# filter( all(!is.na(D.Type_Nutri)) ) %>% # keep diet only if none are NA
# ungroup()
Reconstructs the gross energy of the entire diet from the weighted average of GE values of individual ingredients, when diet-level GE is not reported.
# --- Calculate GE_diet (MJ/kg) from ingredient GE and amount ---
# 1. Join GE values to ingredient amounts
# --- Calculate GE_diet (MJ/kg) ONLY if all ingredients have GE ---
# 1. Join GE values to ingredient amounts
# Collapse Gross_energy to mean DC.Value per D.Item
gross_energy_mean <- Gross_energy %>%
filter(!is_entire_diet) %>%
group_by(D.Item) %>%
summarise(DC.Value = mean(DC.Value, na.rm = TRUE), .groups = "drop")
# Join by D.Item only
ingredients_with_GE <- ingredients %>%
mutate(Diet_ID = paste(B.Code, A.Level.Name, sep = "__")) %>%
left_join(gross_energy_mean, by = "D.Item") %>%
mutate(
D.Amount_kg = case_when(
D.Unit.Amount == "g" ~ D.Amount / 1000,
D.Unit.Amount == "kg" ~ D.Amount,
TRUE ~ NA_real_
)
)
# 2. Count ingredients with/without GE
ingredient_counts <- ingredients_with_GE %>%
group_by(Diet_ID) %>%
summarise(
n_total = n(),
n_with_GE = sum(!is.na(DC.Value)),
.groups = "drop"
) %>%
filter(n_total == n_with_GE)
# 3. Reconstruct GE_diet (MJ/kg)
GE_from_ingredients <- ingredients_with_GE %>%
semi_join(ingredient_counts, by = "Diet_ID") %>%
group_by(Diet_ID, B.Code, A.Level.Name) %>%
summarise(
GE_diet = sum(DC.Value * D.Amount_kg) / sum(D.Amount_kg),
.groups = "drop"
)
# 4. Use GE_diet when full diet-level GE is missing
# Join forage proportions into GE_diet_reconstructed
GE_diet_reconstructed <- GE_from_ingredients %>%
left_join(total_amounts %>%
select(B.Code, A.Level.Name, Total_Amount, D.Unit.Amount),
by = c("B.Code", "A.Level.Name")) %>%
left_join(merged_metadata$Prod.Out %>% select(B.Code, P.Product, Herd.Stage), by = "B.Code") %>%
left_join(purpose_by_code, by = "B.Code") %>%
left_join(forage_proportions %>% select(B.Code, A.Level.Name, forage_prop),
by = c("B.Code", "A.Level.Name")) %>%
left_join(milk_yield, by = "B.Code") %>%
filter(P.Product %in% c("Cattle", "Sheep", "Goat")) %>%
mutate(
Total_Amount_kg = case_when(
D.Unit.Amount == "g" ~ Total_Amount / 1000,
D.Unit.Amount == "kg" ~ Total_Amount,
TRUE ~ NA_real_
),
D.Unit.Amount = "kg", # <- FORCE ALL TO BE KG
Ym = case_when(
P.Product == "Cattle" & Purpose == "Dairy" & milk_kg_year > 8500 ~ 0.057,
P.Product == "Cattle" & Purpose == "Dairy" & milk_kg_year >= 5000 ~ 0.063,
P.Product == "Cattle" & Purpose == "Dairy" & milk_kg_year < 5000 ~ 0.065,
P.Product == "Cattle" & Purpose == "Dairy" & is.na(milk_kg_year) ~ 0.06,
P.Product == "Cattle" & Purpose == "Meat" & forage_prop > 0.75 ~ 0.07,
P.Product == "Cattle" & Purpose == "Meat" & forage_prop > 0.15 ~ 0.063,
P.Product == "Cattle" & Purpose == "Meat" & forage_prop >= 0 ~ 0.04,
P.Product == "Cattle" & Purpose == "Meat" & is.na(forage_prop) ~ 0.065,
P.Product == "Sheep" & grepl("lamb", Herd.Stage, ignore.case = TRUE) ~ 0.045,
P.Product == "Sheep" ~ 0.067,
P.Product == "Goat" ~ 0.055,
TRUE ~ NA_real_
),
GE_daily = ifelse(!is.na(GE_diet) & !is.na(Total_Amount_kg),
GE_diet * Total_Amount_kg,
NA_real_),
CH4_kg_year = ifelse(!is.na(GE_daily) & !is.na(Ym),
GE_daily * Ym * 365 / 55.65,
NA_real_),
CH4_flag = case_when(
is.na(GE_diet) ~ "missing_GE_diet",
is.na(Total_Amount_kg) ~ "missing_amount",
is.na(Ym) ~ "missing_Ym",
is.na(GE_daily) ~ "GE_not_calculated",
is.na(CH4_kg_year) ~ "CH4_not_calculated",
TRUE ~ "GE_diet_based"
),
source = "GE_diet_from_ingredients"
) %>%
rename(D.Item = A.Level.Name) %>%
select(B.Code, D.Item, P.Product, Herd.Stage, GE_daily, CH4_kg_year,
Total_Amount = Total_Amount_kg, D.Unit.Amount, ED.Mean.T = Total_Amount_kg,
CH4_flag, Ym, source)
Gross_energy_entire <- Gross_energy %>%
filter(is_entire_diet == TRUE) %>%
# Add feed amount and unit
left_join(
total_amounts %>% select(B.Code, A.Level.Name, Total_Amount, D.Unit.Amount),
by = c("B.Code" = "B.Code", "D.Item" = "A.Level.Name")
) %>%
# Add product info and herd stage
left_join(
merged_metadata$Prod.Out %>% select(B.Code, P.Product, Herd.Stage),
by = "B.Code"
) %>%
# Add dairy vs meat purpose
left_join(purpose_by_code, by = "B.Code") %>%
mutate(
# Convert Total_Amount to kg/day
Total_Amount = case_when(
D.Unit.Amount == "g" ~ Total_Amount / 1000,
D.Unit.Amount == "kg" ~ Total_Amount,
TRUE ~ NA_real_
),
# Fix unit
D.Unit.Amount = case_when(
D.Unit.Amount == "g" ~ "kg",
TRUE ~ D.Unit.Amount
),
# Diagnostic reason for failure
reason_missing = case_when(
is.na(Total_Amount) & D.Unit.Amount %in% c("g/kg", "%", "% Diet") ~ "Cannot convert unit to kg",
TRUE ~ NA_character_
),
# Gross energy intake
GE_daily = ifelse(
!is.na(Total_Amount) & !is.na(DC.Value),
DC.Value * Total_Amount,
NA_real_
)
)
# Make sure 'purpose_by_code' is unique per B.Code
purpose_by_code_unique <- purpose_by_code %>%
distinct(B.Code, Purpose)
# Then join safely before using 'Purpose'
GE_all <- Gross_energy_entire %>%
filter(P.Product %in% c("Cattle", "Sheep", "Goat")) %>%
left_join(forage_proportions %>% select(B.Code, A.Level.Name, forage_prop),
by = c("B.Code", "D.Item" = "A.Level.Name")) %>%
left_join(milk_yield, by = "B.Code") %>%
left_join(
feed_intake_diet %>%
select(B.Code, A.Level.Name, diet_intake_kg),
by = c("B.Code", "D.Item" = "A.Level.Name")
) %>%
mutate(
Total_Amount = if_else(!is.na(diet_intake_kg), diet_intake_kg, Total_Amount),
D.Unit.Amount = if_else(!is.na(diet_intake_kg), "kg", D.Unit.Amount),
Ym = case_when(
# --- Dairy cattle based on productivity ---
P.Product == "Cattle" & Purpose == "Dairy" & milk_kg_year > 8500 ~ 0.057,
P.Product == "Cattle" & Purpose == "Dairy" & milk_kg_year >= 5000 ~ 0.063,
P.Product == "Cattle" & Purpose == "Dairy" & milk_kg_year < 5000 ~ 0.065,
P.Product == "Cattle" & Purpose == "Dairy" & is.na(milk_kg_year) ~ 0.06,
# --- Meat cattle based on forage proportions ---
P.Product == "Cattle" & Purpose == "Meat" & forage_prop > 0.75 ~ 0.07,
P.Product == "Cattle" & Purpose == "Meat" & forage_prop > 0.15 ~ 0.063,
P.Product == "Cattle" & Purpose == "Meat" & forage_prop >= 0 ~ 0.04,
P.Product == "Cattle" & Purpose == "Meat" & is.na(forage_prop) ~ 0.065,
# --- Sheep & Lambs ---
P.Product == "Sheep" & grepl("lamb", Herd.Stage, ignore.case = TRUE) ~ 0.045,
P.Product == "Sheep" ~ 0.067,
# --- Goats ---
P.Product == "Goat" ~ 0.055,
TRUE ~ NA_real_
),
GE_daily = ifelse(!is.na(DC.Value) & !is.na(Total_Amount),
DC.Value * Total_Amount,
NA_real_),
CH4_kg_year = ifelse(!is.na(GE_daily) & !is.na(Ym),
GE_daily * Ym * 365 / 55.65,
NA_real_),
CH4_flag = case_when(
is.na(DC.Value) ~ "missing_GE_value",
is.na(Ym) ~ "missing_Ym",
is.na(Total_Amount) ~ "missing_amount",
is.na(GE_daily) ~ "GE_not_calculated",
is.na(CH4_kg_year) ~ "CH4_not_calculated",
!is.na(diet_intake_kg) ~ "amount_inferred from feed intake",
TRUE ~ "ok"
)
)
# ──────────────────────────────────────────────────────────────────────────────
# 0) Keep only ingredient GE (not entire diet) and choose ONE value per item
# using a priority order of sources you already created upstream.
# Adjust the order if you prefer a different hierarchy.
# ──────────────────────────────────────────────────────────────────────────────
ge_priority <- c(
"raw data",
"feedipedia",
"predicted_from_Weiss_GE",
"predicted_from_Weiss_DE",
"inferred_from_same_ingredient-Step2",
"inferred_from_same_ingredient",
"assumed_zero"
)
GE_ingredients_best <- Gross_energy %>%
filter(is_entire_diet == FALSE, DC.Variable == "GE") %>%
mutate(
# normalize source label text
GE_source = str_squish(tolower(GE_source)),
# ensure unit is MJ/kg
DC.Unit = "MJ/kg",
# rank by source priority
src_rank = match(GE_source, ge_priority)
) %>%
arrange(B.Code, D.Item, src_rank, desc(!is.na(DC.Value))) %>%
group_by(B.Code, D.Item) %>%
# keep the single best row per (B.Code, D.Item)
slice_head(n = 1) %>%
ungroup() %>%
transmute(
B.Code,
D.Item,
GE_MJ_per_kg = as.numeric(DC.Value),
GE_source
)
# ──────────────────────────────────────────────────────────────────────────────
# 1) Ingredients only + attach GE + compute per-ingredient GE intake (MJ/day)
# NOTE: we keep your existing columns intact and just append new ones.
# ──────────────────────────────────────────────────────────────────────────────
ingredients_enriched <- ingredients %>%
# enforce "ingredients only" (i.e., exclude entire-diet pseudo-rows if present)
filter(D.Type != "Entire Diet") %>%
# join GE per (B.Code, D.Item)
left_join(GE_ingredients_best, by = c("B.Code", "D.Item")) %>%
# convert amounts to kg/day (where possible)
mutate(
D.Unit.Amount = str_squish(tolower(D.Unit.Amount)),
Amount_kg_day = case_when(
D.Unit.Amount == "kg" ~ D.Amount,
D.Unit.Amount == "g" ~ D.Amount / 1000,
TRUE ~ NA_real_ # unsupported units remain NA
),
# GE intake contributed by this ingredient (MJ/day)
GE_MJ_day = if_else(!is.na(GE_MJ_per_kg) & !is.na(Amount_kg_day),
GE_MJ_per_kg * Amount_kg_day,
NA_real_)
)
# ──────────────────────────────────────────────────────────────────────────────
# 2) (Optional) Quick diagnostics
# ──────────────────────────────────────────────────────────────────────────────
n_total_ing <- nrow(ingredients_enriched)
n_with_GE <- sum(!is.na(ingredients_enriched$GE_MJ_per_kg))
n_with_GEint <- sum(!is.na(ingredients_enriched$GE_MJ_day))
message("Ingredients total: ", n_total_ing,
" | with GE (MJ/kg): ", n_with_GE,
" | with GE intake (MJ/day): ", n_with_GEint)
## Ingredients total: 2879 | with GE (MJ/kg): 2521 | with GE intake (MJ/day): 2157
# If you want to keep only your original columns + added GE columns in a view:
ingredients_view <- ingredients_enriched %>%
select(
B.Code, A.Level.Name, D.Item, D.Type, D.Amount, D.Unit.Amount, D.Unit.Time,
D.Unit.Animals, DC.Is.Dry, D.Ad.lib, D.Item.Group, Source, Units,
T.Animals, Out.WG.Start, Out.WG.Unit, basal_intake_status,
Prediction_Source, Prediction_Source.from_AName, from_feed_intake,
T.Name, T.Control,
# New appended columns:
GE_MJ_per_kg, GE_source, Amount_kg_day, GE_MJ_day
)
###############################################################################
## SINGLE CHUNK: Sum Ingredient Intake & Merge into Entire-Diet GE + CH₄ Calc
###############################################################################
# 1) Sum ingredient-level feed intake ----------------------------------------
ingredient_intake_summed <- feed_intake_ingredients %>%
dplyr::group_by(B.Code, A.Level.Name) %>%
dplyr::summarise(
sum_ingredient_intake = sum(ED.Mean.T, na.rm = TRUE),
.groups = "drop"
)
# 2) Merge sums into entire-diet GE table, then compute intake, GE, and CH₄ --
GE_all_reconstructed <- Gross_energy_entire %>%
# Avoid .x/.y suffixes from prior columns
dplyr::select(-dplyr::any_of(c("Purpose",
"forage_prop",
"diet_intake_kg",
"sum_ingredient_intake",
"Total_Amount_kg"))) %>%
# Purpose (unique per paper)
dplyr::left_join(
purpose_by_code %>% dplyr::distinct(B.Code, Purpose),
by = "B.Code"
) %>%
# Forage proportion (match diet name to D.Item)
dplyr::left_join(
forage_proportions %>% dplyr::select(B.Code, A.Level.Name, forage_prop),
by = c("B.Code", "D.Item" = "A.Level.Name")
) %>%
# Milk yield (for dairy productivity bins)
dplyr::left_join(milk_yield, by = "B.Code") %>%
# Only ruminants
dplyr::filter(P.Product %in% c("Cattle", "Sheep", "Goat")) %>%
# Entire-diet intake if available (kg DM d-1)
dplyr::left_join(
feed_intake_diet %>% dplyr::select(B.Code, A.Level.Name, diet_intake_kg),
by = c("B.Code", "D.Item" = "A.Level.Name")
) %>%
# Sum of ingredient-level intakes (kg DM d-1)
dplyr::left_join(
ingredient_intake_summed,
by = c("B.Code", "D.Item" = "A.Level.Name")
) %>%
# Compute CH4 fields ---------------------------------------------------------
dplyr::mutate(
# Standardize any lingering total-amount unit to kg
Total_Amount_kg = dplyr::case_when(
D.Unit.Amount == "kg" ~ Total_Amount,
D.Unit.Amount == "g" ~ Total_Amount / 1000,
TRUE ~ NA_real_
),
# Ym by species/purpose/forage (IPCC-style rules)
Ym = dplyr::case_when(
# Dairy cattle by productivity
P.Product == "Cattle" & Purpose == "Dairy" & !is.na(milk_kg_year) & milk_kg_year > 8500 ~ 0.057,
P.Product == "Cattle" & Purpose == "Dairy" & !is.na(milk_kg_year) & milk_kg_year >= 5000 ~ 0.063,
P.Product == "Cattle" & Purpose == "Dairy" & !is.na(milk_kg_year) & milk_kg_year < 5000 ~ 0.065,
P.Product == "Cattle" & Purpose == "Dairy" & is.na(milk_kg_year) ~ 0.060,
# Meat cattle by forage proportion
P.Product == "Cattle" & Purpose == "Meat" & !is.na(forage_prop) & forage_prop > 0.75 ~ 0.070,
P.Product == "Cattle" & Purpose == "Meat" & !is.na(forage_prop) & forage_prop > 0.15 ~ 0.063,
P.Product == "Cattle" & Purpose == "Meat" & !is.na(forage_prop) & forage_prop >= 0 ~ 0.040,
P.Product == "Cattle" & Purpose == "Meat" & is.na(forage_prop) ~ 0.065,
# Sheep & lambs
P.Product == "Sheep" & stringr::str_detect(dplyr::coalesce(Herd.Stage, ""), "(?i)lamb") ~ 0.045,
P.Product == "Sheep" ~ 0.067,
# Goats
P.Product == "Goat" ~ 0.055,
TRUE ~ NA_real_
),
# Final daily intake (kg/day): prefer entire-diet intake, then total amount, then sum of ingredients
final_intake_kg_day = dplyr::case_when(
!is.na(diet_intake_kg) ~ diet_intake_kg,
!is.na(Total_Amount_kg) ~ Total_Amount_kg,
!is.na(sum_ingredient_intake) ~ sum_ingredient_intake,
TRUE ~ NA_real_
),
# Track source of intake
final_intake_source = dplyr::case_when(
!is.na(diet_intake_kg) ~ "Entire diet feed intake",
!is.na(Total_Amount_kg) ~ "Total_Amount",
!is.na(sum_ingredient_intake) ~ "Total feed intake (sum of ingredients)",
TRUE ~ "Missing"
),
# Daily GE intake (MJ/day) = GE content × daily intake
GE_daily = dplyr::case_when(
!is.na(final_intake_kg_day) & !is.na(DC.Value) ~ DC.Value * final_intake_kg_day,
TRUE ~ NA_real_
),
# Annual CH4 emissions (kg/yr)
CH4_kg_year = dplyr::if_else(
!is.na(GE_daily) & !is.na(Ym),
GE_daily * Ym * 365 / 55.65,
NA_real_
),
# Diagnostics
CH4_flag = dplyr::case_when(
is.na(DC.Value) ~ "missing_GE_value",
is.na(Ym) ~ "missing_Ym",
is.na(final_intake_kg_day) ~ "missing_final_intake",
is.na(GE_daily) ~ "GE_not_calculated",
is.na(CH4_kg_year) ~ "CH4_not_calculated",
TRUE ~ "ok"
)
)
#combine
# 1. Tag and select relevant columns from both tables
GE_all_clean <- GE_all %>%
mutate(source = "GE_all") %>%
select(
B.Code,
D.Item,
P.Product,
Herd.Stage,
GE_daily,
CH4_kg_year,
Total_Amount,
D.Unit.Amount,
diet_intake_kg,
CH4_flag,
Ym,
source
)
GE_reconstructed_clean <- GE_all_reconstructed %>%
mutate(source = "Reconstructed") %>%
select(
B.Code,
D.Item,
P.Product,
Herd.Stage,
GE_daily,
CH4_kg_year,
Total_Amount,
D.Unit.Amount,
final_intake_kg_day,
CH4_flag,
Ym,
source
)
GE_combined <- bind_rows(GE_all_clean, GE_reconstructed_clean, GE_diet_reconstructed) %>%
mutate(
source_priority = case_when(
source == "GE_all" ~ 1,
source == "Reconstructed" ~ 2,
source == "GE_diet_from_ingredients" ~ 3,
TRUE ~ 4
),
has_CH4 = !is.na(CH4_kg_year)
) %>%
arrange(B.Code, D.Item, desc(has_CH4), source_priority) %>%
distinct(B.Code, D.Item, .keep_all = TRUE) %>%
select(-has_CH4, -source_priority)
# Define non-emitting or pre-ruminant stages
non_emitting_stages <- c(
"Kid", "Kid Male", "Calf Female", "Weaned Calf Male",
"Young Male", "Young Female",
"Intact Kid Male"
)
# Filter them out
GE_combined <- GE_combined %>%
filter(!(Herd.Stage %in% non_emitting_stages))
GE_combined <- GE_combined %>%
left_join(ctrl_key, by = c("B.Code", "D.Item" = "A.Level.Name"))
GE_data <- GE_combined %>%
filter(!is.na(GE_daily) & GE_daily != "") %>%
distinct(B.Code)
# 3. Papers with GE and species (intersection)
GE_species <- merged_metadata$`Prod.Out` %>%
filter(P.Product %in% c("Cattle", "Sheep", "Goat")) %>%
semi_join(GE_data, by = "B.Code") %>%
distinct(B.Code, P.Product) %>%
count(P.Product, name = "Studies_with_GE")
summary_table <- species_counts %>%
full_join(GE_species, by = "P.Product") %>%
mutate(across(everything(), ~replace_na(., 0)))
kable(summary_table, caption = "Summary of Data Availability for Enteric CH₄ Emissions (Tier 2)")
| P.Product | Total_Studies | Studies_with_GE |
|---|---|---|
| Cattle | 126 | 32 |
| Goat | 186 | 67 |
| Sheep | 223 | 68 |
# Count unique B.Code with non-missing CH4 emissions
n_CH4_codes <- GE_combined %>%
filter(!is.na(CH4_kg_year)) %>%
distinct(B.Code) %>%
nrow()
n_CH4_codes
## [1] 162
n_CH4_codes_by_product <- GE_combined %>%
filter(!is.na(CH4_kg_year)) %>%
distinct(B.Code, P.Product) %>%
count(P.Product, name = "n_CH4_codes")
n_CH4_codes_by_product
## P.Product n_CH4_codes
## <char> <int>
## 1: Cattle 28
## 2: Goat 66
## 3: Sheep 68
# 1. Keep only codes with CH4
GE_with_CH4 <- GE_combined %>%
filter(!is.na(CH4_kg_year)) %>%
filter(CH4_kg_year<500)%>%
select(B.Code, D.Item, P.Product, CH4_kg_year)
# 2. Variability of CH4 within each code
ch4_variability <- GE_with_CH4 %>%
group_by(B.Code, P.Product) %>%
summarise(
n_diets = n_distinct(D.Item),
min_CH4 = min(CH4_kg_year, na.rm = TRUE),
max_CH4 = max(CH4_kg_year, na.rm = TRUE),
mean_CH4 = mean(CH4_kg_year, na.rm = TRUE),
sd_CH4 = sd(CH4_kg_year, na.rm = TRUE),
range_CH4 = max_CH4 - min_CH4,
.groups = "drop"
) %>%
filter(n_diets > 1) # only studies with >1 diet
# 3. Summary across products (Cattle, Sheep, Goat)
ch4_variability_summary <- ch4_variability %>%
group_by(P.Product) %>%
summarise(
n_studies = n(),
avg_range = mean(range_CH4, na.rm = TRUE),
avg_sd = mean(sd_CH4, na.rm = TRUE),
max_within_diff = max(range_CH4, na.rm = TRUE),
.groups = "drop"
)
ggplot(ch4_variability, aes(x = reorder(B.Code, range_CH4),
ymin = min_CH4, ymax = max_CH4,
color = P.Product)) +
geom_linerange(size = 1) +
geom_point(aes(y = min_CH4), shape = 21, fill = "white", size = 2) +
geom_point(aes(y = max_CH4), shape = 21, fill = "black", size = 2) +
coord_flip() +
labs(
x = "Study (B.Code)",
y = "CH₄ emissions (kg/year)",
title = "Within-study range of CH₄ emissions across diets"
) +
theme_minimal()
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
ch4_variability_stats <- GE_with_CH4 %>%
filter(CH4_kg_year <= 500) %>% # remove outliers
group_by(B.Code, P.Product) %>%
summarise(
n_diets = n(),
mean_CH4 = mean(CH4_kg_year, na.rm = TRUE),
sd_CH4 = sd(CH4_kg_year, na.rm = TRUE), # standard deviation
iqr_CH4 = IQR(CH4_kg_year, na.rm = TRUE), # interquartile range
cv_CH4 = sd(CH4_kg_year, na.rm = TRUE) / mean(CH4_kg_year, na.rm = TRUE), # coefficient of variation
.groups = "drop"
)
ggplot(ch4_variability_stats, aes(x = P.Product, y = cv_CH4, fill = P.Product)) +
geom_boxplot(alpha = 0.6) +
geom_jitter(width = 0.2, alpha = 0.5) +
labs(
x = "Species",
y = "CH₄ coefficient of variation",
title = "Relative variability of CH₄ emissions within studies"
) +
theme_minimal()
## Warning: Removed 18 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
## Warning: Removed 18 rows containing missing values or values outside the scale range
## (`geom_point()`).
# 1. Keep only codes with CH4
GE_with_CH4 <- GE_combined %>%
filter(!is.na(GE_daily)) %>%
filter(CH4_kg_year<500)%>%
select(B.Code, D.Item, P.Product, GE_daily)
# 2. Variability of CH4 within each code
ch4_variability <- GE_with_CH4 %>%
group_by(B.Code, P.Product) %>%
summarise(
n_diets = n_distinct(D.Item),
min_CH4 = min(GE_daily, na.rm = TRUE),
max_CH4 = max(GE_daily, na.rm = TRUE),
mean_CH4 = mean(GE_daily, na.rm = TRUE),
sd_CH4 = sd(GE_daily, na.rm = TRUE),
range_CH4 = max_CH4 - min_CH4,
.groups = "drop"
) %>%
filter(n_diets > 1) # only studies with >1 diet
# 3. Summary across products (Cattle, Sheep, Goat)
ch4_variability_summary <- ch4_variability %>%
group_by(P.Product) %>%
summarise(
n_studies = n(),
avg_range = mean(range_CH4, na.rm = TRUE),
avg_sd = mean(sd_CH4, na.rm = TRUE),
max_within_diff = max(range_CH4, na.rm = TRUE),
.groups = "drop"
)
ch4_variability_stats <- GE_with_CH4 %>%
filter(GE_daily <= 500) %>% # remove outliers
group_by(B.Code, P.Product) %>%
summarise(
n_diets = n(),
mean_CH4 = mean(GE_daily, na.rm = TRUE),
sd_CH4 = sd(GE_daily, na.rm = TRUE), # standard deviation
iqr_CH4 = IQR(GE_daily, na.rm = TRUE), # interquartile range
cv_CH4 = sd(GE_daily, na.rm = TRUE) / mean(GE_daily, na.rm = TRUE), # coefficient of variation
.groups = "drop"
)
ggplot(ch4_variability_stats, aes(x = P.Product, y = cv_CH4, fill = P.Product)) +
geom_boxplot(alpha = 0.6) +
geom_jitter(width = 0.2, alpha = 0.5) +
labs(
x = "Species",
y = "GE coefficient of variation",
title = "Relative variability of GE within studies"
) +
theme_minimal()
## Warning: Removed 18 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
## Warning: Removed 18 rows containing missing values or values outside the scale range
## (`geom_point()`).
# Step 1: Get list of B.Codes from GE_combined
relevant_BCodes <- GE_combined %>%
distinct(B.Code)
# Step 2: Filter Animals.Diet.Comp for relevant B.Codes
diet_data_filtered <- merged_metadata$Animals.Diet %>%
semi_join(relevant_BCodes, by = "B.Code")
# Step 3: Count unique diets and ingredients
total_diets <- diet_data_filtered %>%
filter(!is.na(A.Level.Name)) %>%
distinct(B.Code, A.Level.Name) %>%
nrow()
total_ingredients <- diet_data_filtered %>%
filter(!is.na(D.Item)) %>%
distinct(B.Code, D.Item) %>%
nrow()
# Step 4: Output
cat("Total unique diets:", total_diets, "\n")
## Total unique diets: 1317
cat("Total unique ingredients:", total_ingredients, "\n")
## Total unique ingredients: 2457
total number of codes that have ge and have both treatment and control
# Identify B.Codes that contain both control and treatment diets
b_codes_with_both_ctrl_and_trt <- GE_combined %>%
filter(!is.na(CH4_kg_year)) %>%
filter(T.Control %in% c("yes", "no")) %>% # Only if T.Control is properly labeled
group_by(B.Code) %>%
summarise(n_control_status = n_distinct(T.Control), .groups = "drop") %>%
filter(n_control_status > 1) # Keep only those with both Yes and No
# Count of such B.Codes
n_both_ctrl_and_trt <- nrow(b_codes_with_both_ctrl_and_trt)
cat("Number of B.Codes with both control and treatment diets:", n_both_ctrl_and_trt, "\n")
## Number of B.Codes with both control and treatment diets: 100
papers that have no tagg for t control
check_control<-ingredients%>%
filter(is.na(T.Control))%>%
select(B.Code,A.Level.Name,D.Item,T.Control)
filter ge combined with only papers with both control and treatments
GE_combined <- GE_combined %>%
semi_join(b_codes_with_both_ctrl_and_trt, by = "B.Code")
GE_species <- merged_metadata$`Prod.Out` %>%
filter(P.Product %in% c("Cattle", "Sheep", "Goat")) %>%
semi_join(GE_data, by = "B.Code") %>%
distinct(B.Code, P.Product) %>%
count(P.Product, name = "Studies_with_GE")
summary_table <- species_counts %>%
full_join(GE_species, by = "P.Product") %>%
mutate(across(everything(), ~replace_na(., 0)))
kable(summary_table, caption = "Summary of Data Availability for Enteric CH₄ Emissions (Tier 2)")
| P.Product | Total_Studies | Studies_with_GE |
|---|---|---|
| Cattle | 126 | 32 |
| Goat | 186 | 67 |
| Sheep | 223 | 68 |
# Count unique B.Code with non-missing CH4 emissions
n_CH4_codes <- GE_combined %>%
filter(!is.na(CH4_kg_year)) %>%
distinct(B.Code) %>%
nrow()
n_CH4_codes
## [1] 100
n_CH4_codes_by_product <- GE_combined %>%
filter(!is.na(CH4_kg_year)) %>%
distinct(B.Code, P.Product) %>%
count(P.Product, name = "n_CH4_codes")
n_CH4_codes_by_product
## P.Product n_CH4_codes
## <char> <int>
## 1: Cattle 19
## 2: Goat 44
## 3: Sheep 37
relevant_BCodes <- GE_combined %>%
distinct(B.Code)
# Step 2: Filter Animals.Diet.Comp for relevant B.Codes
diet_data_filtered <- merged_metadata$Animals.Diet %>%
semi_join(relevant_BCodes, by = "B.Code")
# Step 3: Count unique diets and ingredients
total_diets <- diet_data_filtered %>%
filter(!is.na(A.Level.Name)) %>%
distinct(B.Code, A.Level.Name) %>%
nrow()
total_ingredients <- diet_data_filtered %>%
filter(!is.na(D.Item)) %>%
distinct(B.Code, D.Item) %>%
nrow()
# Step 4: Output
cat("Total unique diets:", total_diets, "\n")
## Total unique diets: 420
cat("Total unique ingredients:", total_ingredients, "\n")
## Total unique ingredients: 827
library(dplyr)
library(stringr)
library(tidyr)
# ──────────────────────────────────────────────────────────────────────────────
# 0) Keep only ingredient GE (not entire diet) and choose ONE value per item
# using a priority order of sources you already created upstream.
# Adjust the order if you prefer a different hierarchy.
# ──────────────────────────────────────────────────────────────────────────────
ge_priority <- c(
"raw data",
"feedipedia",
"predicted_from_Weiss_GE",
"predicted_from_Weiss_DE",
"inferred_from_same_ingredient-Step2",
"inferred_from_same_ingredient",
"assumed_zero"
)
GE_ingredients_best <- Gross_energy %>%
filter( DC.Variable == "GE") %>%
mutate(
# normalize source label text
GE_source = str_squish(tolower(GE_source)),
# ensure unit is MJ/kg
DC.Unit = "MJ/kg",
# rank by source priority
src_rank = match(GE_source, ge_priority)
) %>%
arrange(B.Code, D.Item, src_rank, desc(!is.na(DC.Value))) %>%
group_by(B.Code, D.Item) %>%
# keep the single best row per (B.Code, D.Item)
slice_head(n = 1) %>%
ungroup() %>%
transmute(
B.Code,
D.Item,
GE_MJ_per_kg = as.numeric(DC.Value),
GE_source
)
# ──────────────────────────────────────────────────────────────────────────────
# 1) Ingredients only + attach GE + compute per-ingredient GE intake (MJ/day)
# NOTE: we keep your existing columns intact and just append new ones.
# ──────────────────────────────────────────────────────────────────────────────
ingredients_enriched <- ingredients %>%
# enforce "ingredients only" (i.e., exclude entire-diet pseudo-rows if present)
filter(D.Type != "Entire Diet") %>%
# join GE per (B.Code, D.Item)
left_join(GE_ingredients_best, by = c("B.Code", "D.Item")) %>%
# convert amounts to kg/day (where possible)
mutate(
D.Unit.Amount = str_squish(tolower(D.Unit.Amount)),
Amount_kg_day = case_when(
D.Unit.Amount == "kg" ~ D.Amount,
D.Unit.Amount == "g" ~ D.Amount / 1000,
TRUE ~ NA_real_ # unsupported units remain NA
),
# GE intake contributed by this ingredient (MJ/day)
GE_MJ_day = if_else(!is.na(GE_MJ_per_kg) & !is.na(Amount_kg_day),
GE_MJ_per_kg * Amount_kg_day,
NA_real_)
)
# ──────────────────────────────────────────────────────────────────────────────
# 2) (Optional) Quick diagnostics
# ──────────────────────────────────────────────────────────────────────────────
n_total_ing <- nrow(ingredients_enriched)
n_with_GE <- sum(!is.na(ingredients_enriched$GE_MJ_per_kg))
n_with_GEint <- sum(!is.na(ingredients_enriched$GE_MJ_day))
message("Ingredients total: ", n_total_ing,
" | with GE (MJ/kg): ", n_with_GE,
" | with GE intake (MJ/day): ", n_with_GEint)
## Ingredients total: 2879 | with GE (MJ/kg): 2521 | with GE intake (MJ/day): 2157
# If you want to keep only your original columns + added GE columns in a view:
ingredients_view <- ingredients_enriched %>%
select(
B.Code, A.Level.Name, D.Item, D.Type, D.Amount, D.Unit.Amount, D.Unit.Time,
D.Unit.Animals, DC.Is.Dry, D.Ad.lib, D.Item.Group, Source, Units,
T.Animals, Out.WG.Start, Out.WG.Unit, basal_intake_status,
Prediction_Source, Prediction_Source.from_AName, from_feed_intake,
T.Name, T.Control,
# New appended columns:
GE_MJ_per_kg, GE_source, Amount_kg_day, GE_MJ_day
)
| Variable | Category I (High‑starch) | Category II (High‑fat) | Category III (Protein) | Category IV (Good forage) | Category V (Poor forage) | Category VI (Mineral/Vit) |
|---|---|---|---|---|---|---|
| GE (MJ kg⁻¹ DM) | 170 – 190 | 300 – 380 | 180 – 200 | 160 – 180 | 150 – 170 | — |
| Ash (g kg⁻¹ DM) | 20 – 40 | 0 – 10 | 50 – 80 | 70 – 100 | 60 – 100 | ≥ 500 |
| CP (g kg⁻¹ DM) | 80 –120 | 0 – 50 | 350 – 500 | 120 – 200 | 30 – 80 | ≈ 0 |
| EE (g kg⁻¹ DM) | 20 – 50 | 800 – 990 | 20 – 60 | 10 – 30 | 0 – 20 | < 1 |
| NDF (g kg⁻¹ DM) | 100 – 200 | 0 – 50 | 100 – 300 | 400 – 500 | 600 – 750 | — |
| Method | R function | Core concept | Variables needed | One‑sentence rule |
|---|---|---|---|---|
| 1 | method1() |
Exact‑window filter | GE, Ash, CP, EE, NDF (all present) |
Assign to the first category in which every variable falls inside its strict baseline window. |
| 2 | method2() |
Exact ± 10 % filter | Same five | Same as M1 but each numeric bound is widened ± 10 %. |
| 3 | method3() |
Priority‑variable filter (strict) | Only the priority variables per
category: • I → NDF • II → EE • III–V → CP & NDF • VI → Ash |
If all priority variables exist and meet their strict windows, assign that category; missing non‑priority data are ignored. |
| 4 | method4() |
Priority‑variable ± 10 % | Same | Priority windows widened ± 10 %; non‑priority values (if present) still use strict windows and can only disqualify. |
| 5 | method5() |
Hierarchical decision tree (strict) | Any subset of GE, Ash, CP, EE, NDF | Gates tested in order:EE ≥ 80 → II → else
Ash ≥ 50 → VI → else
CP ≥ 35 & NDF ≤ 30 → III → else
NDF ≤ 20 → I → else NDF ≥ 60 → V → else
40 ≤ NDF ≤ 50 → IV → else unclassified. |
| 6 | method6() |
Hierarchical tree ± 10 % | Same | Same gates as M5 but every numeric constant is multiplied by 0.9 (lower) and 1.1 (upper) before testing. |
| 7 | method7() |
Hierarchical ± 10 % plus text overrides | Same numerics + D.Item,
D.Type |
Start with Method 6 result, then override: • oil|tallow|fat|grease in D.Item →
II• protein‑meal regex (distillers grain, soybean meal, …) → III • D.Type ∈ “Supplement”
or matches “supplement” → VI |
All concentration variables are in g·kg⁻¹ DM except GE (MJ·kg⁻¹ DM).
Ing_classif <- merged_metadata$Animals.Diet.Comp
diets <- ingredients
Ing_classif <- Ing_classif %>%
filter(!D.Item %in% c("Salt", "Water"))
# mapping: variant (keep name) → donor (copy nutrients from here)
nutrient_donor_map <- tribble(
~variant, ~donor,
"Leucaena leucocephala Dried Ground", "Leucaena leucocephala Leaves Ground",
"Sheanut Cake Boiled Dried", "Sheanut Cake Ground",
"maize straw dried ensiled ground urea treated", "maize straw ensiled urea treated",
"leucaena leucocephala leaves dried", "leucaena leucocephala leaves ground",
"gliciridia sepium ground", "gliciridia sepium leaves dried ground",
"sunflower", "sunflower seed",
"moringa stenopetala leaves dried", "moringa stenopetala dried",
"sesbania sesban leaves dried", "sesbania sesban leaves and twigs dried",
"urochloa hybrid mulato ii dried", "urochloa mosambicensis dried",
"gliciridia sepium leaves ground", "gliciridia sepium leaves dried ground",
"leucaena leucocephala leaves dried ground", "leucaena leucocephala leaves ground",
"wheat straw effective microbes", "wheat straw",
"maize & wheat offal","maize offal",
"leucaena leucocephala dried","leucaena leucocephala leaves ground"
)
# Normalize to lowercase for matching
nutrient_donor_map <- nutrient_donor_map %>%
mutate(across(everything(), tolower))
Ing_classif <- Ing_classif %>%
mutate(D.Item_lc = tolower(D.Item))
# --- Step 1: extract donor profiles
donor_profiles <- Ing_classif %>%
mutate(D.Item_lc = tolower(D.Item)) %>%
filter(D.Item_lc %in% nutrient_donor_map$donor)
# --- Step 2: build replacement rows
replacement_rows <- nutrient_donor_map %>%
rename(Donor_lc = donor, Variant_lc = variant) %>%
inner_join(donor_profiles, by = c("Donor_lc" = "D.Item_lc")) %>%
mutate(
D.Item_lc = Variant_lc, # replace with variant name
D.Item = str_to_title(Variant_lc) # keep nicer capitalization
)
# --- Step 3: remove old variant rows and add replacements
Ing_classif <- Ing_classif %>%
filter(!D.Item_lc %in% nutrient_donor_map$variant) %>% # drop old variants
bind_rows(replacement_rows %>% select(names(Ing_classif))) # add clones
# --- Helpers ---------------------------------------------------------------
clean_key <- function(x) x %>% str_trim() %>% str_squish() %>% str_to_lower()
# Canonicalize names so duplicates collapse:
# "unspecified molasses" → "molasses"
# any "bran" → "offal"
canonicalize_item <- function(x) {
clean_key(x) %>%
str_replace("^unspecified\\s+molasses$", "molasses") %>%
str_replace_all("\\bbran\\b", "offal")
}
# Robust logical checks for columns stored as logical or "TRUE"/"FALSE"
is_false_or_na <- function(x) is.na(x) | x == FALSE | x == "FALSE"
is_true <- function(x) !is.na(x) & (x == TRUE | x == "TRUE")
# Global D.Item → D.Type (built on canonical names)
type_lookup <- diets %>%
transmute(
D.Item = canonicalize_item(D.Item),
D.Type = str_trim(D.Type)
) %>%
distinct(D.Item, .keep_all = TRUE)
# Canonicalize Ing_classif once
Ing_base <- Ing_classif %>%
mutate(
B.Code = str_trim(B.Code),
D.Item_raw = D.Item,
D.Item = canonicalize_item(D.Item)
)
# Build (pivot-wide + attach D.Type; optional back-fill from global item means)
build_nutr_table <- function(df_long, type_lookup, backfill_by_item = TRUE) {
target_vars <- c("CP","EE","NDF","Ash","GE")
nutr <- df_long %>%
filter(DC.Variable %in% target_vars) %>%
mutate(DC.Value = suppressWarnings(as.numeric(DC.Value))) %>%
pivot_wider(
id_cols = c(B.Code, D.Item),
names_from = DC.Variable,
values_from = DC.Value,
values_fn = ~ mean(.x, na.rm = TRUE),
values_fill = NA
) %>%
left_join(type_lookup, by = "D.Item")
if (backfill_by_item) {
global_means <- df_long %>%
filter(DC.Variable %in% target_vars) %>%
mutate(DC.Value = suppressWarnings(as.numeric(DC.Value))) %>%
group_by(D.Item, DC.Variable) %>%
summarise(Value = mean(DC.Value, na.rm = TRUE), .groups = "drop") %>%
pivot_wider(names_from = DC.Variable, values_from = Value)
nutr <- nutr %>%
left_join(global_means, by = "D.Item", suffix = c("", ".global")) %>%
mutate(
GE = coalesce(GE, GE.global),
Ash = coalesce(Ash, Ash.global),
CP = coalesce(CP, CP.global),
EE = coalesce(EE, EE.global),
NDF = coalesce(NDF, NDF.global)
) %>%
select(-ends_with(".global"))
}
nutr
}
# Ingredients subset (non-entire, non-group) — WITH back-fill
Ing_ingredients_long <- Ing_base %>%
filter(is_false_or_na(is_entire_diet)) %>%
filter(is_false_or_na(is_group))
nutr_ing <- build_nutr_table(Ing_ingredients_long, type_lookup, backfill_by_item = TRUE) %>%
mutate(scope = "ingredient") %>%
relocate(scope, .before = B.Code)
# Diets subset (entire diets, non-group) — NO back-fill
Ing_diets_long <- Ing_base %>%
filter(is_true(is_entire_diet)) %>%
filter(is_false_or_na(is_group))
nutr_diet <- build_nutr_table(Ing_diets_long, type_lookup, backfill_by_item = FALSE) %>%
mutate(scope = "diet") %>%
relocate(scope, .before = B.Code)
# Combined (optional)
nutr_all <- bind_rows(nutr_ing, nutr_diet)
# Diagnostics: items without a mapped D.Type
unmatched_ing <- anti_join(nutr_ing, type_lookup, by = "D.Item")
unmatched_diet <- anti_join(nutr_diet, type_lookup, by = "D.Item")
# ── Canonical mean per ingredient (collapse to one row per D.Item) ──────────
canon_ing <- nutr_ing %>%
group_by(D.Item) %>%
summarise(
GE = mean(GE, na.rm = TRUE),
Ash = mean(Ash, na.rm = TRUE),
CP = mean(CP, na.rm = TRUE),
EE = mean(EE, na.rm = TRUE),
NDF = mean(NDF, na.rm = TRUE),
D.Type = first(D.Type),
.groups = "drop"
)
# Numeric windows (g/kg DM; GE in MJ/kg DM)
win <- list(
GE = list(I=c(170,190), II=c(300,380), III=c(180,200), IV=c(160,180), V=c(150,170)),
Ash = list(I=c(20,40), II=c(0,10), III=c(50,80), IV=c(70,100), V=c(60,100), VI=c(500,1000)),
CP = list(I=c(80,120), II=c(0,50), III=c(350,500), IV=c(120,200), V=c(30,80)),
EE = list(I=c(20,50), II=c(80,990), III=c(20,60), IV=c(10,30), V=c(0,20)),
NDF = list(I=c(100,200), II=c(0,50), III=c(100,300), IV=c(400,500), V=c(600,750))
)
expand10 <- function(x) c(x[1]*0.9, x[2]*1.1)
inside <- function(x, rng) !is.na(x) & x >= rng[1] & x <= rng[2]
# Priority variables
priority <- list(
I = c("NDF"),
II = c("EE"),
III= c("CP","NDF"),
IV = c("NDF","CP"),
V = c("NDF","CP"),
VI = c("Ash")
)
# M1 strict all-five
method1 <- function(df, w = win){
df %>% mutate(
M1_Category = case_when(
inside(GE,w$GE$I) & inside(Ash,w$Ash$I) & inside(CP,w$CP$I) & inside(EE,w$EE$I) & inside(NDF,w$NDF$I) ~ "I",
inside(GE,w$GE$II) & inside(Ash,w$Ash$II) & inside(CP,w$CP$II) & inside(EE,w$EE$II) & inside(NDF,w$NDF$II) ~ "II",
inside(GE,w$GE$III) & inside(Ash,w$Ash$III)& inside(CP,w$CP$III) & inside(EE,w$EE$III) & inside(NDF,w$NDF$III) ~ "III",
inside(GE,w$GE$IV) & inside(Ash,w$Ash$IV) & inside(CP,w$CP$IV) & inside(EE,w$EE$IV) & inside(NDF,w$NDF$IV) ~ "IV",
inside(GE,w$GE$V) & inside(Ash,w$Ash$V) & inside(CP,w$CP$V) & inside(EE,w$EE$V) & inside(NDF,w$NDF$V) ~ "V",
inside(Ash,w$Ash$VI) ~ "VI",
TRUE ~ NA_character_
)
)
}
# M2 ±10% all-five
method2 <- function(df){
w10 <- map(win, ~ map(.x, expand10))
method1(df, w10) %>% rename(M2_Category = M1_Category)
}
# Priority helper
method_priority <- function(df, w, out_name){
df %>% rowwise() %>% mutate(
"{out_name}" := {
dat <- c(CP=CP, EE=EE, NDF=NDF, Ash=Ash, GE=GE)
cat <- NA_character_
for (cl in names(priority)) {
req <- priority[[cl]]
if (all(!is.na(dat[req])) &&
all(map2_lgl(dat[req], req, \(v,r) inside(v, w[[r]][[cl]])))) {
cat <- cl; break
}
}
cat
}
) %>% ungroup()
}
method3 <- function(df) method_priority(df, win, "M3_Category")
method4 <- function(df){ w10 <- map(win, ~ map(.x, expand10)); method_priority(df, w10, "M4_Category") }
# Decision tree
tree_decide <- function(df, w){
df %>% mutate(
Category = case_when(
EE >= w$EE$II[1] ~ "II",
Ash >= w$Ash$VI[1] ~ "VI",
CP >= w$CP$III[1] & NDF <= w$NDF$III[2] ~ "III",
NDF <= w$NDF$I[2] ~ "I",
NDF >= w$NDF$V[1] ~ "V",
NDF >= w$NDF$IV[1] & NDF <= w$NDF$IV[2] ~ "IV",
TRUE ~ NA_character_
)
)
}
method5 <- function(df) tree_decide(df, win) %>% rename(M5_Category = Category)
method6 <- function(df){ w10 <- map(win, ~ map(.x, expand10)); tree_decide(df, w10) %>% rename(M6_Category = Category) }
# Method 7: start from M6 and apply name/type overrides
method7 <- function(df){
out <- if ("M6_Category" %in% names(df)) df else method6(df)
out$M7_Category <- out$M6_Category
# normalize item names
item_norm <- canonicalize_item(out$D.Item)
# --- Category II: fats/oils ---
oil_pat <- "\\b(oil|tallow|grease|lard)\\b"
is_oil <- str_detect(item_norm, regex(oil_pat, ignore_case = TRUE))
out$M7_Category[is_oil] <- "II"
# --- Category III: protein sources ---
protein_pat <- "\\b(distillers?.*grain|soybean ground|canola meal|fish ground|blood ground)\\b"
is_pmeal <- str_detect(item_norm, regex(protein_pat, ignore_case = TRUE))
is_fish <- str_detect(item_norm, regex("\\bfish\\b", ignore_case = TRUE)) &
!str_detect(item_norm, regex("\\bfish\\s*oil\\b", ignore_case = TRUE))
is_animprot <- str_detect(item_norm, regex("\\b(feather|larvae|snail|termite|shrimp|hydrolysate)\\b",
ignore_case = TRUE))
out$M7_Category[is_pmeal | is_fish | is_animprot] <- "III"
# --- Category I: byproducts (bread waste only) ---
is_bread <- str_detect(item_norm, regex("\\bbread waste\\b", ignore_case = TRUE))
out$M7_Category[is_bread] <- "I"
# --- Category VI: supplements / additives ---
sup_types <- c("Supplement", "Non-ERA Feed")
vi_type_hit <- !is.na(out$D.Type) &
(out$D.Type %in% sup_types |
str_detect(out$D.Type, regex("supplement", ignore_case = TRUE)))
is_shell <- str_detect(item_norm, regex("\\boyster\\s*shells?\\b", ignore_case = TRUE))
is_cellulose <- str_detect(
item_norm,
regex("\\b(cellulose|micro\\s*-?crystalline\\s+cellulose|carboxy\\s*methyl\\s*cellulose|\\bMCC\\b|\\bCMC\\b)\\b",
ignore_case = TRUE)
)
is_nonfeed <- str_detect(item_norm,
regex("\\b(cement|polyethylene|humate|probiotic|solution|mixture|axtra xap)\\b",
ignore_case = TRUE))
out$M7_Category[vi_type_hit | is_shell | is_cellulose | is_nonfeed] <- "VI"
out
}
# Run all methods so we can inspect coverage; M7 uses M6 base + overrides
classif_ing <- canon_ing %>% # ← use canonical means for INGREDIENTS
method1() %>% method2() %>% method3() %>% method4() %>%
method5() %>% method6() %>% method7()
classif_diet <- nutr_diet %>% # ← keep DIETS per-row (no mean collapse)
method1() %>% method2() %>% method3() %>% method4() %>%
method5() %>% method6() %>% method7()
# Helper to build coverage/counts/samples/unclassified per table
build_reports <- function(df){
method_cols <- grep("^M[1-7]_Category$", names(df), value = TRUE)
coverage_tbl <- purrr::map_dfr(method_cols, function(mc){
tibble(
Method = sub("_Category$", "", mc),
Total = nrow(df),
Assigned = sum(!is.na(df[[mc]])),
Coverage = round(Assigned / Total * 100, 1)
)
})
counts_tbl <- df %>%
pivot_longer(all_of(method_cols), names_to = "Method", values_to = "Category") %>%
filter(!is.na(Category)) %>%
mutate(Method = sub("_Category$", "", Method)) %>%
count(Method, Category, name = "n") %>%
arrange(Method, desc(n))
set.seed(123)
sample_tbl <- df %>%
pivot_longer(all_of(method_cols), names_to = "Method", values_to = "Category") %>%
filter(!is.na(Category)) %>%
mutate(Method = sub("_Category$", "", Method)) %>%
group_by(Method, Category) %>%
# take distinct items per (Method,Category), sample up to 10 per group
group_modify(function(.x, .y) {
.x %>%
distinct(D.Item, .keep_all = TRUE) %>%
slice_sample(n = min(10, nrow(.)))
}) %>%
# still grouped by Method,Category here → summarise keeps those cols
summarise(Examples = paste(D.Item, collapse = "; "), .groups = "drop") %>%
arrange(Method, Category)
unclassified_tbl <- df %>%
pivot_longer(all_of(method_cols), names_to = "Method", values_to = "Category") %>%
filter(is.na(Category)) %>%
mutate(Method = sub("_Category$", "", Method)) %>%
count(Method, D.Item, sort = TRUE) %>%
group_by(Method) %>%
slice_max(n, n = 10, with_ties = FALSE) %>%
ungroup()
list(coverage = coverage_tbl, counts = counts_tbl,
samples = sample_tbl, unclassified = unclassified_tbl)
}
rep_ing <- build_reports(classif_ing)
rep_diet <- build_reports(classif_diet)
# Display (ingredients)
datatable(classif_ing %>% arrange(M7_Category),
caption = "INGREDIENTS: Classified rows (sorted by M7)",
rownames = FALSE, options = list(pageLength = 30, dom = "tip"))
datatable(rep_ing$coverage, caption = "INGREDIENTS: Coverage by method", options = list(pageLength = 10, dom = "tip"))
datatable(rep_ing$counts, caption = "INGREDIENTS: Counts per category", options = list(pageLength = 25, dom = "tip"))
datatable(rep_ing$samples, caption = "INGREDIENTS: Random examples", options = list(pageLength = 25, dom = "tip"))
datatable(rep_ing$unclassified,caption = "INGREDIENTS: Top-10 unclassified", options = list(pageLength = 25, dom = "tip"))
# Display (diets)
datatable(classif_diet %>% arrange(M7_Category),
caption = "DIETS: Classified rows (sorted by M7)",
rownames = FALSE, options = list(pageLength = 30, dom = "tip"))
datatable(rep_diet$coverage, caption = "DIETS: Coverage by method", options = list(pageLength = 10, dom = "tip"))
datatable(rep_diet$counts, caption = "DIETS: Counts per category", options = list(pageLength = 25, dom = "tip"))
datatable(rep_diet$samples, caption = "DIETS: Random examples", options = list(pageLength = 25, dom = "tip"))
datatable(rep_diet$unclassified,caption = "DIETS: Top-10 unclassified", options = list(pageLength = 25, dom = "tip"))
# Canonicalize D.Item and A.Level.Name in ingredients before joining
ingredients_joinable <- ingredients %>%
mutate(
D.Item = canonicalize_item(D.Item),
A.Level.Name = canonicalize_item(A.Level.Name)
)
# Simplify classification tables (already canonicalized)
classif_ing_simple <- classif_ing %>%
select(D.Item, Ingredient_Category = M7_Category)
classif_diet_simple <- classif_diet %>%
rename(A.Level.Name = D.Item) %>%
select(A.Level.Name, Diet_Category = M7_Category)
# Join the classifications
ingredients<- ingredients_joinable %>%
left_join(classif_ing_simple, by = "D.Item")%>%
distinct()
# --- Final polish: map Roman numerals -> full names in ingredients ---
roman_to_name <- function(x) dplyr::recode(
x,
"I" = "High-starch",
"II" = "High-fat",
"III" = "Protein",
"IV" = "Good forage",
"V" = "Poor forage",
"VI" = "Mineral/Vitamin",
.default = x,
.missing = NA_character_
)
# Update the ingredient database
ingredients <- ingredients %>%
dplyr::mutate(
Ingredient_Category = roman_to_name(Ingredient_Category))
# If you also keep a diet-level tag, uncomment:
# , Diet_Category = roman_to_name(Diet_Category)
# ----------------------------
# Completeness within selected papers
# ----------------------------
# Selected papers (already built above)
selected_bcodes <- GE_data # a data.frame with B.Code from your earlier step
# Ensure per-row INGREDIENT classifications exist (ingredients, by B.Code)
classif_ing_rows <- nutr_ing %>%
method1() %>% method2() %>% method3() %>% method4() %>%
method5() %>% method6() %>% method7()
# Choose which method’s category to test for completeness
method_col <- "M7_Category"
# --- Diet completeness (all diet rows classified for a paper) ---
diet_status <- classif_diet %>%
semi_join(selected_bcodes, by = "B.Code") %>%
group_by(B.Code) %>%
summarise(
diet_n = n(),
diet_n_classified = sum(!is.na(.data[[method_col]])),
diets_all_classified = diet_n > 0 & (diet_n_classified == diet_n),
.groups = "drop"
)
# --- Ingredient completeness (all ingredient rows classified for a paper) ---
ing_status <- classif_ing_rows %>%
semi_join(selected_bcodes, by = "B.Code") %>%
group_by(B.Code) %>%
summarise(
ing_n = n(),
ing_n_classified = sum(!is.na(.data[[method_col]])),
ingredients_all_classified = ing_n > 0 & (ing_n_classified == ing_n),
.groups = "drop"
)
# Combine statuses; treat missing as FALSE (paper may have diets but no ingredients, or vice versa)
paper_status <- full_join(diet_status, ing_status, by = "B.Code") %>%
mutate(
diets_all_classified = replace_na(diets_all_classified, FALSE),
ingredients_all_classified = replace_na(ingredients_all_classified, FALSE),
either_complete = diets_all_classified | ingredients_all_classified
)
# Attach species for those selected papers (Cattle/Sheep/Goat)
paper_status <- paper_status %>%
left_join(
merged_metadata$`Prod.Out` %>%
filter(P.Product %in% c("Cattle", "Sheep", "Goat")) %>%
distinct(B.Code, P.Product),
by = "B.Code"
)
# ---------- Totals ----------
n_papers_diets_all <- sum(paper_status$diets_all_classified)
n_papers_ingredients_all <- sum(paper_status$ingredients_all_classified)
n_papers_either <- sum(paper_status$either_complete)
n_papers_diets_all
## [1] 61
n_papers_ingredients_all
## [1] 49
n_papers_either
## [1] 98
# ---------- By species ----------
summary_by_product <- paper_status %>%
group_by(P.Product) %>%
summarise(
Papers_total = n_distinct(B.Code),
Papers_diets_all = sum(diets_all_classified),
Papers_ingredients_all = sum(ingredients_all_classified),
Papers_either = sum(either_complete),
.groups = "drop"
)
# View as a table
kable(summary_by_product, caption = "Papers with complete diet/ingredient classification (within selected papers)")
| P.Product | Papers_total | Papers_diets_all | Papers_ingredients_all | Papers_either |
|---|---|---|---|---|
| Cattle | 32 | 15 | 10 | 21 |
| Goat | 67 | 23 | 15 | 35 |
| Sheep | 68 | 23 | 24 | 42 |
ingredients <-ingredients%>%
filter(D.Item!="salt")
unmatched_ing_m7 <- ingredients %>%
filter(is.na(Ingredient_Category)) %>%
distinct(D.Item) %>%
left_join(canon_ing, by = "D.Item") %>%
arrange(D.Item)
# ---------- Windows & priority (your spec) ----------
win <- list(
GE = list(I=c(170,190), II=c(300,380), III=c(180,200), IV=c(160,180), V=c(150,170)),
Ash = list(I=c(20,40), II=c(0,10), III=c(50,80), IV=c(70,100), V=c(60,100), VI=c(500,1000)),
CP = list(I=c(80,120), II=c(0,50), III=c(350,500), IV=c(120,200), V=c(30,80)),
EE = list(I=c(20,50), II=c(80,990), III=c(20,60), IV=c(10,30), V=c(0,20)),
NDF = list(I=c(100,200), II=c(0,50), III=c(100,300), IV=c(400,500), V=c(600,750))
)
priority <- list(
I = c("NDF","CP"),
II = c("EE"),
III= c("CP","NDF"),
IV = c("NDF","CP"),
V = c("NDF","CP"),
VI = c("Ash")
)
expand10 <- function(x) c(x[1]*0.9, x[2]*1.1)
mid <- function(r) mean(r, na.rm = TRUE)
w10 <- map(win, ~ map(.x, expand10)) # ±10% windows (Method 6 spirit)
# ---------- 1) Build the working table from *your* dataset ----------
# Expecting unmatched_ing_m7 to already contain D.Item (+ optionally D.Type) and nutrients
unmatched <- unmatched_ing_m7 %>%
select(any_of(c("D.Item","D.Type","GE","Ash","CP","EE","NDF"))) %>%
group_by(across(any_of(c("D.Item","D.Type")))) %>%
summarise(across(c(GE, Ash, CP, EE, NDF),
~ suppressWarnings(mean(as.numeric(.), na.rm = TRUE))),
.groups = "drop") %>%
mutate(across(c(GE, Ash, CP, EE, NDF), ~ ifelse(is.nan(.), NA_real_, .))) %>%
# drop rows where ALL five nutrients are NA
filter(!(is.na(GE) & is.na(Ash) & is.na(CP) & is.na(EE) & is.na(NDF))) %>%
mutate(
n_values = rowSums(!is.na(across(c(GE, Ash, CP, EE, NDF)))),
Missing = pmap_chr(select(., GE, Ash, CP, EE, NDF), ~ {
nm <- c("GE","Ash","CP","EE","NDF"); vv <- c(...); miss <- nm[is.na(vv)]
if (length(miss)==0) "—" else paste(miss, collapse = ", ")
})
)
# ---------- 2) Priority-based score with HARD GATES ----------
# Standardized violation (0 if inside; >0 if outside by fraction of window width)
std_violation <- function(x, rng) {
if (is.na(x) || any(is.na(rng))) return(NA_real_)
L <- rng[1]; U <- rng[2]; W <- U - L; if (W <= 0) return(NA_real_)
if (x < L) (L - x)/W else if (x > U) (x - U)/W else 0
}
cats <- names(priority)
missing_penalty <- 0.75 # penalty if a priority var is missing
scores <- matrix(NA_real_, nrow = nrow(unmatched), ncol = length(cats))
colnames(scores) <- cats
# Hard gates (fixes the “all go to II” issue):
gate_ii_lb <- w10$EE$II[1] # 72 (0.9 * 80)
gate_vi_lb <- w10$Ash$VI[1] # 450 (0.9 * 500)
for (j in seq_along(cats)) {
cat <- cats[j]
req <- priority[[cat]]
# Per-variable violations for this category
vlist <- lapply(req, function(v) {
x <- unmatched[[v]] %>% as.numeric()
rng <- w10[[v]][[cat]]
sapply(x, std_violation, rng = rng)
})
vmat <- do.call(cbind, vlist)
# Replace NA violations (missing values) with bounded penalty
vmat[is.na(vmat)] <- missing_penalty
# Mean violation across this category’s priority vars
s <- rowMeans(vmat)
# Apply gates:
if (cat == "II") {
ee <- unmatched$EE
s[is.na(ee) | ee < gate_ii_lb] <- Inf
}
if (cat == "VI") {
ash <- unmatched$Ash
s[is.na(ash) | ash < gate_vi_lb] <- Inf
}
scores[, j] <- s
}
best_idx <- apply(scores, 1, which.min)
second_idx <- apply(scores, 1, function(v) order(v, na.last = TRUE)[2])
Assigned_Category <- cats[best_idx]
Distance_Closest <- scores[cbind(seq_len(nrow(scores)), best_idx)]
Second_Best_Category <- cats[second_idx]
Distance_Second <- scores[cbind(seq_len(nrow(scores)), second_idx)]
# How many priority vars actually present for the chosen cat?
n_used_priority <- map2_int(seq_len(nrow(unmatched)), best_idx, function(i, j) {
req <- priority[[cats[j]]]
sum(!is.na(unmatched[i, req]))
})
Priority_vars_used <- map2_chr(seq_len(nrow(unmatched)), best_idx, function(i, j) {
req <- priority[[cats[j]]]; have <- req[!is.na(unmatched[i, req])]
if (length(have)==0) "—" else paste(have, collapse = ", ")
})
Priority_vars_missing <- map2_chr(seq_len(nrow(unmatched)), best_idx, function(i, j) {
req <- priority[[cats[j]]]; miss <- req[is.na(unmatched[i, req])]
if (length(miss)==0) "—" else paste(miss, collapse = ", ")
})
results <- unmatched %>%
mutate(
Assigned_Category = Assigned_Category,
Distance_Closest = round(Distance_Closest, 3),
Second_Best_Category = Second_Best_Category,
Distance_Second = round(Distance_Second, 3),
n_used_priority = n_used_priority,
Priority_vars_used = Priority_vars_used,
Priority_vars_missing= Priority_vars_missing
) %>%
arrange(Assigned_Category, Distance_Closest, desc(n_values), D.Item)
DT::datatable(
results %>% select(D.Item, any_of("D.Type"), GE, Ash, CP, EE, NDF,
n_values, Missing,
Assigned_Category, Distance_Closest,
Second_Best_Category, Distance_Second,
n_used_priority, Priority_vars_used, Priority_vars_missing),
caption = "M7-unclassified (from unmatched_ing_m7) → priority-window assignment (±10%), with gates for II/VI",
options = list(pageLength = 50, dom = "tip"),
rownames = FALSE
)
# ---------- 3) Quick plots (centroids midpoints + item segments) ----------
centroids_full <- tibble(
Category = c("I","II","III","IV","V","VI"),
GE = c(mid(win$GE$I), mid(win$GE$II), mid(win$GE$III), mid(win$GE$IV), mid(win$GE$V), NA_real_),
Ash = c(mid(win$Ash$I), mid(win$Ash$II), mid(win$Ash$III), mid(win$Ash$IV), mid(win$Ash$V), mid(win$Ash$VI)),
CP = c(mid(win$CP$I), mid(win$CP$II), mid(win$CP$III), mid(win$CP$IV), mid(win$CP$V), NA_real_),
EE = c(mid(win$EE$I), mid(win$EE$II), mid(win$EE$III), mid(win$EE$IV), mid(win$EE$V), NA_real_),
NDF = c(mid(win$NDF$I), mid(win$NDF$II), mid(win$NDF$III), mid(win$NDF$IV), mid(win$NDF$V), NA_real_)
)
plot_pair <- function(items_df, cents_df, xvar, yvar) {
items_xy <- items_df %>% filter(!is.na(.data[[xvar]]), !is.na(.data[[yvar]]))
cents_xy <- cents_df %>% filter(!is.na(.data[[xvar]]), !is.na(.data[[yvar]])) %>%
transmute(Category, cx = .data[[xvar]], cy = .data[[yvar]])
items_xy <- items_xy %>% left_join(cents_xy, by = c("Assigned_Category" = "Category"))
ggplot() +
geom_point(data = items_xy,
aes(x = .data[[xvar]], y = .data[[yvar]], color = Assigned_Category),
size = 2, alpha = 0.9) +
geom_segment(data = items_xy %>% filter(!is.na(cx), !is.na(cy)),
aes(x = .data[[xvar]], y = .data[[yvar]], xend = cx, yend = cy, color = Assigned_Category),
alpha = 0.4, linewidth = 0.6) +
geom_point(data = cents_xy, aes(x = cx, y = cy), shape = 4, size = 4, stroke = 1.1) +
geom_text(data = cents_xy, aes(x = cx, y = cy, label = Category), vjust = -1, fontface = "bold") +
labs(x = xvar, y = yvar, color = "Assigned",
title = paste(xvar, "vs", yvar, "— items and category centroids")) +
theme_minimal()
}
print(plot_pair(results, centroids_full, "CP", "NDF"))
print(plot_pair(results, centroids_full, "EE", "NDF"))
# ---- Assign the first (best) category back to `ingredients` ----
# Roman → full name map (keep if you already have it)
if (!exists("roman_to_name")) {
roman_to_name <- function(x) dplyr::recode(
x,
"I" = "High-starch",
"II" = "High-fat",
"III" = "Protein",
"IV" = "Good forage",
"V" = "Poor forage",
"VI" = "Mineral/Vitamin",
.default = x, .missing = NA_character_
)
}
# Join keys (prefer D.Item + D.Type if you have both)
join_keys <- intersect(c("D.Item","D.Type"), names(ingredients))
# Make the assignment table unique on the join keys (prevents many-to-many)
assign_tbl <- results %>%
mutate(Assigned_Category_Name = roman_to_name(Assigned_Category)) %>%
select(any_of(c("D.Item","D.Type")), Assigned_Category, Assigned_Category_Name) %>%
group_by(across(all_of(join_keys))) %>%
summarise(
Assigned_Category = dplyr::first(Assigned_Category),
Assigned_Category_Name = dplyr::first(Assigned_Category_Name),
.groups = "drop"
)
# Keep a backup of existing labels the first time you run this
if (!"Ingredient_Category_orig" %in% names(ingredients)) {
ingredients <- ingredients %>%
mutate(Ingredient_Category_orig = Ingredient_Category)
}
# Ensure Category_Source exists (so mutate below can reference it)
if (!"Category_Source" %in% names(ingredients)) {
ingredients <- ingredients %>% mutate(Category_Source = NA_character_)
}
# Join and fill ONLY where Ingredient_Category is NA
ingredients <- ingredients %>%
# dplyr >= 1.1: relationship arg suppresses warning if truly many-to-one
left_join(assign_tbl, by = join_keys, relationship = "many-to-one") %>%
mutate(
Ingredient_Category = coalesce(Ingredient_Category, Assigned_Category_Name),
Category_Source = ifelse(
is.na(Ingredient_Category_orig) & !is.na(Assigned_Category_Name),
"Nearest-window (priority, ±10%, gated II/VI)",
Category_Source
)
) %>%
select(-Assigned_Category, -Assigned_Category_Name)
# Quick sanity check
filled_n <- sum(is.na(ingredients$Ingredient_Category_orig) & !is.na(ingredients$Ingredient_Category))
message("Filled missing Ingredient_Category for ", filled_n, " ingredient rows.")
## Filled missing Ingredient_Category for 95 ingredient rows.
ingredients <-ingredients%>%
filter(D.Item!="salt")
unmatched_ing_m7 <- ingredients %>%
filter(is.na(Ingredient_Category)) %>%
distinct(D.Item) %>%
left_join(canon_ing, by = "D.Item") %>%
arrange(D.Item)
# 1) Per-diet completeness: all ingredients classified?
diet_class_status <- ingredients %>%
semi_join(GE_data, by = "B.Code") %>%
filter(!is.na(A.Level.Name), !is.na(D.Item)) %>%
# (Optionally exclude entire-diet placeholder rows if present)
filter(is.na(D.Type) | D.Type != "Entire Diet") %>%
group_by(B.Code, A.Level.Name) %>%
summarise(
n_ing = n(),
n_classified = sum(!is.na(Ingredient_Category)),
all_classified = n_ing > 0 & (n_ing == n_classified),
.groups = "drop"
)
# 2) Papers with at least TWO diets fully classified
papers_2plus <- diet_class_status %>%
group_by(B.Code) %>%
summarise(n_complete_diets = sum(all_classified), .groups = "drop") %>%
filter(n_complete_diets >= 2)
# 3) Attach product and count per product (Cattle / Sheep / Goat)
papers_2plus_by_product <- papers_2plus %>%
left_join(
merged_metadata$`Prod.Out` %>%
filter(P.Product %in% c("Cattle", "Sheep", "Goat")) %>%
distinct(B.Code, P.Product),
by = "B.Code"
) %>%
count(P.Product, name = "Papers_with_≥2_fully_classified_diets")
# Show the summary table
knitr::kable(
papers_2plus_by_product %>% arrange(P.Product),
caption = "Papers per product with ≥2 diets where all ingredients are classified"
)
| P.Product | Papers_with_≥2_fully_classified_diets |
|---|---|
| Cattle | 15 |
| Goat | 19 |
| Sheep | 42 |
# (Optional) Show the actual B.Codes and how many complete diets each has
knitr::kable(
papers_2plus %>%
left_join(
merged_metadata$`Prod.Out` %>%
filter(P.Product %in% c("Cattle", "Sheep", "Goat")) %>%
distinct(B.Code, P.Product),
by = "B.Code"
) %>%
arrange(P.Product, desc(n_complete_diets), B.Code),
caption = "B.Codes with number of fully classified diets (within GE papers)"
)
| B.Code | n_complete_diets | P.Product |
|---|---|---|
| JS0215.2 | 5 | Cattle |
| EM1050 | 4 | Cattle |
| JS0324.1 | 4 | Cattle |
| JS0324.2 | 4 | Cattle |
| NN0207.2 | 4 | Cattle |
| NN0299 | 4 | Cattle |
| BO1033 | 3 | Cattle |
| EM1093 | 3 | Cattle |
| JS0205 | 3 | Cattle |
| NJ0025 | 3 | Cattle |
| CJ1003 | 2 | Cattle |
| HK0086 | 2 | Cattle |
| JS0215.1 | 2 | Cattle |
| LM0101 | 2 | Cattle |
| NN0280.2 | 2 | Cattle |
| JS0208 | 7 | Goat |
| LM0248 | 7 | Goat |
| NN0375 | 6 | Goat |
| BO1015 | 5 | Goat |
| HK0107 | 5 | Goat |
| HK0171 | 5 | Goat |
| BO1076 | 4 | Goat |
| JS0192 | 4 | Goat |
| JS0196 | 4 | Goat |
| JS0197 | 4 | Goat |
| BO1098 | 3 | Goat |
| HK0182 | 3 | Goat |
| HK0350 | 3 | Goat |
| JS0193 | 3 | Goat |
| JS0261 | 3 | Goat |
| LM0280 | 3 | Goat |
| NN0374 | 3 | Goat |
| EO0057 | 2 | Goat |
| NN0092.2 | 2 | Goat |
| JS0291 | 16 | Sheep |
| AN0007 | 5 | Sheep |
| BO1025 | 5 | Sheep |
| BO1051 | 5 | Sheep |
| HK0033 | 5 | Sheep |
| HK0082 | 5 | Sheep |
| JS0301 | 5 | Sheep |
| NJ0008 | 5 | Sheep |
| 0 | 4 | Sheep |
| BO1034 | 4 | Sheep |
| BO1048 | 4 | Sheep |
| BO1050 | 4 | Sheep |
| EM1023 | 4 | Sheep |
| EM1031 | 4 | Sheep |
| EM1034 | 4 | Sheep |
| EO0046 | 4 | Sheep |
| JS0210.1 | 4 | Sheep |
| JS0316 | 4 | Sheep |
| NJ0032 | 4 | Sheep |
| NN0207.1 | 4 | Sheep |
| NN0329 | 4 | Sheep |
| BO1008 | 3 | Sheep |
| CJ1004 | 3 | Sheep |
| CJ1015 | 3 | Sheep |
| EM1085 | 3 | Sheep |
| EM1090 | 3 | Sheep |
| JO1028 | 3 | Sheep |
| JS0210.2 | 3 | Sheep |
| JS0249 | 3 | Sheep |
| JS0273 | 3 | Sheep |
| JS0285 | 3 | Sheep |
| JS0344 | 3 | Sheep |
| JS0419 | 3 | Sheep |
| NJ0044 | 3 | Sheep |
| NN0081 | 3 | Sheep |
| NN0234 | 3 | Sheep |
| BO1059 | 2 | Sheep |
| JO1082 | 2 | Sheep |
| JS0262 | 2 | Sheep |
| JS0350 | 2 | Sheep |
| NN0200 | 2 | Sheep |
| NN0212 | 2 | Sheep |
suppressPackageStartupMessages({
library(dplyr); library(tidyr); library(stringr); library(purrr)
})
classify_diet_practice <- function(
diet_percentages,
total_amounts,
ctrl_key,
classif_ing_simple,
TOTAL_TOL = 0.05, # ±5% total DM offered
ING_TOL = 1.0, # %-point threshold to count a change
BAL_TOL = 3.0 # %-point imbalance allowed for a "balanced swap"
){
ensure_col <- function(df, col, fill = NA_character_) {
if (!col %in% names(df)) df[[col]] <- fill
df
}
norm_item <- function(x){
x %>%
stringr::str_to_lower() %>%
stringr::str_replace_all("[[:space:]]+", " ") %>%
stringr::str_trim()
}
# ---- Ingredient lookup (dedup + normalized) ----
ing_lu <- classif_ing_simple %>%
transmute(
D.Item_norm = norm_item(D.Item),
Ingredient_Category_lu = Ingredient_Category
) %>%
filter(!is.na(D.Item_norm), D.Item_norm != "") %>%
group_by(D.Item_norm) %>%
summarise(Ingredient_Category_lu = dplyr::first(na.omit(Ingredient_Category_lu)),
.groups = "drop")
# ---- Control lookup (1 control per B.Code) ----
ctrl_lu <- ctrl_key %>%
filter(tolower(T.Control) == "yes") %>%
arrange(B.Code, A.Level.Name) %>%
group_by(B.Code) %>%
summarise(Control = first(A.Level.Name), .groups = "drop")
# ---- Totals to kg (aggregate duplicates) ----
totals_kg <- total_amounts %>%
mutate(Total_kg = case_when(
D.Unit.Amount == "g" ~ Total_Amount/1000,
D.Unit.Amount == "kg" ~ Total_Amount,
TRUE ~ NA_real_
)) %>%
group_by(B.Code, A.Level.Name) %>%
summarise(Total_kg = sum(Total_kg, na.rm = TRUE), .groups = "drop")
# ---- Percentages + robust category join ----
dp <- ensure_col(diet_percentages, "Ingredient_Category")
pct_long <- dp %>%
select(B.Code, A.Level.Name, D.Item, Percentage_of_Diet, Ingredient_Category) %>%
group_by(B.Code, A.Level.Name, D.Item, Ingredient_Category) %>%
summarise(Percentage_of_Diet = sum(Percentage_of_Diet, na.rm = TRUE), .groups = "drop") %>%
mutate(D.Item_norm = norm_item(D.Item)) %>%
left_join(ing_lu, by = "D.Item_norm") %>%
mutate(
Ingredient_Category = coalesce(Ingredient_Category, Ingredient_Category_lu, "Unclassified")
) %>%
select(-D.Item_norm, -Ingredient_Category_lu)
# ---- Pairs (treatment vs control) ----
pairs <- pct_long %>%
distinct(B.Code, A.Level.Name) %>%
inner_join(ctrl_lu, by = "B.Code") %>%
filter(A.Level.Name != Control) %>%
transmute(B.Code, Control, Trt = A.Level.Name) %>%
distinct()
trt_tbl <- pct_long %>%
rename(Trt = A.Level.Name, p_trt = Percentage_of_Diet) %>%
select(B.Code, Trt, D.Item, Ingredient_Category, p_trt)
ctrl_tbl <- pct_long %>%
rename(Control = A.Level.Name, p_ctrl = Percentage_of_Diet) %>%
select(B.Code, Control, D.Item, p_ctrl)
trt_vs_ctrl <- pairs %>%
left_join(trt_tbl, by = c("B.Code","Trt")) %>%
left_join(ctrl_tbl, by = c("B.Code","Control","D.Item")) %>%
mutate(
p_trt = coalesce(p_trt, 0),
p_ctrl = coalesce(p_ctrl, 0),
dP = p_trt - p_ctrl,
dir = case_when(
dP >= ING_TOL ~ "increase",
dP <= -ING_TOL ~ "decrease",
TRUE ~ "nochange"
)
)
delta_totals <- pairs %>%
left_join(totals_kg %>% rename(Total_trt_kg = Total_kg),
by = c("B.Code","Trt" = "A.Level.Name")) %>%
left_join(totals_kg %>% rename(Total_ctrl_kg = Total_kg),
by = c("B.Code","Control" = "A.Level.Name")) %>%
mutate(ΔTOTAL = (Total_trt_kg - Total_ctrl_kg)/Total_ctrl_kg)
# Precompute top-3 item changes (avoids the earlier vctrs error)
top_changes <- trt_vs_ctrl %>%
group_by(B.Code, Control, Trt) %>%
arrange(desc(abs(dP)), .by_group = TRUE) %>%
summarise(
Top_changes = paste(
head(sprintf("%s %s%.1f%%", D.Item, if_else(dP>=0,"+",""), dP), 3),
collapse = "; "
),
.groups = "drop"
)
summarize_types <- function(x) paste(sort(unique(na.omit(x))), collapse = ", ")
paired <- trt_vs_ctrl %>%
left_join(delta_totals %>% select(B.Code, Trt, ΔTOTAL), by = c("B.Code","Trt")) %>%
group_by(B.Code, Control, Trt, ΔTOTAL) %>%
summarise(
POS_sum = sum(pmax(dP,0), na.rm = TRUE),
NEG_sum = sum(pmax(-dP,0), na.rm = TRUE),
n_pos = sum(dir=="increase", na.rm = TRUE),
n_neg = sum(dir=="decrease", na.rm = TRUE),
Types_added = summarize_types(Ingredient_Category[dir=="increase"]),
Types_decreased = summarize_types(Ingredient_Category[dir=="decrease"]),
.groups = "drop"
) %>%
left_join(top_changes, by = c("B.Code","Control","Trt")) %>%
mutate(
Practice = case_when(
is.finite(ΔTOTAL) & abs(ΔTOTAL) > TOTAL_TOL & POS_sum >= ING_TOL ~ "Addition",
is.finite(ΔTOTAL) & abs(ΔTOTAL) <= TOTAL_TOL &
POS_sum >= ING_TOL & NEG_sum >= ING_TOL &
abs(POS_sum - NEG_sum) <= BAL_TOL ~ "Substitution",
is.finite(ΔTOTAL) & abs(ΔTOTAL) <= TOTAL_TOL &
POS_sum >= ING_TOL & NEG_sum < ING_TOL ~ "Addition",
(POS_sum + NEG_sum) < ING_TOL ~ "No change",
TRUE ~ "Substitution"
),
Magnitude_pct = case_when(
Practice == "Addition" ~ POS_sum,
Practice == "Substitution" ~ 0.5*(POS_sum + NEG_sum),
TRUE ~ 0
)
) %>%
rowwise() %>%
mutate(
Substitution_pairs = {
if (Practice != "Substitution") {
""
} else {
b <- B.Code; t <- Trt
pos_type <- trt_vs_ctrl %>%
filter(B.Code == b, Trt == t) %>%
group_by(Ingredient_Category) %>%
summarise(pos = sum(pmax(dP,0), na.rm=TRUE), .groups="drop") %>%
filter(pos > 0) %>% arrange(desc(pos))
neg_type <- trt_vs_ctrl %>%
filter(B.Code == b, Trt == t) %>%
group_by(Ingredient_Category) %>%
summarise(neg = sum(pmax(-dP,0), na.rm=TRUE), .groups="drop") %>%
filter(neg > 0) %>% arrange(desc(neg))
np <- min(nrow(pos_type), nrow(neg_type))
if (np == 0) "" else
paste(paste0(neg_type$Ingredient_Category[seq_len(np)],
" → ",
pos_type$Ingredient_Category[seq_len(np)]),
collapse="; ")
}
}
) %>%
ungroup() %>%
arrange(B.Code, Control, Trt)
paired
}
# ── Run it ──
practice_tbl <- classify_diet_practice(
diet_percentages = diet_percentages,
total_amounts = total_amounts,
ctrl_key = ctrl_key,
classif_ing_simple = classif_ing_simple,
TOTAL_TOL = 0.05, ING_TOL = 1.0, BAL_TOL = 3.0
)
suppressPackageStartupMessages({
library(dplyr); library(stringr); library(tidyr); library(purrr)
})
# --- Rebuild the same control lookup used in the function ---
ctrl_lu <- ctrl_key %>%
filter(tolower(T.Control) == "yes") %>%
arrange(B.Code, A.Level.Name) %>%
group_by(B.Code) %>%
summarise(Control = first(A.Level.Name), .groups = "drop")
# --- Diet coverage per paper (from the raw diet_percentages you feed the function) ---
diets_per_code <- diet_percentages %>%
filter(!is.na(B.Code), !is.na(A.Level.Name)) %>%
group_by(B.Code) %>%
summarise(
n_diets = n_distinct(A.Level.Name),
n_rows = n(), # total ingredient rows for those diets
.groups = "drop"
)
# --- Candidates that *should* be classifiable: have a control & at least 2 diets ---
candidates <- diets_per_code %>%
left_join(ctrl_lu %>% mutate(has_control = TRUE), by = "B.Code") %>%
mutate(has_control = coalesce(has_control, FALSE)) %>%
mutate(eligible = has_control & n_diets >= 2)
# --- B.Codes that actually produced a practice row ---
in_practice <- practice_tbl %>% distinct(B.Code)
# --- 1) Papers that could not be classified at all (no rows in practice_tbl) ---
unclassified_papers <- candidates %>%
mutate(in_practice = B.Code %in% in_practice$B.Code) %>%
filter(!in_practice | !eligible) %>% # either not produced, or not eligible
transmute(
B.Code,
n_diets,
has_control,
reason = case_when(
!has_control ~ "No control diet flagged in ctrl_key",
n_diets < 2 ~ "Only one diet available (need ≥ 2)",
has_control & n_diets >= 2 & !in_practice ~ "Pairing failed (no Trt vs Control built)",
TRUE ~ "Other/unknown"
)
) %>%
arrange(desc(has_control), desc(n_diets), B.Code)
# --- 2) Papers that *were* classified, but every comparison is "No change" ---
no_change_papers <- practice_tbl %>%
group_by(B.Code) %>%
summarise(
n_pairs = n(), # number of Trt vs Control pairs
n_no_change = sum(Practice == "No change"),
all_no_change = n_pairs > 0 & n_no_change == n_pairs,
.groups = "drop"
) %>%
filter(all_no_change) %>%
arrange(desc(n_pairs))
# (Optional) attach product/species to both tables
prod_lu <- merged_metadata$`Prod.Out` %>%
distinct(B.Code, P.Product)
unclassified_papers <- unclassified_papers %>%
left_join(prod_lu, by = "B.Code") %>%
relocate(P.Product, .after = B.Code)
no_change_papers <- no_change_papers %>%
left_join(prod_lu, by = "B.Code") %>%
relocate(P.Product, .after = B.Code)
# --- Quick summaries you can show in a slide ---
summary_unclassified <- unclassified_papers %>%
count(reason, name = "n_papers") %>%
arrange(desc(n_papers))
summary_no_change_by_prod <- no_change_papers %>%
count(P.Product, name = "n_papers_all_no_change")
# Peek
unclassified_papers %>% head(10)
## # A tibble: 10 × 5
## B.Code P.Product n_diets has_control reason
## <chr> <chr> <int> <lgl> <chr>
## 1 BO1028 Cattle 1 TRUE Only one diet available (need ≥ 2)
## 2 CJ1006 Cattle 1 TRUE Only one diet available (need ≥ 2)
## 3 DK0001 Pigs 1 TRUE Only one diet available (need ≥ 2)
## 4 EM1069 Chicken 1 TRUE Only one diet available (need ≥ 2)
## 5 HK0337 Goat 1 TRUE Only one diet available (need ≥ 2)
## 6 JS0172 Goat 1 TRUE Only one diet available (need ≥ 2)
## 7 JS0201.2 Cattle 1 TRUE Only one diet available (need ≥ 2)
## 8 JS0345 Cattle 1 TRUE Only one diet available (need ≥ 2)
## 9 LM0124 Cattle 1 TRUE Only one diet available (need ≥ 2)
## 10 LM0227.1 Cattle 1 TRUE Only one diet available (need ≥ 2)
no_change_papers %>% head(10)
## # A tibble: 7 × 5
## B.Code P.Product n_pairs n_no_change all_no_change
## <chr> <chr> <int> <int> <lgl>
## 1 BO1012 Not Found 4 4 TRUE
## 2 EM1062 Chicken 3 3 TRUE
## 3 NN0207.1 Sheep 3 3 TRUE
## 4 NN0207.2 Cattle 3 3 TRUE
## 5 BO1008 Sheep 2 2 TRUE
## 6 JS0350 Sheep 1 1 TRUE
## 7 NN0092.2 Goat 1 1 TRUE
summary_unclassified
## # A tibble: 2 × 2
## reason n_papers
## <chr> <int>
## 1 Only one diet available (need ≥ 2) 16
## 2 No control diet flagged in ctrl_key 12
summary_no_change_by_prod
## # A tibble: 5 × 2
## P.Product n_papers_all_no_change
## <chr> <int>
## 1 Cattle 1
## 2 Chicken 1
## 3 Goat 1
## 4 Not Found 1
## 5 Sheep 3
# B.Codes that appear in the practice table
practice_codes <- practice_tbl %>%
distinct(B.Code)
# Intersect with your ≥2-diets papers
practice_in_2plus <- practice_codes %>%
semi_join(papers_2plus %>% select(B.Code), by = "B.Code")
# Attach species and count by product
practice_in_2plus_by_product <- practice_in_2plus %>%
left_join(
merged_metadata$`Prod.Out` %>%
filter(P.Product %in% c("Cattle","Sheep","Goat")) %>%
distinct(B.Code, P.Product),
by = "B.Code"
) %>%
count(P.Product, name = "Papers_with_practice_and_≥2_classified_diets")
# Show the summary
knitr::kable(
practice_in_2plus_by_product,
caption = "Papers per product that are in practice_tbl AND have ≥2 fully classified diets"
)
| P.Product | Papers_with_practice_and_≥2_classified_diets |
|---|---|
| Cattle | 12 |
| Goat | 18 |
| Sheep | 34 |
# (Optional) list the actual codes
knitr::kable(
practice_in_2plus %>%
left_join(
merged_metadata$`Prod.Out` %>%
distinct(B.Code, P.Product),
by = "B.Code"
) %>%
arrange(P.Product, B.Code),
caption = "B.Codes with ≥2 fully classified diets that also have a practice"
)
| B.Code | P.Product |
|---|---|
| BO1033 | Cattle |
| EM1093 | Cattle |
| HK0086 | Cattle |
| JS0205 | Cattle |
| JS0215.1 | Cattle |
| JS0215.2 | Cattle |
| JS0324.1 | Cattle |
| JS0324.2 | Cattle |
| LM0101 | Cattle |
| NJ0025 | Cattle |
| NN0207.2 | Cattle |
| NN0280.2 | Cattle |
| BO1015 | Goat |
| BO1076 | Goat |
| BO1098 | Goat |
| EO0057 | Goat |
| HK0107 | Goat |
| HK0171 | Goat |
| HK0182 | Goat |
| HK0350 | Goat |
| JS0192 | Goat |
| JS0193 | Goat |
| JS0196 | Goat |
| JS0197 | Goat |
| JS0208 | Goat |
| JS0261 | Goat |
| LM0248 | Goat |
| LM0280 | Goat |
| NN0092.2 | Goat |
| NN0375 | Goat |
| BO1015 | Not Found |
| 0 | Sheep |
| AN0007 | Sheep |
| BO1008 | Sheep |
| BO1025 | Sheep |
| CJ1015 | Sheep |
| EM1023 | Sheep |
| EM1034 | Sheep |
| EM1085 | Sheep |
| EM1090 | Sheep |
| EO0046 | Sheep |
| HK0033 | Sheep |
| HK0082 | Sheep |
| JO1082 | Sheep |
| JS0210.1 | Sheep |
| JS0210.2 | Sheep |
| JS0249 | Sheep |
| JS0262 | Sheep |
| JS0273 | Sheep |
| JS0285 | Sheep |
| JS0291 | Sheep |
| JS0301 | Sheep |
| JS0316 | Sheep |
| JS0344 | Sheep |
| JS0350 | Sheep |
| JS0419 | Sheep |
| NJ0008 | Sheep |
| NJ0032 | Sheep |
| NJ0044 | Sheep |
| NN0081 | Sheep |
| NN0200 | Sheep |
| NN0207.1 | Sheep |
| NN0212 | Sheep |
| NN0234 | Sheep |
| NN0329 | Sheep |
suppressPackageStartupMessages({
library(dplyr); library(tidyr); library(stringr)
})
# --- Roman → full name (keeps non-Roman as-is) -----------------------------
roman_to_name <- function(x) dplyr::recode(
x,
"I" = "High-starch",
"II" = "High-fat",
"III"= "Protein",
"IV" = "Good forage",
"V" = "Poor forage",
"VI" = "Mineral/Vitamin",
.default = x, .missing = NA_character_
)
# --- Normalise item names to improve join hit-rate -------------------------
norm_item <- function(x){
x %>%
stringr::str_to_lower() %>%
stringr::str_replace_all("[[:space:]]+", " ") %>%
stringr::str_trim() %>%
# a couple of handy harmonisations seen in your data:
stringr::str_replace("^unspecified\\s+molasses$", "molasses") %>%
stringr::str_replace_all("\\bbran\\b", "offal")
}
# --- 1) Ingredient lookup (D.Item -> Ingredient_Category) ------------------
stopifnot(all(c("D.Item","Ingredient_Category") %in% names(classif_ing_simple)))
ing_lu <- classif_ing_simple %>%
transmute(
D.Item_norm = norm_item(D.Item),
Cat_ing = roman_to_name(Ingredient_Category)
) %>%
filter(!is.na(D.Item_norm), D.Item_norm != "") %>%
group_by(D.Item_norm) %>%
summarise(Cat_ing = first(na.omit(Cat_ing)), .groups = "drop")
# --- 2) Per-arm diet category = top-contributing ingredient type ----------
# expects: diet_percentages has B.Code, A.Level.Name, D.Item, Percentage_of_Diet
stopifnot(all(c("B.Code","A.Level.Name","D.Item","Percentage_of_Diet") %in% names(diet_percentages)))
arm_cat <- diet_percentages %>%
mutate(
D.Item_norm = norm_item(D.Item),
Percentage_of_Diet = coalesce(Percentage_of_Diet, 0)
) %>%
left_join(ing_lu, by = "D.Item_norm") %>%
# sum % by ingredient *category* within each arm
group_by(B.Code, A.Level.Name, Cat_ing) %>%
summarise(pct_cat = sum(Percentage_of_Diet, na.rm = TRUE), .groups = "drop") %>%
# pick the max category per arm (ties broken deterministically)
group_by(B.Code, A.Level.Name) %>%
arrange(desc(pct_cat), Cat_ing, .by_group = TRUE) %>%
slice(1) %>%
ungroup() %>%
transmute(
B.Code, A.Level.Name,
Diet_Category = if_else(is.na(Cat_ing) | Cat_ing == "", "Unknown", Cat_ing)
)
diet_cat_lu <- distinct(arm_cat)
# (optional) quick diagnostic
# table(diet_cat_lu$Diet_Category, useNA = "ifany")
# --- 3) Control → Treatment switches (simple) ------------------------------
# ctrl_key must have B.Code, A.Level.Name, T.Control (TRUE/"yes" marks control)
stopifnot(all(c("B.Code","A.Level.Name","T.Control") %in% names(ctrl_key)))
ctrl_lu <- ctrl_key %>%
mutate(is_ctrl = ifelse(is.logical(T.Control), T.Control, tolower(T.Control) == "yes")) %>%
filter(is_ctrl) %>%
arrange(B.Code, A.Level.Name) %>%
group_by(B.Code) %>%
summarise(Control = first(A.Level.Name), .groups = "drop")
pairs <- diet_cat_lu %>%
distinct(B.Code, A.Level.Name) %>%
inner_join(ctrl_lu, by = "B.Code") %>%
filter(A.Level.Name != Control) %>%
transmute(B.Code, Control, Trt = A.Level.Name)
ctl <- diet_cat_lu %>% transmute(B.Code, Control = A.Level.Name, Cat_ctrl = Diet_Category)
trt <- diet_cat_lu %>% transmute(B.Code, Trt = A.Level.Name, Cat_trt = Diet_Category)
diet_switches <- pairs %>%
left_join(ctl, by = c("B.Code","Control")) %>%
left_join(trt, by = c("B.Code","Trt")) %>%
mutate(
Cat_ctrl = ifelse(is.na(Cat_ctrl) | Cat_ctrl=="", "Unknown", Cat_ctrl),
Cat_trt = ifelse(is.na(Cat_trt) | Cat_trt=="", "Unknown", Cat_trt),
Switch = paste0(Cat_ctrl, " \u2192 ", Cat_trt),
Practice_simple = dplyr::case_when(
Cat_ctrl == "Unknown" | Cat_trt == "Unknown" ~ "Unknown",
Cat_ctrl == Cat_trt ~ "Same category",
TRUE ~ "Category switch"
)
) %>%
arrange(B.Code, Control, Trt)
# counts + transition table
switch_counts <- diet_switches %>%
count(Cat_ctrl, Cat_trt, name = "n") %>%
arrange(desc(n))
transition_table <- switch_counts %>%
tidyr::pivot_wider(names_from = Cat_trt, values_from = n, values_fill = 0) %>%
arrange(Cat_ctrl)
# --- Results ready to use ---------------------------------------------------
# diet_switches # one row per trt vs control with Switch text
# switch_counts # tall counts of Cat_ctrl -> Cat_trt
# transition_table # wide matrix of transitions
here we will have the same issue as with feed intake data since the Alevel in the data tab dont always latch the ones in the diet tab
# Step 1: Make sure the key columns match (D.Item and A.Level.Name)
GE_combined_for_merge <- GE_combined %>%
rename(A.Level.Name = D.Item)%>%
select(-"ED.Mean.T")
# Step 2: Merge yield and emission data
yield_with_emissions <- yield_data %>%
left_join(
GE_combined_for_merge,
by = c("B.Code", "A.Level.Name")
)
## Warning in left_join(., GE_combined_for_merge, by = c("B.Code", "A.Level.Name")): Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 821 of `x` matches multiple rows in `y`.
## ℹ Row 360 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
## "many-to-many"` to silence this warning.
# distinct B.Code per product
bcode_per_product <- yield_with_emissions %>% # or yield_data
distinct(P.Product, B.Code) %>% # keep one row per pair
count(P.Product, name = "n_BCodes") # tally them
bcode_per_product
## Index: <Source>
## P.Product n_BCodes
## <char> <int>
## 1: Cattle 19
## 2: Goat 37
## 3: Sheep 25
## 4: <NA> 456
# 0) Yield papers (unique B.Codes that have yield linked)
yield_bcodes <- yield_with_emissions %>%
distinct(B.Code)
practice_with_yield <- practice_in_2plus %>%
semi_join(yield_bcodes, by = "B.Code")
# 3) Count by product (Cattle / Sheep / Goat)
practice_yield_by_product <- practice_with_yield %>%
left_join(
merged_metadata$`Prod.Out` %>%
filter(P.Product %in% c("Cattle","Sheep","Goat")) %>%
distinct(B.Code, P.Product),
by = "B.Code"
) %>%
count(P.Product, name = "Papers_with_practice_≥2_classified_and_yield")
# Filter meat yield data and drop missing values
yield_meat <- yield_with_emissions %>%
filter(Out.Subind == "Meat Yield") %>%
filter(!is.na(ED.Mean.T) & !is.na(CH4_kg_year) & !is.na(P.Product))
# Get axis limits
x_range <- range(yield_meat$ED.Mean.T, na.rm = TRUE)
y_range <- range(yield_meat$CH4_kg_year, na.rm = TRUE)
# Plot
ggplot(yield_meat, aes(x = ED.Mean.T, y = CH4_kg_year, color = P.Product)) +
geom_point(alpha = 0.8, size = 2.5) +
scale_x_log10(
limits = x_range,
breaks = scales::log_breaks(n = 5),
labels = scales::comma_format()
) +
scale_y_continuous(
limits = y_range,
labels = scales::comma_format()
) +
labs(
title = "CH4 Emissions vs. Meat Yield",
x = "Meat Yield per Individual (kg)",
y = "Annual CH4 Emissions (kg)",
color = "Livestock Species"
) +
theme_minimal() +
theme(legend.position = "bottom")
# Filter meat yield data and drop missing values
yield_milk <- yield_with_emissions %>%
filter(Out.Subind == "Milk Yield") %>%
filter(!is.na(ED.Mean.T) & !is.na(CH4_kg_year) & !is.na(P.Product))
# Get axis limits
x_range <- range(yield_milk$ED.Mean.T, na.rm = TRUE)
y_range <- range(yield_milk$CH4_kg_year, na.rm = TRUE)
# Plot
ggplot(yield_milk, aes(x = ED.Mean.T, y = CH4_kg_year, color = P.Product)) +
geom_point(alpha = 0.8, size = 2.5) +
scale_x_log10(
limits = x_range,
breaks = scales::log_breaks(n = 5),
labels = scales::comma_format()
) +
scale_y_continuous(
limits = y_range,
labels = scales::comma_format()
) +
labs(
title = "CH4 Emissions vs. Meat Yield",
x = "Milk Yield per Individual (kg)",
y = "Annual CH4 Emissions (kg)",
color = "Livestock Species"
) +
theme_minimal() +
theme(legend.position = "bottom")
# Filter and clean data
emissions_by_species <- yield_with_emissions %>%
filter(!is.na(CH4_kg_year), !is.na(P.Product))
# Boxplot with transparent points
ggplot(emissions_by_species, aes(x = P.Product, y = CH4_kg_year, fill = P.Product)) +
geom_boxplot(outlier.shape = NA, alpha = 0.4) + # lighter fill and no outliers
geom_jitter(width = 0.2, alpha = 0.5, color = "black", size = 2) + # transparent points
labs(
title = "Annual CH4 Emissions by Livestock Species",
x = "Species",
y = "Annual CH4 Emissions (kg)"
) +
theme_minimal() +
theme(legend.position = "none") # remove redundant legend
# Filter and clean data for Goat and Sheep only
emissions_by_species <- yield_with_emissions %>%
filter(!is.na(CH4_kg_year), P.Product %in% c("Goat", "Sheep"))
# Boxplot with transparent points for Goat and Sheep
ggplot(emissions_by_species, aes(x = P.Product, y = CH4_kg_year, fill = P.Product)) +
geom_boxplot(outlier.shape = NA, alpha = 0.4) +
geom_jitter(width = 0.2, alpha = 0.5, color = "black", size = 2) +
labs(
title = "Annual CH4 Emissions: Goat vs. Sheep",
x = "Species",
y = "Annual CH4 Emissions (kg)"
) +
theme_minimal() +
theme(legend.position = "none")
## 1) Live‑weight table --------------------------------------------------------
bw_tbl <- weights_unique %>%
select(B.Code, LiveWeight_kg = Out.WG.Start) %>%
filter(!is.na(LiveWeight_kg))
## 2) Merge with CH₄ and keep P.Product already present -----------------------
ch4_vs_bw <- GE_combined %>% # contains P.Product
filter(!is.na(CH4_kg_year)) %>%
left_join(bw_tbl, by = "B.Code") %>%
filter(!is.na(LiveWeight_kg) & LiveWeight_kg > 0)
## Warning in left_join(., bw_tbl, by = "B.Code"): Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 1 of `x` matches multiple rows in `y`.
## ℹ Row 1 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
## "many-to-many"` to silence this warning.
## 3) Plot ---------------------------------------------------------------------
##########################################################
## CH4 vs. live‑weight with 95 % confidence intervals ##
##########################################################
ggplot(ch4_vs_bw,
aes(x = LiveWeight_kg, y = CH4_kg_year, colour = P.Product)) +
geom_point(alpha = 0.65, size = 2.3) +
geom_smooth(method = "lm",
formula = y ~ log10(x), # linear in log‑weight
se = TRUE, # << confidence ribbon
linetype = "dashed",
linewidth = 1) +
scale_x_log10(breaks = scales::log_breaks(), labels = scales::comma) +
labs(
title = "Annual CH4 emissions vs. live weight",
x = "Body weight (kg, log scale)",
y = "CH₄ (kg animal⁻¹ year⁻¹)",
colour = "Species"
) +
theme_minimal(base_size = 13) +
theme(legend.position = "bottom")
wg<-merged_metadata$Data.Out%>%filter(Out.Subind=="Weight Gain")
###############################################################################
## STANDARDISE WEIGHT-GAIN TO kg animal⁻¹ day⁻¹ (no totals, no wk/mo) ##
###############################################################################
wg_clean <- wg %>%
## 1. normalise text ---------------------------------------------------------
mutate(
unit_raw = tolower(trimws(Out.Unit)),
value = as.numeric(ED.Mean.T) # be sure it's numeric
) %>%
## 2. discard relative indices (% IBW, % SGR) -------------------------------
filter(
!str_detect(unit_raw, "% ibw") &
!str_detect(unit_raw, "% sgr")
) %>%
## 3. keep only entries that already contain “/d” or “/day” ------------------
filter(str_detect(unit_raw, "/d") | str_detect(unit_raw, "/day") |
str_detect(unit_raw, "g/kg metabolic weight")) %>%
## 4. parse amount unit ------------------------------------------------------
mutate(
amount_unit = case_when(
str_starts(unit_raw, "mg") ~ "mg",
str_starts(unit_raw, "g") ~ "g",
str_starts(unit_raw, "kg") ~ "kg",
TRUE ~ NA_character_
),
gain_kg = case_when(
amount_unit == "mg" ~ value / 1e6,
amount_unit == "g" ~ value / 1e3,
amount_unit == "kg" ~ value,
TRUE ~ NA_real_
)
) %>%
## 5. handle “g/kg metabolic weight” -----------------------------------------
left_join(
weights_unique %>%
select(B.Code, A.Level.Name, BW_start_kg = Out.WG.Start),
by = c("B.Code", "A.Level.Name")
) %>%
mutate(
ADG_kg = case_when(
str_detect(unit_raw, "g/kg metabolic weight") &
!is.na(value) & !is.na(BW_start_kg)
~ (value / 1000) * (BW_start_kg ^ 0.75), # g → kg × BW^0.75
TRUE ~ gain_kg # already kg/d
)
) %>%
## 6. final screen -----------------------------------------------------------
filter(!is.na(ADG_kg) & ADG_kg > 0) %>%
select(B.Code, A.Level.Name, ADG_kg)
############################################################
## EMISSION INTENSITY (kg CH₄ kg-¹ live-weight gain) ##
############################################################
## 1. merge CH₄ and ADG ------------------------------------
ei_df <- GE_combined %>% # CH₄ table
select(B.Code, A.Level.Name = D.Item,
CH4_kg_year, P.Product) %>%
inner_join(
wg_clean, # weight-gain table
by = c("B.Code", "A.Level.Name")
) %>%
## 2. compute emission intensity -------------------------
mutate(
gain_kg_year = ADG_kg * 365, # kg yr-¹
EI_CH4_kg = CH4_kg_year / gain_kg_year
) %>%
## 3. basic QC ------------------------------------------
filter(
!is.na(EI_CH4_kg) & EI_CH4_kg > 0, # remove missing / negative
EI_CH4_kg < 1 # drop extreme outliers (>1 kg CH₄ kg-¹ LW-gain)
)
## Warning in inner_join(., wg_clean, by = c("B.Code", "A.Level.Name")): Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 454 of `x` matches multiple rows in `y`.
## ℹ Row 599 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
## "many-to-many"` to silence this warning.
length(unique(ei_df$B.Code))
## [1] 27
# A) box-and-jitter by species -----------------------------------------------
ggplot(ei_df, aes(x = P.Product, y = EI_CH4_kg, fill = P.Product)) +
geom_boxplot(outlier.shape = NA, alpha = 0.35) +
geom_jitter(width = 0.15, alpha = 0.65, size = 2) +
scale_y_continuous("Emission intensity (kg CH₄ kg⁻¹ LW-gain)",
labels = scales::comma_format()) +
labs(title = "Enteric-methane intensity of growth",
x = NULL) +
theme_minimal(base_size = 13) +
theme(legend.position = "none")
# B) relationship between growth rate and EI ---------------------------------
ggplot(ei_df, aes(x = ADG_kg, y = EI_CH4_kg, colour = P.Product)) +
geom_point(alpha = 0.7, size = 2.3) +
geom_smooth(method = "lm", se = TRUE, linetype = "dashed") +
scale_x_log10("Average daily gain (kg animal⁻¹ day⁻¹)",
breaks = scales::log_breaks(),
labels = scales::comma_format()) +
scale_y_continuous("Emission intensity (kg CH₄ kg⁻¹ LW-gain)",
labels = scales::comma_format()) +
labs(title = "Faster-growing animals generally emit less CH₄ per kg gain",
colour = "Species") +
theme_minimal(base_size = 13) +
theme(legend.position = "bottom")
## `geom_smooth()` using formula = 'y ~ x'
library(dplyr); library(ggplot2); library(scales)
##
## Attaching package: 'scales'
## The following object is masked from 'package:readr':
##
## col_factor
## The following object is masked from 'package:purrr':
##
## discard
# 0. Make sure your weight-gain table is available
# (this is the object you built in the earlier step)
adg_clean <- wg_clean %>% # <- rename for clarity
select(B.Code, A.Level.Name, ADG_kg) # keep only the essentials
# 1. Bring EI together with the absolute-CH₄ table you already built
ei_tbl <- adg_clean %>% # has A.Level.Name
inner_join(
GE_combined,
by = c("B.Code" = "B.Code", # same name
"A.Level.Name" = "D.Item") # different names
) %>%
mutate(
EI = CH4_kg_year / (ADG_kg * 365) # kg CH₄ kg⁻¹ LW-gain
)
# 2. Median split lines (or choose any threshold you like)
vline <- median(ei_tbl$CH4_kg_year, na.rm = TRUE)
hline <- median(ei_tbl$EI, na.rm = TRUE)
x_lim <- range(ei_tbl$ADG_kg, na.rm = TRUE) # e.g. 0.002 – 0.8
y_lim <- range(ei_tbl$EI, na.rm = TRUE) # e.g. 0.005 – 1.2
ch4_lim <- range(ei_tbl$CH4_kg_year, na.rm = TRUE)
# 3. Plot ----------------------------------------------------------------
ggplot(ei_tbl,
aes(x = ADG_kg,
y = EI,
colour = CH4_kg_year,
size = CH4_kg_year)) +
geom_point(alpha = 0.7) +
geom_smooth(method = "lm", se = TRUE, colour = "black") +
scale_x_log10(limits = x_lim,
breaks = log_breaks(),
labels = comma_format()) +
scale_y_log10(limits = y_lim,
breaks = c(.005, .01, .03, .1, .3, 1),
labels = comma_format()) +
scale_colour_viridis_c(option = "plasma",
trans = "log10",
limits = ch4_lim,
name = "Annual CH₄ (kg)") +
scale_size(range = c(2, 6),
trans = "log10",
limits = ch4_lim,
guide = "none") +
facet_wrap(~ P.Product) + # same axes in every facet
labs(title = "Faster-growing animals emit less CH4 per kg gain",
subtitle = "Axes are identical across cattle, goat and sheep panels",
x = "Average daily gain (kg animal⁻¹ day⁻¹, log scale)",
y = "Emission intensity (kg CH4 kg⁻¹ live-weight gain)",
caption = "Point colour & size scale with absolute annual CH4 per animal") +
theme_minimal(base_size = 12) +
theme(legend.position = "right")
## `geom_smooth()` using formula = 'y ~ x'
##############################################
## 1. PREPARE DAILY YIELD (kg animal-1 d-1)
##############################################
## density of milk for L → kg conversion
MILK_DENSITY <- 1.03 # kg L-1
yield_d_clean <- yield_data %>% # ← your harmonised table
filter(Out.Subind %in% c("Milk Yield", "Meat Yield")) %>%
mutate(
Yield_kg_day = case_when(
# ───────────── Milk ─────────────────────────────────────────────
Out.Subind == "Milk Yield" & Out.Unit == "l/individual/day" ~ ED.Mean.T * MILK_DENSITY,
Out.Subind == "Milk Yield" & Out.Unit == "kg/individual/day" ~ ED.Mean.T,
# ───────────── Meat (already kg day-1) ─────────────────────────
Out.Subind == "Meat Yield" & Out.Unit == "kg/individual/day" ~ ED.Mean.T,
# ───────────── Everything else (drop) ─────────────────────────
TRUE ~ NA_real_
)
) %>%
filter(!is.na(Yield_kg_day) & Yield_kg_day > 0) %>% # keep usable rows only
select(B.Code, A.Level.Name, Out.Subind, Yield_kg_day)
###########################################################################
## 2. COMPUTE EMISSION-INTENSITY (kg CH4 per kg *product*) #############
###########################################################################
ei_prod <- GE_combined %>% # absolute CH4 table
select(B.Code, A.Level.Name = D.Item,
CH4_kg_year, P.Product,T.Control) %>%
inner_join(yield_d_clean, by = c("B.Code","A.Level.Name")) %>%
mutate(
Yield_kg_year = Yield_kg_day * 365,
EI_CH4_prod = CH4_kg_year / Yield_kg_year,
Product = if_else(Out.Subind == "Milk Yield", "Milk", "Meat")
) %>%
filter(EI_CH4_prod > 0, EI_CH4_prod < 10) # drop negatives / crazy outliers
## Warning in inner_join(., yield_d_clean, by = c("B.Code", "A.Level.Name")): Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 360 of `x` matches multiple rows in `y`.
## ℹ Row 2309 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
## "many-to-many"` to silence this warning.
#########################################
## 3. QUICK DIAGNOSTIC PLOTS ##########
#########################################
# ── A) Box-and-jitter by species & product ───────────────────────────────
ggplot(ei_prod, aes(x = P.Product, y = EI_CH4_prod,
fill = P.Product)) +
geom_boxplot(outlier.shape = NA, alpha = .35) +
geom_jitter(width = .15, alpha = .65, size = 2) +
scale_y_log10("Emission-intensity (kg CH₄ kg⁻¹ product)",
breaks = scales::log_breaks(),
labels = scales::comma_format()) +
facet_wrap(~ Product, nrow = 1) +
labs(title = "Enteric-methane intensity of milk and meat production",
x = NULL) +
theme_minimal(base_size = 13) +
theme(legend.position = "none")
# ── B) Relationship between yield and EI ────────────────────────────────
ggplot(ei_prod,
aes(x = Yield_kg_day,
y = EI_CH4_prod,
colour = P.Product)) +
geom_point(alpha = .7, size = 2.3) +
geom_smooth(method = "lm", se = TRUE, linetype = "dashed") +
scale_x_log10("Yield (kg animal⁻¹ day⁻¹, log scale)",
breaks = scales::log_breaks(),
labels = scales::comma_format()) +
scale_y_log10("Emission-intensity (kg CH₄ kg⁻¹ product)",
breaks = scales::log_breaks(),
labels = scales::comma_format()) +
facet_wrap(~ Product, nrow = 1) +
labs(title = "Higher-yielding animals emit less CH₄ per kg of product",
colour = "Species") +
theme_minimal(base_size = 13) +
theme(legend.position = "bottom")
## `geom_smooth()` using formula = 'y ~ x'
length(unique(ei_prod$B.Code))
## [1] 44
suppressPackageStartupMessages({
library(dplyr); library(tidyr); library(stringr); library(ggplot2); library(scales)
})
# ---------- 0) Pick the EI/CH4 fields you want ------------------------------
ei_core <- ei_prod %>%
select(B.Code,
A.Level.Name,
EI_CH4_prod, # emission intensity (kg CH4 / kg product)
CH4_kg_year, # absolute (if available)
Product, P.Product,
T.Control, # "yes"/"no" or TRUE/FALSE
Yield_kg_day, Yield_kg_year)
# ---------- 1) Join practice_tbl to EI on both arms --------------------------
# practice_tbl must have: B.Code, Control, Trt, Practice, Magnitude_pct, ΔTOTAL, Substitution_pairs, ...
stopifnot(all(c("B.Code","Control","Trt","Practice","Magnitude_pct") %in% names(practice_tbl)))
practice_ei <- practice_tbl %>%
# attach control EI
left_join(ei_core, by = c("B.Code","Control" = "A.Level.Name")) %>%
# attach treatment EI (suffix conflicts)
left_join(ei_core, by = c("B.Code","Trt" = "A.Level.Name"),
suffix = c("_ctrl", "_trt")) %>%
# rename the key metrics so they're easier to read
rename(
EI_ctrl = EI_CH4_prod_ctrl,
EI_trt = EI_CH4_prod_trt,
CH4_ctrl = CH4_kg_year_ctrl,
CH4_trt = CH4_kg_year_trt,
Yield_day_ctrl = Yield_kg_day_ctrl,
Yield_day_trt = Yield_kg_day_trt,
Yield_year_ctrl = Yield_kg_year_ctrl,
Yield_year_trt = Yield_kg_year_trt
) %>%
mutate(
# deltas: + means treatment is worse/higher
delta_EI_prod = EI_trt - EI_ctrl,
delta_CH4_year = CH4_trt - CH4_ctrl,
delta_Yield_day = Yield_day_trt - Yield_day_ctrl,
delta_Yield_yr = Yield_year_trt - Yield_year_ctrl,
# best-available product/species label
Product = coalesce(Product_trt, Product_ctrl, P.Product_trt, P.Product_ctrl)
) %>%
filter(!is.na(delta_EI_prod)) # keep only pairs with EI on both sides
## Warning in left_join(., ei_core, by = c("B.Code", Control = "A.Level.Name")): Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 17 of `x` matches multiple rows in `y`.
## ℹ Row 3 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
## "many-to-many"` to silence this warning.
## Warning in left_join(., ei_core, by = c("B.Code", Trt = "A.Level.Name"), : Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 17 of `x` matches multiple rows in `y`.
## ℹ Row 17 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
## "many-to-many"` to silence this warning.
# Quick species count (how many papers per species)
species_counts <- practice_ei %>%
distinct(B.Code, Species = coalesce(P.Product_ctrl, P.Product_trt, Product_ctrl, Product_trt)) %>%
count(Species, name = "n_papers") %>%
arrange(desc(n_papers))
# ---------- 2) Simple summaries ---------------------------------------------
# (a) By practice (Addition/Substitution/No change)
summary_by_practice <- practice_ei %>%
group_by(Practice) %>%
summarise(
n_pairs = n(),
mean_ΔEI = mean(delta_EI_prod, na.rm = TRUE),
median_ΔEI= median(delta_EI_prod, na.rm = TRUE),
mean_mag = mean(Magnitude_pct, na.rm = TRUE),
.groups = "drop"
)
# (b) By practice × species
summary_by_practice_species <- practice_ei %>%
group_by(Product, Practice) %>%
summarise(
n_pairs = n(),
mean_ΔEI = mean(delta_EI_prod, na.rm = TRUE),
median_ΔEI= median(delta_EI_prod, na.rm = TRUE),
mean_mag = mean(Magnitude_pct, na.rm = TRUE),
.groups = "drop"
) %>%
arrange(Product, Practice)
# ---------- 3) Plots (practice-level) ---------------------------------------
# 3a) Scatter: magnitude vs ΔEI by practice
p_scatter <- ggplot(practice_ei,
aes(Magnitude_pct, delta_EI_prod, colour = Practice)) +
geom_hline(yintercept = 0, linetype = 2, colour = "grey60") +
geom_point(alpha = .7, size = 2) +
geom_smooth(method = "lm", se = TRUE, colour = "black", formula = y ~ x) +
scale_x_continuous("% of diet DM moved (practice magnitude)") +
scale_y_continuous("Δ EI (kg CH₄ / kg product)") +
facet_wrap(~ Product) +
theme_bw() +
theme(legend.position = "bottom")
# 3b) Violin/box: ΔEI by practice
p_violin <- ggplot(practice_ei,
aes(Practice, delta_EI_prod, fill = Practice)) +
geom_violin(trim = FALSE, alpha = .35, colour = NA) +
geom_boxplot(width = .18, outlier.shape = NA) +
stat_summary(fun = median, geom = "point", size = 2, colour = "black") +
geom_hline(yintercept = 0, linetype = 2, colour = "grey50") +
scale_y_continuous("Δ EI (kg CH₄ / kg product)") +
facet_wrap(~ Product, nrow = 1) +
labs(x = NULL, title = "Emission-intensity shift by practice") +
theme_bw() +
theme(legend.position = "none")
# 3c) Trade-off: ΔYield vs ΔEI; size = magnitude
p_tradeoff <- ggplot(practice_ei,
aes(delta_Yield_day, delta_EI_prod,
colour = Practice,
size = Magnitude_pct)) +
geom_vline(xintercept = 0, linetype = 2, colour = "grey60") +
geom_hline(yintercept = 0, linetype = 2, colour = "grey60") +
geom_point(alpha = .75) +
scale_size(range = c(2,6), name = "% moved") +
scale_x_continuous("Δ Yield (kg animal⁻¹ d⁻¹)", labels = comma) +
scale_y_continuous("Δ EI (kg CH₄ / kg product)", labels = comma) +
facet_wrap(~ Product) +
labs(title = "Productivity vs emission-intensity trade-offs, by practice") +
theme_minimal() +
theme(legend.position = "bottom")
# Print them as needed:
# p_scatter; p_violin; p_tradeoff
# ---------- 4) Optional: look inside Substitution pairs ---------------------
# Uses the text column `Substitution_pairs` built in practice_tbl, e.g. "Forage → Concentrate; X → Y"
subs_pairs <- practice_ei %>%
filter(Practice == "Substitution") %>%
select(B.Code, Control, Trt, Product, Magnitude_pct, delta_EI_prod, Substitution_pairs) %>%
mutate(Substitution_pairs = ifelse(is.na(Substitution_pairs), "", Substitution_pairs)) %>%
filter(Substitution_pairs != "") %>%
separate_rows(Substitution_pairs, sep = ";\\s*") %>%
mutate(
left = str_trim(str_split_fixed(Substitution_pairs, "→", 2)[,1]),
right = str_trim(str_split_fixed(Substitution_pairs, "→", 2)[,2])
) %>%
filter(left != "", right != "") %>%
mutate(pair_id = paste(left, "→", right))
# quick tally and mean ΔEI per pair (overall and by species)
subs_pair_summary <- subs_pairs %>%
group_by(pair_id) %>%
summarise(n = n(),
mean_ΔEI = mean(delta_EI_prod, na.rm = TRUE),
mean_mag = mean(Magnitude_pct, na.rm = TRUE),
.groups = "drop") %>%
arrange(desc(n))
subs_pair_by_product <- subs_pairs %>%
group_by(Product, pair_id) %>%
summarise(n = n(),
mean_ΔEI = mean(delta_EI_prod, na.rm = TRUE),
mean_mag = mean(Magnitude_pct, na.rm = TRUE),
.groups = "drop") %>%
arrange(Product, desc(n))
# Example plot: ΔEI vs % moved, coloured by pair direction
p_subs <- ggplot(subs_pairs,
aes(x = delta_EI_prod, y = Magnitude_pct,
colour = pair_id)) +
geom_vline(xintercept = 0, linetype = 2, colour = "grey60") +
geom_point(alpha = .8) +
facet_wrap(~ Product) +
scale_y_continuous("% of diet DM substituted") +
scale_x_continuous("Δ EI (kg CH₄ / kg product)", breaks = pretty_breaks()) +
labs(title = "Substitution pairs: direction & intensity vs ΔEI",
colour = "Removed → Added") +
theme_bw() +
theme(legend.position = "right")
# ---------- 5) A compact “effect” label (optional) --------------------------
EI_THRESH <- 0.02 # tweak as you like (kg CH4 / kg product)
practice_ei <- practice_ei %>%
mutate(effect = case_when(
delta_EI_prod <= -EI_THRESH ~ "Lower EI",
delta_EI_prod >= EI_THRESH ~ "Higher EI",
TRUE ~ "Neutral"
))
effect_by_practice_species <- practice_ei %>%
group_by(Product, Practice, effect) %>%
summarise(n = n(), .groups = "drop") %>%
group_by(Product, Practice) %>%
mutate(pct = 100*n / sum(n)) %>%
ungroup() %>%
arrange(Product, Practice, desc(pct))
# View the main outputs:
species_counts
## # A tibble: 3 × 2
## Species n_papers
## <chr> <int>
## 1 Goat 9
## 2 Sheep 8
## 3 Cattle 3
summary_by_practice
## # A tibble: 3 × 5
## Practice n_pairs mean_ΔEI median_ΔEI mean_mag
## <chr> <int> <dbl> <dbl> <dbl>
## 1 Addition 113 0.0000412 0.0000104 27.2
## 2 No change 42 0.0431 0.00256 0
## 3 Substitution 268 -0.00232 -0.00277 107.
summary_by_practice_species
## # A tibble: 6 × 6
## Product Practice n_pairs mean_ΔEI median_ΔEI mean_mag
## <chr> <chr> <int> <dbl> <dbl> <dbl>
## 1 Meat Addition 107 -0.000965 0.0000104 27.1
## 2 Meat No change 9 -0.000900 -0.000619 0
## 3 Meat Substitution 17 -0.00614 -0.00000535 79.7
## 4 Milk Addition 6 0.0180 0.00456 29.2
## 5 Milk No change 33 0.0551 0.00256 0
## 6 Milk Substitution 251 -0.00206 -0.00277 109.
subs_pair_summary
## # A tibble: 9 × 4
## pair_id n mean_ΔEI mean_mag
## <chr> <int> <dbl> <dbl>
## 1 II → Unclassified 10 -0.000120 128.
## 2 I → II 8 -0.00404 6.22
## 3 II → VI 8 -0.00404 6.22
## 4 II → IV 3 -0.000389 26.1
## 5 I → III 1 -0.00129 36.8
## 6 IV → II 1 -0.00129 36.8
## 7 Unclassified → IV 1 -0.0000137 32.4
## 8 Unclassified → VI 1 -0.00000123 30
## 9 V → Unclassified 1 -0.0000200 10
subs_pair_by_product
## # A tibble: 13 × 5
## Product pair_id n mean_ΔEI mean_mag
## <chr> <chr> <int> <dbl> <dbl>
## 1 Meat II → Unclassified 9 -0.00000535 138
## 2 Meat I → II 4 -0.0261 6.22
## 3 Meat II → VI 4 -0.0261 6.22
## 4 Meat II → IV 2 -0.00000395 22.5
## 5 Meat Unclassified → IV 1 -0.0000137 32.4
## 6 Meat Unclassified → VI 1 -0.00000123 30
## 7 Meat V → Unclassified 1 -0.0000200 10
## 8 Milk I → II 4 0.0180 6.22
## 9 Milk II → VI 4 0.0180 6.22
## 10 Milk I → III 1 -0.00129 36.8
## 11 Milk II → IV 1 -0.00116 33.3
## 12 Milk II → Unclassified 1 -0.00115 33.3
## 13 Milk IV → II 1 -0.00129 36.8
p_scatter; p_violin; p_tradeoff; p_subs
suppressPackageStartupMessages({
library(dplyr); library(tidyr); library(stringr); library(ggplot2); library(scales)
})
# -------- helpers -------------------------------------------------------------
norm_key <- function(x){
x %>% tolower() %>% stringr::str_replace_all("[[:space:]]+", " ") %>% trimws()
}
# robust find of WG/ADG subinds (handles many name variants)
is_wg_subind <- function(x){
str_detect(
tolower(x),
"(^|\\b)(weight\\s*gain|daily\\s*average\\s*weight\\s*gain|average\\s*daily\\s*weight\\s*gain|adg)(\\b|$)"
)
}
# -------- 1) STANDARDISE WEIGHT GAIN to kg/day --------------------------------
# Source: merged_metadata$Data.Out (your harmonised outcomes table)
# Uses weights_unique for BW^0.75 cases (g/kg metabolic weight).
stopifnot(exists("merged_metadata"), "Data.Out" %in% names(merged_metadata))
stopifnot(exists("weights_unique"))
wg_raw <- merged_metadata$Data.Out %>%
filter(!is.na(Out.Subind)) %>%
filter(is_wg_subind(Out.Subind)) %>%
mutate(
unit_raw = tolower(str_squish(Out.Unit)),
value = suppressWarnings(as.numeric(ED.Mean.T))
) %>%
# keep only per-day measures or metabolic-weight forms
filter(
str_detect(unit_raw, "/d|/day| per day") |
str_detect(unit_raw, "g/kg metabolic weight")
) %>%
mutate(
mass_unit = case_when(
str_starts(unit_raw, "mg") ~ "mg",
str_starts(unit_raw, "g") ~ "g",
str_starts(unit_raw, "kg") ~ "kg",
TRUE ~ NA_character_
),
gain_kg_day = case_when(
mass_unit == "mg" ~ value / 1e6,
mass_unit == "g" ~ value / 1e3,
mass_unit == "kg" ~ value,
TRUE ~ NA_real_
)
) %>%
# for g/kg metabolic weight, convert using BW^0.75
left_join(
weights_unique %>%
transmute(B.Code, A.Level.Name, BW_start_kg = as.numeric(Out.WG.Start)),
by = c("B.Code","A.Level.Name")
) %>%
mutate(
WG_kg_day_std = case_when(
str_detect(unit_raw, "g/kg metabolic weight") &
!is.na(value) & !is.na(BW_start_kg) ~ (value/1000) * (BW_start_kg^0.75),
TRUE ~ gain_kg_day
),
# normalised diet-arm key for safer joining later
A_norm = norm_key(A.Level.Name)
) %>%
filter(!is.na(WG_kg_day_std) & WG_kg_day_std > 0) %>%
select(B.Code, A.Level.Name, A_norm, WG_kg_day_std)
# -------- 2) CH4 table with normalised arm names ------------------------------
# GE_combined has CH4_kg_year and diet arm under D.Item
ch4_tbl <- GE_combined %>%
transmute(
B.Code,
A.Level.Name = D.Item,
A_norm = norm_key(D.Item),
CH4_kg_year = as.numeric(CH4_kg_year),
P.Product,
T.Control = ifelse(is.na(T.Control), NA_character_, T.Control)
) %>%
filter(!is.na(CH4_kg_year))
# -------- 3) Build EI for weight gain (ei_wg) ---------------------------------
ei_wg <- ch4_tbl %>%
inner_join(wg_raw, by = c("B.Code","A_norm")) %>%
mutate(
WG_kg_year = WG_kg_day_std * 365,
EI_CH4_WG = CH4_kg_year / WG_kg_year
) %>%
filter(!is.na(EI_CH4_WG), EI_CH4_WG > 0, EI_CH4_WG < 1) %>% # trim extremes
select(B.Code,
A.Level.Name = A.Level.Name.x, # keep CH4-side label
CH4_kg_year, P.Product, T.Control,
WG_kg_day_std, WG_kg_year, EI_CH4_WG)
## Warning in inner_join(., wg_raw, by = c("B.Code", "A_norm")): Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 109 of `x` matches multiple rows in `y`.
## ℹ Row 1763 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
## "many-to-many"` to silence this warning.
# (Optional) quick diagnostics: unmatched WG or CH4
wg_unmatched <- anti_join(wg_raw, ch4_tbl, by = c("B.Code","A_norm")) %>% distinct(B.Code, A.Level.Name)
ch4_unmatched <- anti_join(ch4_tbl, wg_raw, by = c("B.Code","A_norm")) %>% distinct(B.Code, A.Level.Name)
# View(wg_unmatched); View(ch4_unmatched)
# -------- 4) Join EI_WG to practice_tbl comparisons --------------------------
# Expect practice_tbl with columns: B.Code, Control, Trt, Practice, Magnitude_pct (from your earlier code)
stopifnot(exists("practice_tbl"))
ei_core <- ei_wg %>%
select(B.Code, A.Level.Name, P.Product, T.Control,
WG_kg_day_std, WG_kg_year, EI_CH4_WG, CH4_kg_year)
wg_pairs <- practice_tbl %>%
transmute(B.Code, Control, Trt, Practice, Magnitude_pct) %>%
# attach A (control)
left_join(ei_core,
by = c("B.Code","Control" = "A.Level.Name"),
multiple = "first") %>%
rename(
P.Product_A = P.Product,
T.Control_A = T.Control,
WG_kg_day_A = WG_kg_day_std,
WG_kg_year_A = WG_kg_year,
EI_WG_A = EI_CH4_WG,
CH4_kg_year_A = CH4_kg_year
) %>%
# attach B (treatment)
left_join(ei_core,
by = c("B.Code","Trt" = "A.Level.Name"),
multiple = "first") %>%
rename(
P.Product_B = P.Product,
T.Control_B = T.Control,
WG_kg_day_B = WG_kg_day_std,
WG_kg_year_B = WG_kg_year,
EI_WG_B = EI_CH4_WG,
CH4_kg_year_B = CH4_kg_year
) %>%
mutate(
Species = coalesce(P.Product_B, P.Product_A),
delta_EI_WG = EI_WG_B - EI_WG_A, # (+) = worse intensity
delta_WG_day = WG_kg_day_B - WG_kg_day_A, # change in ADG
delta_CH4_year = CH4_kg_year_B - CH4_kg_year_A
) %>%
filter(!is.na(EI_WG_A), !is.na(EI_WG_B))
# -------- 5) A couple of ready-to-run summaries/plots ------------------------
# A) Box/violin: EI shift by practice type (all species together)
ggplot(wg_pairs,
aes(x = Practice, y = delta_EI_WG, fill = Practice)) +
geom_violin(trim = FALSE, alpha = .35, colour = NA) +
geom_boxplot(width = .15, outlier.shape = NA) +
geom_hline(yintercept = 0, linetype = 2, colour = "grey40") +
stat_summary(fun = median, geom = "point", size = 2, colour = "black") +
scale_y_continuous("Δ EI (kg CH₄ per kg LW-gain)",
labels = comma_format()) +
labs(x = NULL, title = "Change in CH₄ intensity of growth by practice") +
theme_bw() + theme(legend.position = "none")
# B) Scatter: magnitude of reformulation vs intensity shift (faceted by species)
ggplot(wg_pairs,
aes(x = Magnitude_pct, y = delta_EI_WG, colour = Practice)) +
geom_hline(yintercept = 0, linetype = 2, colour = "grey60") +
geom_point(alpha = .7, size = 2) +
geom_smooth(method = "lm", se = TRUE, colour = "black") +
scale_x_continuous("% of diet DM changed") +
scale_y_continuous("Δ EI (kg CH₄ per kg LW-gain)",
labels = comma_format()) +
facet_wrap(~ Species) +
theme_bw() + theme(legend.position = "bottom")
## `geom_smooth()` using formula = 'y ~ x'
# C) Table: how many comparisons per species × practice
wg_pairs %>%
count(Species, Practice, name = "n_pairs") %>%
arrange(desc(n_pairs))
## # A tibble: 8 × 3
## Species Practice n_pairs
## <chr> <chr> <int>
## 1 Sheep Addition 22
## 2 Goat Addition 13
## 3 Goat Substitution 7
## 4 Cattle Addition 5
## 5 Sheep No change 5
## 6 Sheep Substitution 4
## 7 Cattle Substitution 2
## 8 Goat No change 1
suppressPackageStartupMessages({
library(dplyr); library(tidyr); library(stringr)
library(ggplot2); library(forcats); library(scales)
})
# ── config ────────────────────────────────────────────────────────────────────
MIN_N <- 3 # hide groups with <3 comparisons to keep plots clean
FACET_BY_SPECIES <- FALSE # set TRUE if you want species facets
# ── 0) Bring types onto the EI deltas (wg_pairs) ─────────────────────────────
stopifnot(exists("wg_pairs"), exists("practice_tbl"))
pairs_with_types <- wg_pairs %>%
left_join(
practice_tbl %>%
select(B.Code, Control, Trt, Practice,
Types_added, Types_decreased, Substitution_pairs),
by = c("B.Code","Control","Trt","Practice")
)
# ── 1) ADDITIONS: explode Types_added → one row per category added ───────────
added_long <- pairs_with_types %>%
filter(Practice == "Addition") %>%
mutate(Types_added = na_if(Types_added, "")) %>%
separate_rows(Types_added, sep = "\\s*,\\s*") %>%
mutate(Category_added = str_trim(Types_added)) %>%
filter(!is.na(Category_added),
Category_added != "", Category_added != "Unclassified")
# keep only categories with enough points
add_keep <- added_long %>%
count(Category_added, name = "n") %>%
filter(n >= MIN_N)
added_long <- added_long %>%
semi_join(add_keep, by = "Category_added") %>%
mutate(
x_lab = fct_reorder(
paste0(Category_added, " (n=", add_keep$n[match(Category_added, add_keep$Category_added)], ")"),
delta_EI_WG,
.fun = median, .desc = TRUE, na.rm = TRUE
)
)
# ── Plot A: ΔEI by CATEGORY ADDED ────────────────────────────────────────────
p_add <- ggplot(
added_long,
aes(x = delta_EI_WG, y = x_lab)
) +
geom_vline(xintercept = 0, linetype = 2, colour = "grey65") +
geom_boxplot(outlier.shape = NA, fill = "grey85") +
geom_point(alpha = .55, size = 1.8) +
scale_x_continuous("Δ EI (kg CH₄ per kg live-weight gain)", labels = comma) +
labs(y = NULL, title = "Change in emission intensity by category ADDED") +
theme_bw() +
theme(panel.grid.minor = element_blank())
if (FACET_BY_SPECIES) {
p_add <- p_add + facet_wrap(~ Species, ncol = 1, scales = "free_y")
}
p_add
# ── 2) SUBSTITUTIONS: explode "Removed → Added" pairs ────────────────────────
subs_long <- pairs_with_types %>%
filter(Practice == "Substitution") %>%
mutate(Substitution_pairs = coalesce(Substitution_pairs, "")) %>%
separate_rows(Substitution_pairs, sep = ";\\s*") %>%
mutate(Substitution_pairs = str_trim(Substitution_pairs)) %>%
filter(Substitution_pairs != "") %>%
separate(Substitution_pairs, into = c("Removed", "Added"),
sep = "\\s*→\\s*", remove = FALSE) %>%
filter(!Removed %in% c("", "Unclassified"),
!Added %in% c("", "Unclassified"))
# keep only pairs with enough points
subs_keep <- subs_long %>%
count(Removed, Added, name = "n") %>%
filter(n >= MIN_N)
subs_long <- subs_long %>%
inner_join(subs_keep, by = c("Removed","Added")) %>%
mutate(
pair_lab = paste0(Removed, " → ", Added, " (n=", n, ")")
) %>%
mutate(
pair_lab = forcats::fct_reorder(
pair_lab, delta_EI_WG,
.fun = ~ median(.x, na.rm = TRUE),
.desc = TRUE
)
)
# ── Plot B: ΔEI by SUBSTITUTION PAIR ─────────────────────────────────────────
p_sub <- ggplot(
subs_long,
aes(x = delta_EI_WG, y = pair_lab)
) +
geom_vline(xintercept = 0, linetype = 2, colour = "grey65") +
geom_boxplot(outlier.shape = NA, fill = "grey85") +
geom_point(aes(size = Magnitude_pct), alpha = .6) + # size encodes % DM swapped
scale_size(range = c(1.5, 6), name = "% diet DM moved") +
scale_x_continuous("Δ EI (kg CH₄ per kg live-weight gain)", labels = comma) +
labs(y = NULL, title = "Change in emission intensity by SUBSTITUTION (Removed → Added)") +
theme_bw() +
theme(panel.grid.minor = element_blank(),
legend.position = "right")
if (FACET_BY_SPECIES) {
p_sub <- p_sub + facet_wrap(~ Species, ncol = 1, scales = "free_y")
}
p_sub
# ── 3) (Optional) quick tables you may want in the text/report --------------
add_summ <- added_long %>%
group_by(Category_added) %>%
summarise(n = n(),
median_delta = median(delta_EI_WG, na.rm = TRUE),
iqr_delta = IQR(delta_EI_WG, na.rm = TRUE),
.groups = "drop") %>%
arrange(median_delta)
sub_summ <- subs_long %>%
group_by(Removed, Added) %>%
summarise(n = n(),
median_delta = median(delta_EI_WG, na.rm = TRUE),
iqr_delta = IQR(delta_EI_WG, na.rm = TRUE),
.groups = "drop") %>%
arrange(median_delta)
add_summ
## # A tibble: 4 × 4
## Category_added n median_delta iqr_delta
## <chr> <int> <dbl> <dbl>
## 1 VI 5 -0.108 0.306
## 2 II 5 -0.000818 0.0539
## 3 IV 10 0.00454 0.0441
## 4 V 4 0.0131 0.0149
sub_summ
## # A tibble: 0 × 5
## # ℹ 5 variables: Removed <chr>, Added <chr>, n <int>, median_delta <dbl>,
## # iqr_delta <dbl>