Automated Extraction from PDFs (R+GPT‑4o‑mini)

Overview

This guide presents a compact, end‑to‑end workflow that automatically turns stacks of livestock‑research PDFs into clean, structured data on the animals’ physiological or production stage. It harvests files from designated folders, applies OCR when needed, slices out the Abstract and Methods sections, and then asks GPT‑4o‑mini to extract the relevant herd‑stage descriptors. The approach fuses conventional PDF parsing with a modern language model, so what once took hours of manual effort can be reproduced in minutes and for a few cents.

Extract the text from PDFs

Before we involve GPT, we need clean, well-structured text. This chunk automates that prep work: it scans every livestock PDF in the chosen folders, fixes tricky layouts (like two-column pages), runs OCR when the pages are just images, and neatly slices out only the Abstract and Methods parts we care about. It then stores those excerpts—one row per paper—in a master table and marks any suspicious files for manual review. In short, it turns a messy pile of PDFs into a tidy dataset that’s ready for accurate, low-cost GPT extraction.

########## 0  Libraries & quiet Poppler #######################################
library(pdftools)  ; options(pdftools.quiet = TRUE)
library(stringr)   ; library(dplyr)   ; library(tibble)
library(future.apply);
library(tidyr)     ; library(purrr)   ; library(DT)
library(glue)      ; library(jsonlite); library(ellmer)

########## 1  PDF list ########################################################
open_dir  <- "C:/Users/mlolita/OneDrive - CGIAR/ERA/Data Entry/pdfs/Livestock/Open access"
close_dir <- "C:/Users/mlolita/OneDrive - CGIAR/ERA/Data Entry/pdfs/Livestock/Closed access"

all_pdfs <- c(
  list.files(open_dir,  "\\.pdf$", full.names = TRUE, recursive = TRUE),
  list.files(close_dir, "\\.pdf$", full.names = TRUE, recursive = TRUE)
)
#set.seed(1); all_pdfs <- sample(all_pdfs, 200)   # demo limit

########## 2  Regex dictionaries ##############################################
abs_s <- "(?im)^\\s*(?:Abstract|Summary|Résumé|Resumen|A\\s+B\\s+S\\s+T\\s+R\\s+A\\s+C\\s+T)\\s*[:\\.]?$"
abs_e <- "(?i)\\b(?:\\d{0,2}[\\.:\\)]?\\s*)?(Introduction|Background)\\b"
met_s <- "(?i)(?:\\d+(?:\\.\\d+)*\\s*)?(?:Materials?\\s*(?:and|&)\\s*Methods?|Methods?\\s*(?:and|&)\\s*Materials?|Methodology\\b|(?-i:METHODS)\\b)"
met_e <- "\\b(Results?|Discussion|Conclusion)\\b"
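A couple of quick, purely illustrative spot-checks (not part of the pipeline) help confirm the dictionaries behave as intended:

# Illustrative spot-checks of the heading patterns defined above
str_detect("ABSTRACT",                 abs_s)   # TRUE: heading on its own line, any case
str_detect("2. Materials and Methods", met_s)   # TRUE: numbered Methods heading
str_detect("Results and discussion",   met_e)   # TRUE: marks where the Methods section ends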

########## 3  Helpers for two-column handling #################################
has_pages <- "pages" %in% names(formals(pdf_text))

# 3a ── identify 2-column layout on a single page -----------------------------
is_two_col <- function(pg, gap = 50) {
  first_x <- pg %>%                      # pg = pdf_data(path)[[page]]
    group_by(y) %>% summarise(x = min(x), .groups = "drop") %>% pull(x)
  if (length(first_x) < 10)              return(FALSE)
  if (length(unique(first_x)) < 2)       return(FALSE)   # all lines start same x
  km <- stats::kmeans(first_x, centers = 2, iter.max = 10)
  abs(diff(sort(km$centers))) > gap                      # TRUE → two columns
}

# 3b ── rebuild one page left→right ------------------------------------------
reflow_page <- function(path, page_no) {
  pg <- pdf_data(path)[[page_no]]
  pg <- pg %>%
    mutate(col_id = ifelse(x < median(x), 1, 2)) %>%
    arrange(col_id, y, x)
  paste(pg$text, collapse = " ")
}

########## 4  Fast first-pages reader (2-col safe + OCR fallback) #############
grab_pages <- function(path, n = 6) {

  n_tot <- tryCatch(pdf_info(path)$pages, error = function(e) 0L)
  if (n_tot == 0L) return("")                               # unreadable PDF

  out <- character(min(n, n_tot))

  for (p in seq_along(out)) {

    raw <- if (has_pages) pdf_text(path, pages = p) else pdf_text(path)[p]
    if (is.null(raw) || nchar(raw) == 0) next               # empty page

    pg  <- tryCatch(pdf_data(path)[[p]], error = function(e) NULL)
    out[p] <- if (!is.null(pg) && is_two_col(pg))
                reflow_page(path, p) else raw
  }

  joined <- paste(out, collapse = "\n")
  if (nchar(joined) > 400) return(joined)                   # good enough

  ocr <- if (has_pages)
           pdf_ocr_text(path, pages = seq_len(min(n, n_tot)),
                        dpi = 200, language = "eng")
         else
           pdf_ocr_text(path, dpi = 200, language = "eng")[seq_len(min(n, n_tot))]
  paste(ocr, collapse = "\n")
}
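Before launching the full batch, it can be worth eyeballing what the reader returns for a single paper; this optional check assumes all_pdfs is non-empty:

# Optional sanity check: print the first ~800 characters of one extracted paper
cat(substr(grab_pages(all_pdfs[1], n = 6), 1, 800))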

########## 5  Flexible slicers ###############################################
slice_generic <- function(txt, start_pat, end_pat) {
  st <- str_locate(txt, start_pat)[1, 1]
  if (is.na(st)) return(NA_character_)
  stops <- str_locate_all(txt, end_pat)[[1]]
  stops <- stops[stops[,1] > st, , drop = FALSE]
  ed <- if (nrow(stops)) stops[1,1] - 1 else nchar(txt)
  str_trim(str_sub(txt, st, ed))
}

slice_abstract <- function(txt) {
  st <- str_locate(txt, abs_s)[1, 1]
  if (is.na(st)) st <- 1
  stops <- str_locate_all(txt, abs_e)[[1]]
  stops <- stops[stops[,1] > st, , drop = FALSE]
  if (!nrow(stops)) return(NA_character_)
  ed <- stops[1,1] - 1
  str_trim(str_sub(txt, st, ed))
}
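As a tiny self-contained illustration (a made-up snippet, using the regex dictionaries above), the two slicers should isolate the expected blocks:

# Toy example only: check that the slicers isolate the expected blocks
toy <- paste("Abstract", "Fifty ewes were studied.", "Introduction", "Background text.",
             "Materials and Methods", "Ewes grazed improved pasture.", "Results",
             sep = "\n")
slice_abstract(toy)                 # everything between the Abstract and Introduction headings
slice_generic(toy, met_s, met_e)    # the Methods block, ending just before "Results"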

########## 6  Per-file worker #################################################
worker <- function(path) {
  raw <- grab_pages(path, 6)
  tibble(
    File     = basename(path),
    Abstract = slice_abstract(raw),
    Methods  = slice_generic(raw, met_s, met_e),
    Raw      = list(raw)
  )
}

plan(multisession)

########## 7  Batch run & flag (parallel + RNG-safe) ##########################
articles_tbl <- future_lapply(
    all_pdfs,
    worker,
    future.seed = TRUE
  ) %>%
  bind_rows() %>%
  mutate(
    Needs_Review = if_else(
      is.na(Methods) | str_length(Methods) < 500,
      "⚠️ Yes",
      "No"
    )
  )



########## 8  Raw-text inspector #############################################
inspect_pdf <- function(file_name, pages = 6) {
  path <- all_pdfs[basename(all_pdfs) == file_name]
  if (!length(path)) stop("File not in current sample.")
  cached <- articles_tbl$Raw[articles_tbl$File == file_name][[1]]
  if (pages <= 3) return(cat(cached))
  cat(grab_pages(path, pages))
}

articles_tbl <- articles_tbl %>%
  mutate(Code = str_extract(File, "^[A-Z]+\\d+"))
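The DT package loaded at the top is handy for a quick interactive look at which papers were flagged; this preview is optional and not part of the pipeline:

# Optional: browse the papers flagged for manual review
articles_tbl %>%
  filter(Needs_Review == "⚠️ Yes") %>%
  select(Code, File, Needs_Review) %>%
  DT::datatable(options = list(pageLength = 10))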

Filter for only sheep, cattle and goat papers

# Set up connection to S3 bucket
s3 <- s3fs::S3FileSystem$new(anonymous = TRUE)
era_s3 <- "s3://digital-atlas/era"

# List the files in the s3 bucket
files <- s3$dir_ls(file.path(era_s3,"data"))

# Identify the latest skinny_cow_2022-YYYY-MM-DD.RData file
files <- tail(grep(".RData", grep("skinny_cow_2022", files, value=TRUE), value=TRUE), 1)

# Download to local if not already present
save_path <- file.path(getwd(), basename(files))
if(!file.exists(save_path)){
  s3$file_download(files, save_path, overwrite = TRUE)
}

# Load the data
livestock_metadata <- miceadds::load.Rdata2(file=basename(save_path), path=dirname(save_path))

subset_animals <- livestock_metadata$`Prod.Out` %>%
  filter(P.Product %in% c("Cattle", "Sheep", "Goat"))

codes <- subset_animals$B.Code
papers_to_extract <- articles_tbl %>%
  filter(Code %in% codes)

GPT settings

chat <- chat_openai(model = "gpt-4o-mini")
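One assumption worth making explicit: chat_openai() needs an OpenAI API key, which ellmer reads from the OPENAI_API_KEY environment variable, so it must be set (in the session or in .Renviron) before any calls are made:

# The key is read from the OPENAI_API_KEY environment variable, e.g.:
# Sys.setenv(OPENAI_API_KEY = "sk-...")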

Training data

In the courageous_camel_2024 extraction, herd information was already captured manually. We can use this dataset as a training dataset for our method.

s3 <- s3fs::S3FileSystem$new(anonymous = TRUE)
era_s3 <- "s3://digital-atlas/era"

# List the files in the s3 bucket
files <- s3$dir_ls(file.path(era_s3,"data"))
# Identify the latest courageous_camel_2024-YYYY-MM-DD.RData file
files <- tail(grep(".RData", grep("courageous_camel_2024", files, value=TRUE), value=TRUE), 1)

# Download to local if not already present
save_path <- file.path(getwd(), basename(files))
if(!file.exists(save_path)){
  s3$file_download(files, save_path, overwrite = TRUE)
}


# Load the data
camel <- miceadds::load.Rdata2(file=basename(save_path), path=dirname(save_path))

subset_animals <- camel$Herd.Out %>%
  filter(V.Product %in% c("Cattle", "Sheep", "Goat"))

codes_training <- subset_animals$B.Code

herd_training <- camel$Herd.Out %>%
  select(c("B.Code", "Herd.Sex", "Herd.Stage", "Herd.Parity")) %>%
  unique()

papers_to_train <- articles_tbl %>%
  filter(Code %in% codes_training)

GPT extraction

# ── prompt builder -----------------------------------------------------------
make_herd_prompt <- function(section_name, txt) {
  glue::glue("
You are reading the {section_name} of a livestock research paper.

Text:
\"\"\"
{txt}
\"\"\"

Identify the physiological or production stage and sex of the animals used in the study (e.g., Adult Male, Gestating Female, Lactating Female, Young Male, Calf Nursing Female, Yearling Female, etc.). Extract the stage exactly as it appears in the text and deduce the sex.
Return only one stage if the study focuses on a single category.
If the study clearly reports measures for more than one distinct stage, list them comma-separated.
Return each stage once only. If no stage is found, return NA.
Be precise and concise. Respond with the correct stage(s) and sex only — no explanation.
")
}
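To see exactly what the model receives, the prompt can be previewed on a fabricated two-sentence snippet (illustrative only):

# Preview the prompt built from a made-up Methods snippet
cat(make_herd_prompt(
  "Methods",
  "Twenty-four lactating Friesian cows were blocked by parity. Cows grazed Napier grass."
))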

# ── single-call extractor ----------------------------------------------------
extract_herd_stage <- function(section_name, txt) {
  if (is.na(txt) || !nzchar(txt)) return(NA_character_)
  prompt  <- make_herd_prompt(section_name, txt)
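  # a fresh chat object per call keeps each request independent (no shared history)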
  answer  <- chat_openai(model = "gpt-4o-mini")$chat(prompt)
  
  stage <- trimws(answer)
  if (!nzchar(stage)) return(NA_character_)
  stage
}

# ── process the “no-review” training papers row-by-row ------------------------
herd_tbl <- papers_to_train %>%
  filter(Needs_Review == "No") %>%
  rowwise() %>%
  mutate(
    Herd_Abstract = extract_herd_stage("Abstract", Abstract),
    Herd_Methods  = extract_herd_stage("Methods",  Methods)
  ) %>%
  ungroup() %>%
  select(File, Abstract, Herd_Abstract, Methods, Herd_Methods)
# Ensure codes align in both tables
herd_tbl_with_code <- papers_to_train %>%
  filter(Needs_Review == "No") %>%
  select(Code) %>%
  bind_cols(herd_tbl)

# Join GPT results with gold-standard training data
comparison_tbl <- herd_tbl_with_code %>%
  left_join(herd_training, by = c("Code" = "B.Code"),
            relationship = "many-to-many")   # a paper can have several herd records
subset <- comparison_tbl %>%
  select(c("Code", "Herd_Abstract", "Herd_Methods", "Herd.Sex", "Herd.Stage", "Herd.Parity"))


#write_csv(subset, "herd_extraction_data2.csv")          # comma-separated

Verification of training data

# 0 ── simple one-to-one synonym dictionary  ----------------------------------
syn_map <- c(
  lamb  = "young",
  lambs = "young",
  kid   = "young",
  kids  = "young",
  growing = "young"
  # add more if needed, e.g.  "calf" = "young"
)

# 1 ── Tokeniser: returns full phrases *and* single words, with synonym swap ---
tidy_tokens <- function(x) {
  if (is.na(x) || !nzchar(trimws(x))) return(character(0))

  # A) split the *lists* of categories
  phrases <- str_split(
    x,
    pattern = ",|;|/|\\band\\b|\\b&\\b",
    simplify = TRUE
  ) |>
    str_trim() |>
    str_to_lower() |>
    str_replace_all("[^a-z0-9 ]+", "") |>
    discard(~ .x == "")

  # B) swap synonyms inside phrases
  for (i in seq_along(syn_map)) {
    pat <- paste0("\\b", names(syn_map)[i], "\\b")
    phrases <- str_replace_all(phrases, pat, syn_map[i])
  }

  # C) split phrases into single words
  words <- str_split(phrases, "\\s+", simplify = FALSE) |>
    unlist()

  # crude singulariser
  words <- sub("s$", "", words)

  # D) synonym swap on single words, too
  words <- recode(words, !!!syn_map, .default = words)

  unique(c(phrases, words))
}
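A couple of illustrative calls (assuming the function above) show how a GPT answer becomes phrase tokens plus single-word tokens:

# Examples of the tokeniser output
tidy_tokens("Lactating Female")   # c("lactating female", "lactating", "female")
tidy_tokens("Adult males")        # c("adult males", "adult", "male")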

# 2 ── Per-row evaluation -----------------------------------------------------
eval_tbl <- subset |>                         # the comparison table built above
  mutate(
    abs_tok     = map(Herd_Abstract, tidy_tokens),
    met_tok     = map(Herd_Methods,  tidy_tokens),
    sex_truth   = map(Herd.Sex,      tidy_tokens),
    stage_truth = map(Herd.Stage,    tidy_tokens),

    sex_abs_match   = map2_lgl(abs_tok,   sex_truth,
                               \(p, t) if (length(t)) any(t %in% p) else NA),
    sex_met_match   = map2_lgl(met_tok,   sex_truth,
                               \(p, t) if (length(t)) any(t %in% p) else NA),
    stage_abs_match = map2_lgl(abs_tok,   stage_truth,
                               \(p, t) if (length(t)) any(t %in% p) else NA),
    stage_met_match = map2_lgl(met_tok,   stage_truth,
                               \(p, t) if (length(t)) any(t %in% p) else NA)
  )

# 3 ── Quick accuracy table ---------------------------------------------------
accuracy_overview <- eval_tbl |>
  summarise(
    n              = n(),
    sex_abs_acc    = mean(sex_abs_match,   na.rm = TRUE),
    sex_met_acc    = mean(sex_met_match,   na.rm = TRUE),
    stage_abs_acc  = mean(stage_abs_match, na.rm = TRUE),
    stage_met_acc  = mean(stage_met_match, na.rm = TRUE)
  )

# helper: fallback if `scales::percent()` is unavailable --------------------
to_pct <- function(x, digits = 1) sprintf(paste0("%.", digits, "f %%"), 100 * x)

accuracy_tbl <- accuracy_overview %>% 
  transmute(
    `Papers (n)`        = n,
    `Sex – Abstract`    = to_pct(sex_abs_acc),
    `Sex – Methods`     = to_pct(sex_met_acc),
    `Stage – Abstract`  = to_pct(stage_abs_acc),
    `Stage – Methods`   = to_pct(stage_met_acc)
  )

knitr::kable(
  accuracy_tbl,
  format   = "markdown",
  align    = c("r","c","c","c","c"),
  caption  = "Table 1. Token-match accuracy of GPT extraction (higher = better)."
)
Table 1. Token-match accuracy of GPT extraction (higher = better).

| Papers (n) | Sex – Abstract | Sex – Methods | Stage – Abstract | Stage – Methods |
|-----------:|:--------------:|:-------------:|:----------------:|:---------------:|
|        178 |         74.4 % |        84.4 % |           82.3 % |          89.8 % |