Search code examples
rextract

Reading multiple excels into R where required data isn't always in the same cells


I need to import data from hundreds of excels but the data I want to read is in different cells in each excel as the number of hospitals with COVID-19 positive patients changes each day. Below is a snipping of the excel. What I want to pull out are the total numbers for 'Total Inpatients' and 'Inpatients in ICU Wards'. You can see that there is more to the excel beneath these numbers and I don't want to include this information. enter image description here

Below is the data for the excel snipping above to help with reproducing.

    structure(list(...1 = c("COVID-19 Current Inpatients", NA, "Date = 14/06/2024 07:30", 
NA, NA, NA, NA, "Hospital", "XXX Hospital", "XXX Hospital", "XXX Hospital", 
"XXX Hospital", "XXX Hospital", "XXX Hospital", "XXX Hospital", 
"XXX Hospital", "XXX Hospital", "XXX Hospital", "XXX Hospital", 
"XXX Hospital", "XXX Hospital", "XXX Hospital", "XXX Hospital", 
"XXX Hospital", "XXX Hospital", NA, NA, "Facility MRN", NA, "XXX Hospital", 
"N0017216", "N0322410", "N0355277", "N2954256", NA, "XXX Hospital", 
"N0017216", "N0322410", "N0355277", "N2954256", "N2954256"), 
    ...2 = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
    NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
    "Name", NA, NA, "Surname, Firstname", "Surname, Firstname", 
    "Surname, Firstname", "Surname, Firstname", NA, NA, "Surname, Firstname", 
    "Surname, Firstname", "Surname, Firstname", "Surname, Firstname", 
    "Surname, Firstname"), ...3 = c(NA, NA, NA, NA, NA, NA, NA, 
    "Total Inpatients", "4", "5", "14", "5", "2", "2", "1", "26", 
    "1", "21", "1", "1", "4", "2", "11", "1", "1", "102", NA, 
    "Sex", NA, NA, "M", "F", "M", "M", NA, NA, "M", "F", "M", 
    "M", "M"), ...4 = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
    NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
    NA, NA, "Age", NA, NA, "83", "89", "82", "86", NA, NA, "83", 
    "89", "82", "86", "86"), ...5 = c(NA, NA, NA, NA, NA, NA, 
    NA, "Inpatients in\r\nICU Wards", "0", "0", "0", "0", "0", 
    "0", "0", "3", "0", "0", "0", "0", "0", "0", "2", "0", "0", 
    "5", NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
    NA, NA), ...6 = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
    NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
    NA, NA, "Aboriginal Status", NA, NA, "Neither", "Neither", 
    "Neither", "Neither", NA, NA, "Neither", "Neither", "Neither", 
    "Neither", "Neither"), ...7 = c(NA, NA, NA, NA, NA, NA, NA, 
    NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
    NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
    NA, NA, NA, NA), ...8 = c(NA, NA, NA, NA, NA, NA, NA, NA, 
    NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
    NA, NA, NA, NA, "Admitted", NA, NA, "45452.375", "45452.711805555598", 
    "45454.607638888898", "45456.754166666702", NA, NA, "45452.375", 
    "45452.711805555598", "45454.607638888898", "45456.754166666702", 
    "45456.754166666702"), ...9 = c(NA, NA, NA, NA, NA, NA, NA, 
    NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
    NA, NA, NA, NA, NA, "Bed Type", NA, NA, "Medical", "Medical", 
    "Medical", "Medical", NA, NA, "Medical", "Medical", "Medical", 
    "Medical", "Medical"), ...10 = c(NA, NA, NA, NA, NA, NA, 
    NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
    NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
    NA, NA, NA, NA, NA), ...11 = c(NA, NA, NA, NA, NA, NA, NA, 
    NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
    NA, NA, NA, NA, NA, "Ward", NA, NA, "Ward name", "Ward name", 
    "Ward name", "Ward name", NA, NA, "Ward name", "Ward name", 
    "Ward name", "Ward name", "Ward name"), ...12 = c(NA, NA, 
    NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
    NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, "Alert", NA, NA, 
    "COVID-19-Confirmed", "COVID-19-Confirmed", "COVID-19-Confirmed", 
    "COVID-19-Confirmed", NA, NA, "COVID-19-Confirmed", "COVID-19-Confirmed", 
    "COVID-19-Confirmed", "COVID-19-Confirmed", "COVID-19-Confirmed"
    ), ...13 = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
    NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
    NA, NA, NA, NA, "* Alert added on 09/06/2024", "* Alert added on 09/06/2024", 
    "* Alert added on 11/06/2024", "* Alert added on 13/06/2024", 
    NA, NA, "* Alert added on 09/06/2024", "* Alert added on 09/06/2024", 
    "* Alert added on 11/06/2024", "* Alert added on 13/06/2024", 
    "* Alert added on 13/06/2024")), class = c("tbl_df", "tbl", 
"data.frame"), row.names = c(NA, -41L))

I will admit that I have been using ChatGPT to help with this as I am somewhere between novice and intermediate skill levels with R. The code I am using at the moment is below

library(readxl)
library(dplyr)
library(purrr)
library(stringr)
library(openxlsx)

# Define the directory path and file pattern
directory <- "My path"
file_pattern <- 'My file pattern'

# List files matching the pattern in the directory
files <- list.files(path = directory, pattern = file_pattern, full.names = TRUE, recursive = TRUE)

# Function to extract totals from an Excel file
extract_totals <- function(file_path) {
  # Read the Excel file
  df <- readxl::read_excel(file_path, col_names = FALSE)
  
  # Extract date from cell A3
  date_string <- df[3, 1]
  date_value <- str_extract(date_string, "\\d{2}/\\d{2}/\\d{4}") # Extract date in format dd/mm/yyyy
  
  # Identify the last row with numeric data in the 'Total Inpatients' column C and 'Inpatients in ICU Wards' column E
  total_row_index <- which.max(rowSums(sapply(df[, c(3,5)], as.numeric), na.rm = TRUE) > 0)
  
  # Extract and sum the totals if a valid index is found
  if (is.finite(total_row_index) && total_row_index > 0) {
    total_inpatients <- sum(as.numeric(df[total_row_index, 3]), na.rm = TRUE)
    inpatients_icu <- sum(as.numeric(df[total_row_index, 5]), na.rm = TRUE)
  } else {
    total_inpatients <- NA
    inpatients_icu <- NA
  }
  
  return(data.frame(Date = date_value, Total_Inpatients = total_inpatients, Inpatients_ICU_Wards = inpatients_icu))
}

# Extract totals from each file and combine the results
results <- map_dfr(files, extract_totals) |> 
  arrange(by_group = Date)

This seems to be reading in the top numerical data only as for the excel sample I have given, the resulting data from the code I have used is 4 for 'Total Inpatients' and 0 for 'Inpatients in ICU Wards' instead of 102 and 5 respectively.


Solution

  • The following function was tested on a mock Excel file, it might work on real data files.

    Arguments upper_left and lower_right are the arguments you have to worry about. Apparently, all data sets start at row 8, column 1, cell c(8L, 1L) (or "A8").

    But there might be different lower_right columns. The default is unknown row (row set to NA), column 3. Change the defaults if needed.

    The function returns 3 columns, the hospital column and the two numeric columns that follow.

    library(readxl)
    library(cellranger) # needed for cell_limits
    
    read_custom <- function(file, upper_left = c(8L, 1L), lower_right = c(NA, 3L)) {
      df1 <- read_excel(file, range = cell_limits(upper_left, lower_right))
      # It's enough to check the 1st col for NA's.
      # So get the 1st NA, that's where the data we
      # want stops, keep what comes before it
      i <- min(which(is.na(df1[[1L]])))
      if(i > 1L) {
        df1[1:(i - 1L), ]
      } else df1
    }
    
    read_custom(fl)