## table_extract
## extract table contents along with headers
## this allows sample names to be retrieved too
## uses heading underline separators to determine column widths
library(pdftools)
library(stringr)
library(dplyr)
folder <- 'test_01'

## identify files
f_names <- list.files(path = folder, pattern = '\\.pdf$', full.names = TRUE)

## loop over all files
l.df <- lapply(f_names, function(f) {

  txt <- pdf_text(f)

  ## split text at carriage return
  txt1 <- strsplit(txt, '\r\n')

  ## loop over lines and retrieve table contents
  l <- list()
  l_all <- list()
  read_table <- FALSE
  for (i in 1:length(txt1)) { # loop through pages
    for (j in 1:length(txt1[[i]])) { # loop through lines
      line <- txt1[[i]][j]
      if (str_detect(line, 'Totals :')) { # end of record
        read_table <- FALSE
        df.data <- as.data.frame(do.call(rbind, l_data), stringsAsFactors = FALSE)
        df.data$date <- l[['date']]
        df.data$data_file <- l[['data_file']]
        df.data$sample_name <- l[['sample_name']]
        df.data$pdf_file <- basename(f)
        df.data$page <- i
        l[['data']] <- df.data
        l_all[[length(l_all) + 1]] <- l
        l <- list() ## reset list
      } else if (str_detect(line, 'Injection Date')) { # found inj date
        inj_date <- as.POSIXct(str_extract(line, '\\d+/\\d+/\\d+ \\d+:\\d+:\\d+ [AP]M'),
                               format = '%m/%d/%Y %I:%M:%S %p')
        l$date <- inj_date
        l_data <- list()
      } else if (str_detect(line, 'Peak RetTime')) { # found table
        table_headers <- line ## save table headers for later use
        read_table <- TRUE
      } else if (str_detect(line, 'Data File')) {
        l[['data_file']] <- basename(txt1[[2]][1]) ## file path sits on the first line of page 2
      } else if (str_detect(line, 'Sample Name:')) {
        l[['sample_name']] <- str_replace(line, 'Sample Name: ', '')
      }
      if (read_table) {
        if (str_detect(line, '\\[min\\]')) { # found second header line
          table_headers_2 <- line ## save table headers for later use
        }
        if (!str_detect(line, '\\[min\\]') & !str_detect(line, 'Peak RetTime')) {
          if (str_detect(line, '--\\|--')) { ## found separator line
            locate_separators <- str_locate_all(line, '\\|')
            locate_separators <- c(1, locate_separators[[1]][, 1], nchar(line))
            txt_headings <- sapply(seq_along(locate_separators[-1]), function(x) {
              trimws(substring(table_headers, locate_separators[x], locate_separators[x + 1]))
            })
            txt_headings_2 <- sapply(seq_along(locate_separators[-1]), function(x) {
              trimws(substring(table_headers_2, locate_separators[x], locate_separators[x + 1]))
            })
            ## combine the two header lines, e.g. 'RetTime' + '[min]'
            txt_headings <- sapply(seq_along(txt_headings), function(x) {
              if (nchar(txt_headings_2[x]) > 0) {
                sprintf('%s (%s)', txt_headings[x], txt_headings_2[x])
              } else {
                txt_headings[x]
              }
            })
          } else { ## found data
            ## add a space to the end of the line
            ## necessary for identifying end-of-data points
            line <- paste0(line, ' ')
            data <- sapply(seq_along(locate_separators[-1]), function(x) {
              ## sometimes text pull needs to start a few characters back
              ## (due to import conversion of tabs to spaces)
              start_pos <- locate_separators[x]
              end_pos <- locate_separators[x + 1]
              if (substring(line, start_pos, start_pos) != ' ') {
                found_space <- FALSE
                while (!found_space & start_pos > 2) {
                  start_pos <- start_pos - 1
                  if (substring(line, start_pos, start_pos) == ' ') found_space <- TRUE
                }
              }
              if (substring(line, end_pos, end_pos) != ' ') {
                found_space <- FALSE
                while (!found_space & end_pos > 2) {
                  end_pos <- end_pos - 1
                  if (substring(line, end_pos, end_pos) == ' ') found_space <- TRUE
                }
              }
              trimws(substring(line, start_pos, end_pos))
            })
            l_data[[length(l_data) + 1]] <- setNames(data, txt_headings)
          }
        }
      }
    }
  }

  ## join tables
  df <- do.call(rbind, lapply(l_all, function(x) x$data))

  ## convert table columns to numeric
  for (n in names(df)) {
    if (all(grepl("^(-|\\+)?((\\.?\\d+)|(\\d+\\.\\d+)|(\\d+\\.?)|(\\d*\\.*\\d+[Ee](-|\\+)\\d+))$", df[[n]]))) {
      df[[n]] <- as.numeric(df[[n]])
    }
  }
  df
})

df_all <- do.call(rbind, l.df)

write.csv(df_all, 'table_out.csv', row.names = FALSE)
Here’s an example of using R to extract tabular content and metadata from a series of PDF files and push the data into a single table. The concept is fairly straightforward: read each PDF in and convert it to text, then parse the text line by line, using regular expressions to pick out the pertinent information.
Three types of data are returned:
- peak data: peak number, retention time, type, width, area and name. In fact, every column in the table is extracted, along with the column identifiers and units.
- metadata: in this case the injection date, data file name and sample name are pulled from the PDFs.
- file data: the PDF file name and the page containing the peak table.
The pdftools library is used to read in the PDF files.
Unwanted or missing spaces
So why use column positions taken from the table header instead of simply splitting the columns on a delimiter such as a tab? Unfortunately, tab information is lost when the PDF is imported: any tabs are converted into spaces. A space character could still serve as a column delimiter if every column were always populated and the sample names contained no spaces. For these tables, however, the Type column is not always populated and the Name column can contain spaces, so there is no way to be sure, using delimiters alone, where one column ends and the next begins. A sketch of the position-based approach follows below.
A further complication is that importing the PDF can introduce unexpected formatting quirks, particularly unwanted or missing spaces. To account for this when parsing the table, the column boundaries are nudged forward or back until a space between values is found. This can be seen in the alignment of line 3 of the table below (PDF vs parsed text).