## table_extract
## extract table contents along with headers
## this allows sample names to be retrieved too
## uses heading underline separators to determine column widths
library(pdftools)
library(stringr)
library(dplyr)
folder <- 'test_01'

## identify files
f_names <- list.files(path = folder, pattern = '\\.pdf$', full.names = TRUE)

## loop over all files
l.df <- lapply(f_names, function(f) {

  txt <- pdf_text(f)

  ## split text at carriage return
  txt1 <- strsplit(txt, '\r\n')

  ## loop over lines and retrieve table contents
  l <- list()
  l_all <- list()
  read_table <- FALSE
  for (i in 1:length(txt1)) { # loop through pages
    for (j in 1:length(txt1[[i]])) { # loop through lines
      line <- txt1[[i]][j]
      if (str_detect(line, 'Totals :')) { # end of record
        read_table <- FALSE
        df.data <- as.data.frame(do.call(rbind, l_data), stringsAsFactors = FALSE)
        df.data$date <- l[['date']]
        df.data$data_file <- l[['data_file']]
        df.data$sample_name <- l[['sample_name']]
        df.data$pdf_file <- basename(f)
        df.data$page <- i
        l[['data']] <- df.data
        l_all[[length(l_all) + 1]] <- l
        l <- list() ## reset list
      } else if (str_detect(line, 'Injection Date')) { # found inj date
        inj_date <- as.POSIXct(str_extract(line, '\\d+/\\d+/\\d+ \\d+:\\d+:\\d+ [AP]M'),
                               format = '%m/%d/%Y %I:%M:%S %p')
        l$date <- inj_date
        l_data <- list()
      } else if (str_detect(line, 'Peak RetTime')) { # found table
        table_headers <- line ## save table headers for later use
        read_table <- TRUE
      } else if (str_detect(line, 'Data File')) {
        l[['data_file']] <- basename(txt1[[2]][1]) ## file path sits on the first line of page 2
      } else if (str_detect(line, 'Sample Name:')) {
        l[['sample_name']] <- str_replace(line, 'Sample Name: ', '')
      }
      if (read_table) {
        if (str_detect(line, '\\[min\\]')) { # found second header line
          table_headers_2 <- line ## save table headers for later use
        }
        if (!str_detect(line, '\\[min\\]') & !str_detect(line, 'Peak RetTime')) {
          if (str_detect(line, '--\\|--')) { ## found separator line
            locate_separators <- str_locate_all(line, '\\|')
            locate_separators <- c(1, locate_separators[[1]][, 1], nchar(line))
            txt_headings <- sapply(seq_along(locate_separators[-1]), function(x) {
              trimws(substring(table_headers, locate_separators[x], locate_separators[x + 1]))
            })
            txt_headings_2 <- sapply(seq_along(locate_separators[-1]), function(x) {
              trimws(substring(table_headers_2, locate_separators[x], locate_separators[x + 1]))
            })
            ## combine the two header lines, e.g. 'RetTime' + '[min]'
            txt_headings <- sapply(seq_along(txt_headings), function(x) {
              if (nchar(txt_headings_2[x]) > 0) {
                sprintf('%s (%s)', txt_headings[x], txt_headings_2[x])
              } else {
                txt_headings[x]
              }
            })
          } else { ## found data
            ## add a space to the end of the line
            ## necessary for identifying end-of-data points
            line <- paste0(line, ' ')
            data <- sapply(seq_along(locate_separators[-1]), function(x) {
              ## sometimes text pull needs to start a few characters back
              ## (due to import conversion of tabs to spaces)
              start_pos <- locate_separators[x]
              end_pos <- locate_separators[x + 1]
              if (substring(line, start_pos, start_pos) != ' ') {
                found_space <- FALSE
                while (!found_space & start_pos > 2) {
                  start_pos <- start_pos - 1
                  if (substring(line, start_pos, start_pos) == ' ') found_space <- TRUE
                }
              }
              if (substring(line, end_pos, end_pos) != ' ') {
                found_space <- FALSE
                while (!found_space & end_pos > 2) {
                  end_pos <- end_pos - 1
                  if (substring(line, end_pos, end_pos) == ' ') found_space <- TRUE
                }
              }
              trimws(substring(line, start_pos, end_pos))
            })
            l_data[[length(l_data) + 1]] <- setNames(data, txt_headings)
          }
        }
      }
    }
  }

  ## join tables
  df <- do.call(rbind, lapply(l_all, function(x) x$data))

  ## convert table columns to numeric
  for (n in names(df)) {
    if (all(grepl("^(-|\\+)?((\\.?\\d+)|(\\d+\\.\\d+)|(\\d+\\.?)|(\\d*\\.*\\d+[Ee](-|\\+)\\d+))$", df[[n]]))) {
      df[[n]] <- as.numeric(df[[n]])
    }
  }
  df
})

df_all <- do.call(rbind, l.df)

write.csv(df_all, 'table_out.csv', row.names = FALSE)
Here’s an example of using R to extract tabular content and metadata from a series of PDF files and push the data into a single table. The concept is fairly straightforward: read each PDF in and convert it to text, then parse the text line by line, using regular expressions to pick out the pertinent information.
Three types of data are returned:
- peak data: peak number, retention time, type, width, area and name. In fact, every column in the table is extracted, along with the column identifiers and units.
- metadata: in this case the injection date, data file name and sample name are pulled from the PDFs.
- file data: the PDF file name and the page containing the peak table.
The pdftools library is used to read in the PDF files.
Unwanted or missing spaces
So why use column positions taken from the table header instead of simply splitting the columns on a delimiter such as a tab? Unfortunately, tab information is lost when the PDF is imported: any tabs are converted into spaces. A space character could still serve as a column delimiter if every column were always populated and the sample names contained no spaces. For these tables, however, the Type column is not always populated and the Name column can contain spaces, so there is no way to be sure, using delimiters alone, where one column ends and the next begins. A sketch of the position-based approach follows below.
A further complication is that importing the PDF can introduce unexpected formatting quirks, particularly unwanted or missing spaces. To account for this when parsing the table, the column boundaries are nudged forward or back until a space between values is found. This can be seen in the alignment of line 3 of the table below (PDF vs parsed text).