Вот мой вариант решения. Пояснения я оставил в комментариях к коду. Работать с данными из PDF непросто: без некоторого количества кода не обойтись. Но как только вы найдёте закономерности в данных, их можно будет очистить.
library(tidyverse)
library(pdftools)
library(splitstackshape)
# Read the text tokens of the PDF into one data frame.
# pdf_data() returns a list with one data frame per page, so the pages are
# stacked with bind_rows(); as.data.frame() on that list only works when the
# file has a single page.
foo <- pdf_data(pdf = "Client.pdf", opw = "", upw = "") %>%
  bind_rows() %>%
  select(-c(width, height, space)) # Layout columns are not used below.

# Locate the first token that starts with "Start" (taken to be the first
# header cell of the table) and fail with a clear message if it is missing.
header_row <- grep(x = foo$text, pattern = "^Start")
if (length(header_row) == 0L) {
  stop(
    "No token starting with \"Start\" was found in Client.pdf.",
    call. = FALSE
  )
}

# Keep everything from the header onward. Selecting header:n() instead of
# dropping 1:(header - 1) avoids losing the header when it is already the
# first row, because 1:0 is c(1, 0) rather than an empty sequence.
foo <- slice(foo, header_row[1]:n())
# Find where each table row begins.
# Step 1: collect the positions of all tokens containing "/" (the dates).
# Step 2: keep only the positions that are immediately followed by another
# "/" token, i.e. the first date of each adjacent pair of dates.
ind <- grep(pattern = "/", x = foo$text)
dummy <- c(diff(ind), NA) # Gap to the next "/" token; NA for the last one.
ind <- ind[which(dummy == 1)] # which() skips the trailing NA.
# Build one record per table row.
# findInterval() gives id 0 to the tokens before the first index in `ind`
# (the header tokens) and id k to the tokens from the k-th index onward.
# The tokens of each group are then glued together with "_" so that "_"
# can serve as the column separator for cSplit().
res <- foo %>%
  group_by(id = findInterval(x = seq_len(n()), vec = ind)) %>%
  summarize(text = paste0(text, collapse = "_")) %>%
  mutate(
    # Re-join a two-word cell such as "Job_Number" or "Tutor_Name" with a
    # space (first occurrence per record) so it is not split in two.
    text = sub(
      x = text,
      pattern = "(Job|Tutor)_(Number|Name)",
      replacement = "\\1 \\2"
    ),
    # Inside the description, turn every "_" that follows a comma or the
    # word "Project" into a space to keep the description in one cell.
    text = gsub(
      x = text,
      pattern = "(?<=,|Project)_",
      replacement = " ",
      perl = TRUE
    )
  ) %>%
  cSplit(
    splitCols = "text",
    sep = "_",
    direction = "wide",
    type.convert = FALSE
  )
# Promote the first record to column names: keep "id" for the first column
# and take the remaining names from the cells of row 1.
names(res) <- res[1, ] %>%
  unlist() %>%
  tail(n = -1) %>%
  c("id", .) %>%
  setNames(nm = .)
# Row 1 only held the header cells, so drop it from the data.
res <- res %>%
  slice(-1)
id Start Finish Job Number Description Tutor
1 1 29/11/2019 29/11/2019 2288288 Project Name, Project Location, Project Type Tutor Name
2 2 2/12/2019 2/12/2019 8282888 Project Name, Project Location, Project type Tutor Name
3 3 9/12/2019 9/12/2019 2828288 Project Name, Project Location, Project Type Tutor Name
4 4 18/12//2019. 20/12/2019 2828289 Project Name, Project Location, Project Type Tutor Name