Search code examples
r, for-loop, pdf, pdftools

Append values from a data frame to a list created in for loop


*Edit: Thanks to Martin and a little bit of time and attention, I was able to get the code where I needed it to be. Is it ugly? Yes, but it works in a way that's useful to me now. Any tips on how to clean this up and make it more efficient would be super helpful.

Using the data frame trace_list, I'm trying to append the values from Title and Year to the output of each list in the for loop. The following code opens each state's PDF link on page 10, pulls the city data (which ranges from 1 to 12 cities), cleans/tidies the data, and stores it in a list to be bound after the data from each PDF is collected. Right now it only pulls the city name and a numerical value.

# Sample of the lookup table the scraping loop iterates over: one row per
# report, holding the PDF URL (Link), the state or territory name (Title) and
# the report year (Year, kept as character). The value is bound to
# `trace_list` because the code below reads the table under that name; a bare
# data.frame() call would build the table and throw it away.
trace_list <- data.frame(
  Link = c(
    "https://www.atf.gov/file/146951/download",
    "https://www.atf.gov/file/146966/download",
    "https://www.atf.gov/file/146976/download",
    "https://www.atf.gov/file/137041/download",
    "https://www.atf.gov/file/137231/download",
    "https://www.atf.gov/file/137301/download",
    "https://www.atf.gov/docs/undefined/flwebsite17183911pdf/download",
    "https://www.atf.gov/docs/undefined/kywebsite17183876pdf/download",
    "https://www.atf.gov/docs/undefined/prwebsite17183917pdf/download"
  ),
  Title = c(
    "Alabama", "California", "District of Columbia",
    "Alaska", "Pennsylvania", "Wyoming",
    "Florida", "Kentucky", "Puerto Rico"
  ),
  Year = c(
    "2019", "2019", "2019",
    "2018", "2018", "2018",
    "2017", "2017", "2017"
  )
)
library(pdftools)
library(dplyr)
library(tabulizer)
library(english)
library(gsubfn)
library(rebus)
library(htmlwidgets)

# Scrape page 10 of every report PDF listed in trace_list, build one data
# frame per report (columns: city, count, state, year) and bind them together.
# trace_list is read positionally: column 1 = PDF link, column 2 = state name,
# column 3 = year.
citytrace <- list()
trace_list <- as.data.frame(trace_list)

# Process at most the first 159 rows. The bound is computed from nrow()
# directly because trace_list[1:159, ] pads a shorter table with all-NA rows,
# which made the loop always run 159 times and hit NA links.
n_reports <- min(nrow(trace_list), 159L)

# Loop-invariant helpers, built once instead of on every iteration.
# Maps the words "one" ... "ten" to the integers 1 ... 10.
digit_words <- setNames(as.list(1:10), as.english(1:10))
# Matches a run of one to five digits.
trcount <- DGT %R% optional(DGT) %R% optional(DGT) %R% optional(DGT) %R% optional(DGT)

for (i in seq_len(n_reports)) {
  # as.character() yields a plain string even if the column is a factor.
  pdf_link <- as.character(trace_list[i, 1])

  # Two fixed areas are read from page 10 (hence the page listed twice):
  # element 1 is used below for the city names, element 2 for the counts.
  gpi_table <- tabulizer::extract_tables(
    pdf_link,
    output = "data.frame",
    pages = c(10, 10),
    area = list(c(230, 0, 280, 717), c(275, 0, 321, 725)),
    guess = FALSE
  )

  # ---- city names ----
  city <- gpi_table[[1]]
  city[city == ""] <- NA
  # The first extracted line ends up as the column header, so push the header
  # back into the data as a row and give the columns neutral names.
  city <- setNames(rbind(names(city), city), names(city))
  colnames(city) <- paste0("V", seq_along(city))
  city <- if (length(city) > 4) {
    # More than four columns: transpose so that each original column becomes
    # a row, then paste the first four pieces of every row into one string.
    a <- data.frame(t(city))
    colnames(a) <- paste0("X", seq_along(a))
    a[, 1] <- factor(paste(a$X1, a$X2, a$X3, a$X4, sep = " "))
    a[, 1] <- trimws(gsub("X|X\\.[[:digit:]]|\\.[[:digit:]]", "", a$X1))
    a[, -c(2:4)]
  } else {
    # Four columns or fewer: paste all columns of each row into one string.
    # tidyr and stringr are never attached by this script, so their functions
    # are called with an explicit namespace.
    city %>%
      tidyr::unite(city, 1:length(city), sep = " ", remove = FALSE) %>%
      mutate_all(na_if, "") %>%
      tidyr::drop_na() %>%
      mutate(
        city = trimws(city),
        city = stringr::str_replace(city, "  ", " ")
      ) %>%
      select(city)
  }
  # Strip header artefacts (X prefixes, digits, commas, dots, deparsed c(...)
  # wrappers) from entries that contain them; otherwise turn dots into spaces.
  city <- ifelse(
    grepl(c("X|[[:digit:]]"), city),
    sapply(city, function(x) gsub(c('"*"|[[:digit:]]+|X|,|\\.|^c\\(|\\)$|'), "", x)),
    sapply(city, function(x) gsub("\\.", " ", x))
  )
  city <- unique(data.frame(matrix(unlist(city), nrow = length(city), byrow = TRUE)))
  city[which(city == "" | city == "NA"), ] <- NA
  city <- city[complete.cases(city), , drop = FALSE]
  colnames(city) <- "city"

  # ---- counts ----
  count <- gpi_table[[2]]
  count <- setNames(rbind(names(count), count), names(count))
  colnames(count) <- paste0("V", seq_along(count))
  # Values that became column headers look like "X123" or "X1.234"; drop the
  # leading X and a trailing ".<digit>" suffix.
  count <- ifelse(
    grepl("^X[[:digit:]]+$|^X[[:digit:]]+\\.[[:digit:]]+$", count),
    sapply(count, function(x) gsub("X|\\.[[:digit:]]$", "", x)),
    count
  )
  count <- cbind(city, count)

  # ---- footnote rows ("Other" / "None") ----
  # Keep only the page-10 text lines that mention "NOTE:" or "determined".
  result <- pdf_text(pdf_link)[10] %>%
    stringr::str_split("\n") %>%
    first() %>%
    as_tibble() %>%
    mutate_all(list(~ na_if(., ""))) %>%
    filter(grepl("NOTE:|determined", value))
  x3 <- ifelse(
    is.na(result[2, ]),
    paste(result[1, ], result[2, ], sep = ". "),
    paste(result[1, ], result[2, ], sep = " ")
  )
  x3 <- dplyr::tibble(line = 1, text = x3)
  # One row per sentence.
  sv <- strsplit(x3$text, split = "\\. ")
  x3 <- data.frame(V1 = rep(x3$line, sapply(sv, length)), V2 = unlist(sv))
  # Replace spelled-out numbers ("one" ... "ten") with digits in the first two
  # sentences so they can be picked up by the digit pattern below.
  x3[1, 2] <- gsubfn("\\w+", digit_words, x3[1, 2])
  x3[2, 2] <- gsubfn("\\w+", digit_words, x3[2, 2])
  x3$V2 <- gsub(",", "", x3$V2)
  x3$V2 <- gsub("NA", "0", x3$V2)
  x3$city <- ifelse(grepl("additional", x3$V2), "Other", "None")

  # Pull the digit runs out of each sentence; the first match of the first
  # sentence is dropped before the matches are flattened into the count column.
  a0 <- stringr::str_match_all(x3$V2, pattern = trcount)
  a0[[1]] <- a0[[1]][-1, 1]
  x3$count <- unlist(a0)
  x4 <- as.data.frame(x3[, -c(1:2)])
  x5 <- rbind(count, x4)

  # Tag every row of this report with its state and year from trace_list.
  x5 <- x5 %>%
    mutate(
      state = trace_list[i, 2],
      year = trace_list[i, 3]
    )
  citytrace[[i]] <- x5
}

# Stack the per-report data frames and tidy the text columns.
citytrace <- do.call(rbind, citytrace)
citytrace$city <- gsub(c(" NA|  "), "", citytrace$city)
citytrace$count <- gsub(c("\\."), "", citytrace$count)
print(citytrace)

What I'm having trouble with is assigning the values in 'Title' and 'Year' from 'trace_list' to the looped output. Expected results below:

city count state year
Birmingham 100 Alabama 2019
Fairbanks 10 Alaska 2018

I'm not exactly sure how to start doing that and was looking for help with that. Any advice on how to clean up the code is greatly appreciated.


Solution

  • Since I can't run your code here, this is just a small suggestion for your code:

    library(dplyr)
    
    # Sketch: visit every row of trace_list and tag the data frame built for
    # that row (x5) with the state (column 2) and year (column 3).
    for (i in seq_len(nrow(trace_list))) {
      pdf_link <- trace_list[i, 1]
      # Do stuff with the URL
      # probably you don't need the inner for-loop
      # create the data.frame x5

      # Assign the result back: inside a for loop a bare pipeline is
      # evaluated and then discarded, so x5 would stay unchanged.
      x5 <- x5 %>%
        mutate(state = trace_list[i, 2],
               year  = trace_list[i, 3])
    }
    

    Some remarks:

    • I don't understand the definition of your for-loop: iterating over list(trace_list[c(1:2),]) doesn't make any sense to me.
    • Extracting and storing the url in pdf_link is better done by pdf_link <- trace_list[i, 1].
    • Iterating over unlist(pdf_link) also doesn't make sense to me. I think you can use trace_list[i, 1], trace_list[i, 2] or trace_list[i, 3] instead.