*Edit: Thanks to Martin and a little bit of time and attention, I was able to get the code where I needed it to be. Is it ugly? Yes, but it works in a way that's useful to me now. Any tips on how to clean this up and make it more efficient would be super helpful.
Using the data frame `trace_list`, I'm trying to append the values from `Title` and `Year` to the output of each list in the for loop. The following code opens each state's PDF link on page 10, pulls the city data (which ranges from 1-12 cities), cleans/tidies the data, and stores it in a list to be bound after the data from each PDF is collected. Right now it only pulls the city name and a numerical value.
# Sample of the lookup table that drives the scraping loop: one row per report,
# giving the PDF download link, the state/territory name (Title) and the
# report year (Year). It is assigned to `trace_list` because the code that
# follows refers to it by that name.
trace_list <- data.frame(
  Link = c(
    "https://www.atf.gov/file/146951/download",
    "https://www.atf.gov/file/146966/download",
    "https://www.atf.gov/file/146976/download",
    "https://www.atf.gov/file/137041/download",
    "https://www.atf.gov/file/137231/download",
    "https://www.atf.gov/file/137301/download",
    "https://www.atf.gov/docs/undefined/flwebsite17183911pdf/download",
    "https://www.atf.gov/docs/undefined/kywebsite17183876pdf/download",
    "https://www.atf.gov/docs/undefined/prwebsite17183917pdf/download"
  ),
  Title = c(
    "Alabama", "California", "District of Columbia",
    "Alaska", "Pennsylvania", "Wyoming",
    "Florida", "Kentucky", "Puerto Rico"
  ),
  Year = c(
    "2019", "2019", "2019",
    "2018", "2018", "2018",
    "2017", "2017", "2017"
  )
)
library(pdftools)
library(dplyr)
library(tabulizer)
library(english)
library(gsubfn)
library(rebus)
library(htmlwidgets)
# Scrape the city-level figures from page 10 of every PDF listed in
# `trace_list` (column 1 = link, column 2 = state, column 3 = year) and stack
# them into a single data frame with the columns city, count, state and year.
#
# NOTE(review): unite()/drop_na() are tidyr functions and the str_*() helpers
# are stringr functions; neither package is attached by the library() calls at
# the top of this script -- confirm they are loaded (e.g. library(tidyverse)).
citytrace <- list()
trace_list <- as.data.frame(trace_list)

# Upper bound on the number of reports to process (the original hard-coded
# 159). Indexing `trace_list[1:159, ]` on a shorter table pads it with NA rows,
# so the cap is applied to the row count instead of to the row index.
max_reports <- 159L
n_reports <- min(nrow(trace_list), max_reports)

# Loop-invariant helpers, built once instead of on every iteration.
# Maps the words "one" ... "ten" to the digits 1 ... 10 for gsubfn().
word_to_digit <- setNames(as.list(1:10), as.english(1:10))
# rebus pattern: a run of one to five digits.
trcount <- DGT %R% optional(DGT) %R% optional(DGT) %R% optional(DGT) %R%
  optional(DGT)

for (i in seq_len(n_reports)) {
  pdf_link <- as.character(trace_list[i, 1])

  # Two fixed regions of page 10: the first holds the city names, the second
  # the matching counts.
  gpi_table <- tabulizer::extract_tables(
    pdf_link,
    output = "data.frame",
    pages = c(10, 10),
    area = list(c(230, 0, 280, 717), c(275, 0, 321, 725)),
    guess = FALSE
  )

  # ---- City names ----
  city <- gpi_table[[1]]
  # Earlier attempt, kept for reference:
  # city <- city[!Reduce(`|`, lapply(city[1], grepl, pattern = '^[0-9]+$""')),]
  city[city == ""] <- NA
  # The extracted header row is data, not a header: push it back in as row 1.
  city <- setNames(rbind(names(city), city), names(city))
  colnames(city) <- paste0("V", seq_along(city))
  # Earlier attempt, kept for reference:
  # city <- ifelse(city[1,]=='city',city[-1,],city)
  city <- if (length(city) > 4) {
    # More than four columns: transpose, then glue the first four columns
    # together into one name per row.
    a <- data.frame(t(city))
    colnames(a) <- paste0("X", seq_along(a))
    a[, 1] <- factor(paste(a$X1, a$X2, a$X3, a$X4, sep = " "))
    a[, 1] <- trimws(gsub("X|X\\.[[:digit:]]|\\.[[:digit:]]", "", a$X1))
    a <- a[, -c(2:4)]
  } else {
    city %>%
      unite(city, 1:length(city), sep = " ", remove = FALSE) %>%
      mutate_all(na_if, "") %>%
      drop_na() %>%
      mutate(city = trimws(city), city = str_replace(city, " ", " ")) %>%
      select(city)
  }
  # Strip the artefacts left by make.names()/deparse (X prefixes, digits,
  # quotes, "c(...)" wrappers); otherwise turn dots back into spaces.
  city <- ifelse(
    grepl(c("X|[[:digit:]]"), city),
    sapply(
      city,
      function(x) gsub(c('"*"|[[:digit:]]+|X|,|\\.|^c\\(|\\)$|'), "", x)
    ),
    sapply(city, function(x) gsub("\\.", " ", x))
  )
  city <- unique(
    data.frame(matrix(unlist(city), nrow = length(city), byrow = TRUE))
  )
  city[which(city == "" | city == "NA"), ] <- NA
  city <- city[complete.cases(city), , drop = FALSE]
  colnames(city) <- "city"

  # ---- Counts ----
  count <- gpi_table[[2]]
  count <- setNames(rbind(names(count), count), names(count))
  colnames(count) <- paste0("V", seq_along(count))
  # Header values such as "X123" or "X123.1" are numbers mangled into column
  # names; recover the digits.
  count <- ifelse(
    grepl("^X[[:digit:]]+$|^X[[:digit:]]+\\.[[:digit:]]+$", count),
    sapply(count, function(x) gsub("X|\\.[[:digit:]]$", "", x)),
    count
  )
  count <- cbind(city, count)

  # ---- Footnote lines ("NOTE:" / "determined") ----
  result <- pdf_text(pdf_link)[10] %>%
    str_split("\n") %>%
    first() %>%
    as_tibble() %>%
    mutate_all(list(~na_if(., ""))) %>%
    filter(grepl("NOTE:|determined", value))
  x3 <- ifelse(
    is.na(result[2, ]),
    paste(result[1, ], result[2, ], sep = ". "),
    paste(result[1, ], result[2, ], sep = " ")
  )
  x3 <- dplyr::tibble(line = 1, text = x3)
  # One row per sentence of the footnote.
  sv <- strsplit(x3$text, split = "\\. ")
  x3 <- data.frame(V1 = rep(x3$line, lengths(sv)), V2 = unlist(sv))
  # Spelled-out numbers ("two") become digits so they can be extracted below.
  x3[1, 2] <- gsubfn("\\w+", word_to_digit, x3[1, 2])
  x3[2, 2] <- gsubfn("\\w+", word_to_digit, x3[2, 2])
  x3$V2 <- gsub(",", "", x3$V2)
  x3$V2 <- gsub("NA", 0, x3$V2)
  x3$city <- ifelse(grepl("additional", x3$V2), "Other", "None")

  # The interactive str_view_all() preview that used to sit here was dropped:
  # inside a loop its result is discarded, so it only cost time.
  a0 <- str_match_all(x3$V2, pattern = trcount)
  # Drop the first number found in the first sentence; keep the rest.
  a0[[1]] <- a0[[1]][-1, 1]
  x3$count <- unlist(a0)
  x4 <- as.data.frame(x3[, -c(1:2)])

  # ---- Combine and tag with state / year from trace_list ----
  x5 <- rbind(count, x4)
  x5 <- x5 %>%
    mutate(
      state = trace_list[i, 2],
      year = trace_list[i, 3]
    )
  citytrace[[i]] <- x5
}

citytrace <- do.call(rbind, citytrace)
# NOTE(review): as written this pattern also removes every single space from
# the city names -- confirm that is intended.
citytrace$city <- gsub(c(" NA| "), "", citytrace$city)
citytrace$count <- gsub(c("\\."), "", citytrace$count)
print(citytrace)
What I'm having trouble with is assigning the values in 'Title' and 'Year' from 'trace_list' to the looped output. Expected results below:
city | count | state | year |
---|---|---|---|
Birmingham | 100 | Alabama | 2019 |
Fairbanks | 10 | Alaska | 2018 |
I'm not exactly sure how to start doing that and was looking for help with that. Any advice on how to clean up the code is greatly appreciated.
Since I can't run your code here, just a small suggestion for your code:
library(dplyr)
# Sketch: iterate over the rows of trace_list and tag each scraped result with
# the state (column 2) and year (column 3) of the row it came from.
for (i in seq_len(nrow(trace_list))) {
  pdf_link <- trace_list[i, 1]
  # Do stuff with the URL
  # probably you don't need the inner for-loop
  # create the data.frame x5
  # Assign the result back: mutate() returns a new data frame, and inside a
  # loop an unassigned value is silently discarded.
  x5 <- x5 %>%
    mutate(state = trace_list[i, 2],
           year = trace_list[i, 3])
}
Some remarks:
- `for`-loop: iterating over `list(trace_list[c(1:2),])` doesn't make any sense to me.
- `url` in `pdf_link` is better done by `pdf_link <- trace_list[i, 1]`.
- `unlist(pdf_link)` also doesn't make sense to me. I think you can use `trace_list[i, 1]`, `trace_list[i, 2]` or `trace_list[i, 3]` instead.