Search code examples
rrvest

Data extraction from non-html data table


I am trying to extract the data points from the graphic...without any luck. And no clue, why not.

I get always returned "NAs introduced by coercion"

My idea was to read the URL and look through the date in order to identify the part where the values from the graphic are listed. However, it seems I am not able to get hold of them.

url <- "https://www.transfermarkt.com/manuel-neuer/marktwertverlauf/spieler/17259"

data <- readLines(con = url)

data <- paste0(data, collapse = "\n")


start.index <- gregexpr(pattern = "'data':\\[", text = data)
end.index <- gregexpr(pattern = "\\]", text = data)

end.index[[1]] <- end.index[[1]][which(end.index[[1]] > start.index[[1]])[1]]

data <- substring(
    text = data,
    first = start.index[[1]],
    last = end.index[[1]]
)




#date 
#market value 
#club 
#age 


# Get Dates
date.s.index <- gregexpr(pattern = "'datum_mw':'", text = data)
date.e.index <- gregexpr(pattern = "',", text = data)

date.index <- rbind(
    cbind(date.s.index[[1]], 0),
    cbind(date.e.index[[1]], 1)
)

date.index <- date.index[order(date.index[, 1]), ]

d.index <- date.index[, 2] == 0
d.next.index <- c(FALSE, d.index[-length(d.index)])

date.start.index <- date.index[d.index, 1]
date.end.index <- date.index[d.next.index, 1]
date.res <- substring(
    text = data,
    first = date.start.index + attr(date.s.index[[1]], "match.length")[1], 
    last = date.end.index - 1
)
date.res <- gsub(pattern = "\\\\x20", replacement = " ", x = date.res)


# Get Values
value.s.index <- gregexpr(pattern = "'y':", text = data)
value.e.index <- gregexpr(pattern = ",", text = data)

value.index <- rbind(
    cbind(value.s.index[[1]], 0),
    cbind(value.e.index[[1]], 1)
)

value.index <- value.index[order(value.index[, 1]), ]

v.index <- value.index[, 2] == 0
v.next.index <- c(FALSE, v.index[-length(v.index)])

value.start.index <- value.index[v.index, 1]
value.end.index <- value.index[v.next.index, 1]
value.res <- substring(
    text = data,
    first = value.start.index + attr(value.s.index[[1]], "match.length")[1], 
    last = value.end.index - 1
)


# Get Clubs
club.s.index <- gregexpr(pattern = "'verein':'", text = data)
club.e.index <- gregexpr(pattern = "',", text = data)

club.index <- rbind(
    cbind(club.s.index[[1]], 0),
    cbind(club.e.index[[1]], 1)
)

club.index <- club.index[order(club.index[, 1]), ]

c.index <- club.index[, 2] == 0
c.next.index <- c(FALSE, c.index[-length(c.index)])

club.start.index <- club.index[c.index, 1]
club.end.index <- club.index[c.next.index, 1]
club.res <- substring(
    text = data,
    first = club.start.index + attr(club.s.index[[1]], "match.length")[1], 
    last = club.end.index - 1
)
club.res <- gsub(pattern = "\\\\x20", replacement = " ", x = club.res)


# Get Age

age.s.index <- gregexpr(pattern = "'age':", text = data)
age.e.index <- gregexpr(pattern = ",", text = data)

age.index <- rbind(
    cbind(age.s.index[[1]], 0),
    cbind(age.e.index[[1]], 1)
)

age.index <- age.index[order(age.index[, 1]), ]

a.index <- age.index[, 2] == 0
a.next.index <- c(FALSE, a.index[-length(a.index)])

age.start.index <- age.index[a.index, 1]
age.end.index <- age.index[a.next.index, 1]
age.res <- substring(
    text = data,
    first = age.start.index + attr(age.s.index[[1]], "match.length")[1], 
    last = age.end.index - 1
)





res <- data.frame(
    Date = date.res,
    MarketValue = as.numeric(value.res),
    Club = club.res,
    Age = as.numeric(age.res)
)

Solution

  • Chart data is pulled from a different URL, https://www.transfermarkt.com/ceapi/marketValueDevelopment/graph/17259 . As it's JSON, we could parse it directly with jsonlite::read_json(url). Though they seem to be bit picky regarding user agent (jsonliterequest works in interactive RStudio session, gets 403 error when called through reprex), so let's just use httr2 for more convenient user agent config:

    library(httr2)
    request("https://www.transfermarkt.com/ceapi/marketValueDevelopment/graph/17259") |>
      req_user_agent("Mozilla/5.0 (Windows NT 10.0; Win64; x64)") |>
      req_perform() |>
      resp_body_json(simplifyVector = TRUE) |>
      getElement(1) |>
      tibble::as_tibble()
    #> # A tibble: 43 × 7
    #>                x        y mw      datum_mw     verein            age   wappen   
    #>            <dbl>    <int> <chr>   <chr>        <chr>             <chr> <chr>    
    #>  1 1111014000000    75000 €75k    Mar 17, 2005 FC Schalke 04 U19 18    "https:/…
    #>  2 1126821600000   150000 €150k   Sep 16, 2005 FC Schalke 04     19    "https:/…
    #>  3 1156111200000   350000 €350k   Aug 21, 2006 FC Schalke 04     20    ""       
    #>  4 1168815600000  1500000 €1.50m  Jan 15, 2007 FC Schalke 04     20    ""       
    #>  5 1182376800000  3000000 €3.00m  Jun 21, 2007 FC Schalke 04     21    ""       
    #>  6 1199142000000  4500000 €4.50m  Jan 1, 2008  FC Schalke 04     21    ""       
    #>  7 1212530400000  7000000 €7.00m  Jun 4, 2008  FC Schalke 04     22    ""       
    #>  8 1232492400000  9000000 €9.00m  Jan 21, 2009 FC Schalke 04     22    ""       
    #>  9 1244584800000 11000000 €11.00m Jun 10, 2009 FC Schalke 04     23    ""       
    #> 10 1263682800000 15000000 €15.00m Jan 17, 2010 FC Schalke 04     23    ""       
    #> # ℹ 33 more rows
    

    Created on 2023-12-31 with reprex v2.0.2