Search code examples
html r rvest

Scraping WSJ.com using Rvest


I'm trying to scrape the first table in the Wall Street Journal Markets Diary page ('https://www.wsj.com/market-data/stocks/marketsdiary') and it seems that rvest is not picking up the table in the html. Here's my code:

# Attach the scraping and pipe/data packages. library() stops with an error
# when a package is missing, whereas require() only returns FALSE.
library(rvest)
library(dplyr)

# Download and parse the Markets Diary page.
wsj_html <- read_html('https://www.wsj.com/market-data/stocks/marketsdiary')

# Descend body -> div -> table -> tbody, one selector per step.
nyse_tbl <- wsj_html %>%
  html_nodes('body') %>%
  html_nodes('div') %>%
  html_nodes('table') %>%
  html_nodes('tbody')

Can anyone tell me what I'm missing here? Thanks.


Solution

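  • The table does not appear to be in the static HTML that read_html() downloads, which is why rvest finds nothing. You can instead request the underlying data as JSON and parse it with jsonlite (httr2 would also work):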

    library(tidyverse)
    library(jsonlite)

    # Markets Diary URL with the query string used to request the data as JSON.
    diary_url <- 'https://www.wsj.com/market-data/stocks/marketsdiary?id={"application":"WSJ","marketsDiaryType":"diaries"}&type=mdc_marketsdiary'

    # Fetch and parse the response, keeping only its `data` element.
    diary_data <- fromJSON(diary_url)$data

    # Flatten the nested columns one level at a time; the final tibble is
    # printed because the expression is not assigned.
    as_tibble(diary_data) %>%
      unnest(instrumentSets) %>%
      unnest(headerFields) %>%
      unnest(instruments)
    
    # A tibble: 212 x 8
       value label id              name                 latestClose previousClose weekAgo     timestamp            
       <chr> <chr> <chr>           <chr>                <chr>       <chr>         <chr>       <chr>                
     1 name  NYSE  issuestraded    Issues traded        3,385       3,407         3,344       Friday, July 08, 2022
     2 name  NYSE  advances        Advances             1,497       2,494         2,412       Friday, July 08, 2022
     3 name  NYSE  declines        Declines             1,708       752           805         Friday, July 08, 2022
     4 name  NYSE  unchanged       Unchanged            180         161           127         Friday, July 08, 2022
     5 name  NYSE  newhighs        New highs            7           11            12          Friday, July 08, 2022
     6 name  NYSE  newlows         New lows             56          59            205         Friday, July 08, 2022
     7 name  NYSE  advvolume       Adv. volume*         283,922,224 785,792,562   693,314,370 Friday, July 08, 2022
     8 name  NYSE  declvolume      Decl. volume*        483,599,683 104,114,907   203,789,292 Friday, July 08, 2022
     9 name  NYSE  totalvolume     Total volume*        777,515,857 892,767,579   919,812,639 Friday, July 08, 2022
    10 name  NYSE  closingarmstrin Closing Arms (TRIN)† 1.20        0.38          1.02        Friday, July 08, 2022
    # ... with 202 more rows
    

    Alternatively, you can do it with RSelenium to get the tables as they are displayed on the website.

    library(tidyverse)
    # read_html() and html_table() come from rvest, which library(tidyverse)
    # does not attach, so it has to be loaded explicitly.
    library(rvest)
    library(RSelenium)
    library(netstat)

    # Start a Selenium server and a Firefox session on an unused port.
    rD <- rsDriver(browser = "firefox", port = free_port())
    remDr <- rD[["client"]]

    # Open the page in the browser and grab the HTML as the browser holds it.
    remDr$navigate("https://www.wsj.com/market-data/stocks/marketsdiary")
    # Named `page_source` so that base::source() is not masked.
    page_source <- remDr$getPageSource()[[1]]

    # Parse every <table> in the page into a list of tibbles.
    tables <- page_source %>%
      read_html() %>%
      html_table()

    # Release the browser session and stop the Selenium server so that neither
    # keeps running (and holding the port) after the scrape.
    invisible(remDr$close())
    invisible(rD$server$stop())

    # Print the scraped tables.
    tables
    
    [[1]]
    # A tibble: 57 x 4
       ``           `Latest Close` `Previous Close` `Week Ago`
       <chr>        <chr>          <chr>            <chr>     
     1 NYSE         Latest Close   Previous Close   Week Ago  
     2 Issues trad~ 3,385          3,407            3,344     
     3 Advances     1,497          2,494            2,412     
     4 Declines     1,708          752              805       
     5 Unchanged    180            161              127       
     6 New highs    7              11               12        
     7 New lows     56             59               205       
     8 Adv. volume* 283,922,224    785,792,562      693,314,3~
     9 Decl. volum~ 483,599,683    104,114,907      203,789,2~
    10 Total volum~ 777,515,857    892,767,579      919,812,6~
    # ... with 47 more rows
    
    [[2]]
    # A tibble: 15 x 4
       ``           `Latest Close` `Previous Close` `Week Ago`
       <chr>        <chr>          <chr>            <chr>     
     1 9:30 to 10:~ 96,773,796     118,483,423      112,076,7~
     2 10:00 to 10~ 43,263,747     48,247,363       56,369,510
     3 10:30 to 11~ 42,167,828     43,369,860       52,631,721
     4 11:00 to 11~ 31,750,172     37,432,262       41,812,829
     5 11:30 to 12~ 30,210,840     30,206,282       33,446,092
     6 12:00 to 12~ 26,258,430     26,547,120       28,841,126
     7 12:30 to 1:~ 26,284,237     28,346,311       28,485,375
     8 1:00 to 1:30 23,472,776     25,659,220       23,808,690
     9 1:30 to 2:00 22,194,379     24,886,867       29,124,176
    10 2:00 to 2:30 22,060,112     29,228,153       30,130,702
    11 2:30 to 3:00 23,046,855     28,545,060       28,604,043
    12 3:00 to 3:30 29,926,731     34,514,543       35,432,016
    13 3:30 to 4:00 360,105,954    417,301,115      419,049,6~
    14 Total        777,515,857    892,767,579      919,812,6~
    15 Composite    3,528,791,070  4,143,550,789    4,058,101~
    
    [[3]]
    # A tibble: 13 x 2
       NYSE          `Weekly Totals`
       <chr>         <chr>          
     1 Issues Traded 3,596          
     2 Advances      1,895          
     3 Declines      1,583          
     4 Unchanged     118            
     5 New Highs     26             
     6 New Lows      439            
     7 Adv Vol       8,276,302,604  
     8 Decl Vol      8,778,273,923  
     9 Total Vol     17,281,584,001 
    10 zAdv Vol      1,804,202,853  
    11 zDecl Vol     2,062,552,502  
    12 zTotal Vol    3,906,549,612  
    13 zBlock trades 17,798         
    
    [[4]]
    # A tibble: 10 x 2
       NASDAQ        `Weekly Totals`
       <chr>         <chr>          
     1 Issues Traded 5,443          
     2 Advances      3,360          
     3 Declines      1,842          
     4 Unchanged     241            
     5 New Highs     90             
     6 New Lows      734            
     7 Adv Vol       12,698,953,908 
     8 Decl Vol      6,133,314,084  
     9 Total Vol     19,061,277,371 
    10 Block Trades  129,878        
    
    [[5]]
    # A tibble: 13 x 2
       `NYSE American` `Weekly Totals`
       <chr>           <chr>          
     1 Issues Traded   304            
     2 Advances        157            
     3 Declines        135            
     4 Unchanged       12             
     5 New Highs       0              
     6 New Lows        54             
     7 Adv Vol         598,998,905    
     8 Decl Vol        333,274,395    
     9 Total Vol       943,830,187    
    10 zAdv Vol        33,061,561     
    11 zDecl Vol       25,092,175     
    12 zTotal Vol      58,995,872     
    13 zBlock trades   598            
    
    [[6]]
    # A tibble: 13 x 2
       `NYSE Arca`   `Weekly Totals`
       <chr>         <chr>          
     1 Issues Traded 1,917          
     2 Advances      1,087          
     3 Declines      813            
     4 Unchanged     17             
     5 New Highs     16             
     6 New Lows      450            
     7 Adv Vol       3,060,741,851  
     8 Decl Vol      3,291,369,981  
     9 Total Vol     6,456,858,700  
    10 zAdv Vol      561,643,211    
    11 zDecl Vol     598,770,457    
    12 zTotal Vol    1,176,719,093  
    13 zBlock trades 4,673