Tags: r, dataframe, web-scraping, export-to-csv, stock

How to scrape investment information from a stock webpage


I am trying to write code that scrapes investment information for each stock. The first step is to create a folder to store the stock investment information if it doesn't already exist. The second step is to read the web page and save it to a file. The last step is to read the saved file and build a data frame. Specifically, I want to retrieve the information inside the red box on the stock's page (the investment information table shown in the screenshot) as a data frame and save it as a CSV file. I want to collect the investment information of pure stocks only, excluding ETNs, ETFs, REITs, etc., from the KOSPI 200 index. But the code below does not run.
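
A rough sketch of the three steps described above (the folder name "stock_info" and the saved file name are placeholders, and download.file() is just one way to cache the page) would be:

library(rvest)

# Step 1: create the storage folder if it does not already exist
if (!dir.exists("stock_info")) dir.create("stock_info")

# Step 2: read the web page and save it to a file
url <- "https://finance.naver.com/sise/sise_market_sum.nhn"
saved_file <- file.path("stock_info", "sise_market_sum.html")
download.file(url, destfile = saved_file, quiet = TRUE)

# Step 3: read the saved file back in before building the data frame
html <- read_html(saved_file, encoding = "euc-kr")

The full attempt follows.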

library(rvest)
library(stringr)
library(dplyr)   # filter() used below comes from dplyr

url <- "https://finance.naver.com/sise/sise_market_sum.nhn"
html <- read_html(url, encoding="euc-kr")

# Collect all tables on the market-sum page
tables <- html %>%
  html_nodes("table")

# The second table holds the stock links; pull each link's href
hrefs <- tables[2] %>%
  html_nodes("a") %>%
  html_attr("href")

hrefs 

# The 6-digit stock code sits at the end of each href
codes <- substr(hrefs, nchar(hrefs)-5, nchar(hrefs))

codes 

# Keep every other entry, since each stock contributes two links
code_list <- codes[c(TRUE, FALSE)]

get_investment_info <- function(url) {
  webpage <- read_html(url)
  
  # Locate the investment info tab
  invest_tab <- webpage %>%
    html_nodes("a[href*=tab_invest]") %>%
    html_attr("onclick") %>%
    str_extract("(?<=showArea\\('tab_invest',)[^\\)]+(?=\\))") %>%
    str_replace_all("'","") %>%
    str_replace_all(" ","") %>%
    str_split(",")
  
  # Scrape the required information
  invest_info <- webpage %>%
    html_node(paste0("#", invest_tab[[1]][1])) %>%
    html_nodes(".aside_invest_table > tbody > tr") %>%
    html_text() %>%
    str_remove_all("[\r\n\t]") %>%
    str_remove_all(",") %>%
    str_trim() %>%
    str_split(" ") %>%
    unlist() %>%
    matrix(ncol = 2, byrow = TRUE) %>%
    as.data.frame() %>%
    setNames(c("항목", "값")) %>%
    filter(항목 %in% c("시가총액", "시가총액순위", "상장주식수", "액면가", "매매단위", "외국인한도주식수",
                        "외국인한도주식수(A)", "외국인보유주식수(B)", "외국인소진율(B/A)", "투자의견", "목표주가",
                        "52주최고", "52주최저", "PER", "EPS", "추정PER", "추정EPS", "PBR", "BPS",
                        "배당수익률", "동일업종PER", "동일업종등락률"))
  
  return(invest_info)
}
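
# Illustrative sanity check (sample_info is a placeholder name): try the
# function on a single stock before running the full loop
sample_info <- get_investment_info(
  paste0("https://finance.naver.com/item/main.naver?code=", code_list[1])
)
sample_info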

url <- 0
invest_info <- 0

for (i in length(code_list)){
 url[i]<- paste0("https://finance.naver.com/item/main.naver?code=", code_list[i])
 html <- read_html(url, encoding = "euc-kr")
 invest_info[i] <- get_investment_info(url[i])
}     

write.csv(invest_info, "Korean_investment_info.csv", row.names = FALSE)

I would like this code fixed so that it runs properly. In short, please complete the R code so that it scrapes the investment information for each stock and saves the results to a CSV file.


Solution

  • library(tidyverse)
    library(rvest)
    
    # Scrape the main site to extract stocks and ID
    main <- "https://finance.naver.com/sise/sise_market_sum.nhn" %>%
      read_html(encoding = "euc-kr")
    
    df <- tibble(
      company = main %>%
        html_elements(".tltle") %>%
        html_text2(),
      id = main %>%
        html_elements(".tltle") %>%
        html_attr("href") %>% 
        str_remove_all("[^0-9]")
    )
    
    # Scraper function
    get_info <- function(id) {
      cat("Scraping", id, "\n")
      str_c("https://finance.naver.com/item/main.naver?code=", id) %>% 
        read_html(encoding = "euc-kr") %>% 
        html_element(".aside_invest_info") %>% 
        html_table() %>% 
        select(1:2) %>% 
        rename(category = 1, value = 2) %>% 
        slice(1:15) 
    }
    
    # Scraping
    df <- df %>% 
      mutate(stock_info = map(id, get_info))
    
    # Final dataset
    df
    
    # Unnesting to get the info
    df %>% 
      unnest(stock_info)
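    
    # To also save the result as a CSV file, as the question asks, the unnested
    # data can be written out with readr (loaded via the tidyverse); the file
    # name follows the one used in the question
    df %>% 
      unnest(stock_info) %>% 
      write_csv("Korean_investment_info.csv")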
    