Tags: r, dataframe, web-scraping, export-to-csv, stock

How to scrape investment information from a stock webpage


I am trying to write code that scrapes investment information for each stock. The first step is to create a folder to store the stock investment information if it doesn't already exist. The second step is to read the web page and save it to a file. The last step is to read the saved file and build a data frame. Specifically, I want to retrieve the information inside the red box on the stock's page (the investment information table shown in the screenshot) as a data frame and save it as a CSV file. I want to collect the investment information of pure stocks only, excluding ETNs, ETFs, REITs, etc., from the KOSPI 200 index. But the code below does not run.
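
A rough sketch of the three steps described above (the folder name "stock_info" and the saved file name are placeholders, and download.file() is just one way to cache the page) would be:

library(rvest)

# Step 1: create the storage folder if it does not already exist
if (!dir.exists("stock_info")) dir.create("stock_info")

# Step 2: read the web page and save it to a file
url <- "https://finance.naver.com/sise/sise_market_sum.nhn"
saved_file <- file.path("stock_info", "sise_market_sum.html")
download.file(url, destfile = saved_file, quiet = TRUE)

# Step 3: read the saved file back in before building the data frame
html <- read_html(saved_file, encoding = "euc-kr")

The full attempt follows.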

library(rvest)
library(stringr)
library(dplyr)   # filter() used below comes from dplyr

url <- "https://finance.naver.com/sise/sise_market_sum.nhn"
html <- read_html(url, encoding="euc-kr")

# Collect all tables on the market-sum page
tables <- html %>%
  html_nodes("table")

# The second table holds the stock links; pull each link's href
hrefs <- tables[2] %>%
  html_nodes("a") %>%
  html_attr("href")

hrefs 

# The 6-digit stock code sits at the end of each href
codes <- substr(hrefs, nchar(hrefs)-5, nchar(hrefs))

codes 

# Keep every other entry, since each stock contributes two links
code_list <- codes[c(TRUE, FALSE)]

get_investment_info <- function(url) {
  webpage <- read_html(url)
  
  # Locate the investment info tab
  invest_tab <- webpage %>%
    html_nodes("a[href*=tab_invest]") %>%
    html_attr("onclick") %>%
    str_extract("(?<=showArea\\('tab_invest',)[^\\)]+(?=\\))") %>%
    str_replace_all("'","") %>%
    str_replace_all(" ","") %>%
    str_split(",")
  
  # Scrape the required information
  invest_info <- webpage %>%
    html_node(paste0("#", invest_tab[[1]][1])) %>%
    html_nodes(".aside_invest_table > tbody > tr") %>%
    html_text() %>%
    str_remove_all("[\r\n\t]") %>%
    str_remove_all(",") %>%
    str_trim() %>%
    str_split(" ") %>%
    unlist() %>%
    matrix(ncol = 2, byrow = TRUE) %>%
    as.data.frame() %>%
    setNames(c("항목", "값")) %>%
    filter(항목 %in% c("시가총액", "시가총액순위", "상장주식수", "액면가", "매매단위", "외국인한도주식수",
                        "외국인한도주식수(A)", "외국인보유주식수(B)", "외국인소진율(B/A)", "투자의견", "목표주가",
                        "52주최고", "52주최저", "PER", "EPS", "추정PER", "추정EPS", "PBR", "BPS",
                        "배당수익률", "동일업종PER", "동일업종등락률"))
  
  return(invest_info)
}
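
# Illustrative sanity check (sample_info is a placeholder name): try the
# function on a single stock before running the full loop
sample_info <- get_investment_info(
  paste0("https://finance.naver.com/item/main.naver?code=", code_list[1])
)
sample_info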

url <- 0
invest_info <- 0

for (i in length(code_list)){
 url[i]<- paste0("https://finance.naver.com/item/main.naver?code=", code_list[i])
 html <- read_html(url, encoding = "euc-kr")
 invest_info[i] <- get_investment_info(url[i])
}     

write.csv(invest_info, "Korean_investment_info.csv", row.names = FALSE)

I would like this code fixed so that it runs properly. In short, please complete the R code so that it scrapes the investment information for each stock and saves the results to a CSV file.


Solution

  • library(tidyverse)
    library(rvest)
    
    # Scrape the main site to extract stocks and ID
    main <- "https://finance.naver.com/sise/sise_market_sum.nhn" %>%
      read_html(encoding = "euc-kr")
    
    df <- tibble(
      company = main %>%
        html_elements(".tltle") %>%
        html_text2(),
      id = main %>%
        html_elements(".tltle") %>%
        html_attr("href") %>% 
        str_remove_all("[^0-9]")
    )
    
    # Scraper function
    get_info <- function(id) {
      cat("Scraping", id, "\n")
      str_c("https://finance.naver.com/item/main.naver?code=", id) %>% 
        read_html(encoding = "euc-kr") %>% 
        html_element(".aside_invest_info") %>% 
        html_table() %>% 
        select(1:2) %>% 
        rename(category = 1, value = 2) %>% 
        slice(1:15) 
    }
    
    # Scraping
    df <- df %>% 
      mutate(stock_info = map(id, get_info))
    
    # Final dataset
    df
    
    # Unnesting to get the info
    df %>% 
      unnest(stock_info)
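    
    # To also save the result as a CSV file, as the question asks, the unnested
    # data can be written out with readr (loaded via the tidyverse); the file
    # name follows the one used in the question
    df %>% 
      unnest(stock_info) %>% 
      write_csv("Korean_investment_info.csv")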
    