I am trying to write code that scrapes investment information for each stock.
The first step is to create a folder to store the stock investment information if it doesn't already exist. The second step is to read the web page and save it to a file. The last step is to read the saved file and create a data frame. Specifically, I want to retrieve the information inside the red outline on the page shown in the image as a data frame and save it as a CSV file. I want to collect the investment information only for ordinary stocks in the KOSPI 200 index, excluding ETNs, ETFs, REITs, etc.
But the code does not run.
library(rvest)
library(stringr)
# Read the market-summary page and collect the stock codes it links to.
url <- "https://finance.naver.com/sise/sise_market_sum.nhn"
html <- read_html(url, encoding = "euc-kr")
tables <- html %>%
  html_nodes("table")

# Every link found inside the second table on the page.
hrefs <- tables[2] %>%
  html_nodes("a") %>%
  html_attr("href")
hrefs

# Keep only links that carry a `code=` query parameter, so anchors without an
# href (NA) or unrelated links cannot produce bogus codes. The code itself is
# taken from the last six characters of the link.
stock_hrefs <- hrefs[grepl("code=", hrefs, fixed = TRUE)]
codes <- substr(stock_hrefs, nchar(stock_hrefs) - 5, nchar(stock_hrefs))
codes

# The same code can be linked more than once per row; de-duplicate instead of
# assuming the links strictly alternate. unique() keeps first-seen order.
code_list <- unique(codes)
#' Scrape the investment-information rows from one stock page.
#'
#' The page is read with `read_html()`, the rows matching
#' `.aside_invest_table > tbody > tr` are turned into item/value pairs, and
#' only the items of interest are kept.
#'
#' @param url Address of the stock page to read.
#' @return A data.frame with two character columns, "항목" (item) and
#'   "값" (value), holding only the rows whose item is in the wanted list.
#'   A zero-row data.frame with the same columns is returned when no rows
#'   are found.
get_investment_info <- function(url) {
  wanted_items <- c(
    "시가총액", "시가총액순위", "상장주식수", "액면가", "매매단위", "외국인한도주식수",
    "외국인한도주식수(A)", "외국인보유주식수(B)", "외국인소진율(B/A)", "투자의견", "목표주가",
    "52주최고", "52주최저", "PER", "EPS", "추정PER", "추정EPS", "PBR", "BPS",
    "배당수익률", "동일업종PER", "동일업종등락률"
  )
  empty_result <- setNames(
    data.frame(character(0), character(0), stringsAsFactors = FALSE),
    c("항목", "값")
  )

  # Use the same encoding the other page reads in this file pass for this site.
  webpage <- read_html(url, encoding = "euc-kr")

  # Investment-info tab: take the arguments that follow 'tab_invest' in the
  # link's onclick handler; the first one is used as the container id.
  invest_tab <- webpage %>%
    html_nodes("a[href*=tab_invest]") %>%
    html_attr("onclick") %>%
    str_extract("(?<=showArea\\('tab_invest',)[^\\)]+(?=\\))") %>%
    str_replace_all("'", "") %>%
    str_replace_all(" ", "") %>%
    str_split(",")
  tab_id <- if (length(invest_tab) > 0) invest_tab[[1]][1] else NA_character_

  # Narrow the search to the tab container when it can be identified;
  # otherwise fall back to the whole page instead of failing on a missing id.
  container <- webpage
  if (!is.na(tab_id) && nzchar(tab_id)) {
    tab_node <- html_node(webpage, paste0("#", tab_id))
    if (!inherits(tab_node, "xml_missing")) {
      container <- tab_node
    }
  }

  # Scrape the needed rows and normalise their text.
  rows <- container %>%
    html_nodes(".aside_invest_table > tbody > tr") %>%
    html_text() %>%
    str_remove_all("[\r\n\t]") %>%
    str_remove_all(",") %>%
    str_trim()
  rows <- rows[nzchar(rows)]
  if (length(rows) == 0) {
    return(empty_result)
  }

  # Split each row separately at its first space (item, then the rest as the
  # value), so a row with extra or missing tokens cannot shift later rows.
  parts <- str_split_fixed(rows, " ", 2)
  invest_info <- setNames(
    data.frame(parts[, 1], str_trim(parts[, 2]), stringsAsFactors = FALSE),
    c("항목", "값")
  )

  # Base subsetting: does not depend on dplyr being attached, where a bare
  # filter() would resolve to stats::filter().
  invest_info <- invest_info[invest_info[["항목"]] %in% wanted_items, , drop = FALSE]
  rownames(invest_info) <- NULL
  invest_info
}
# Build one page address per stock code (paste0 is vectorised).
url <- paste0("https://finance.naver.com/item/main.naver?code=", code_list)

# Scrape every stock, not just the last one: iterate over all indices and
# collect one data frame per stock, tagged with the code it belongs to.
invest_info <- lapply(seq_along(code_list), function(i) {
  info <- get_investment_info(url[i])
  # rep() keeps this valid even when a page yields zero rows.
  data.frame(
    code = rep(code_list[i], nrow(info)),
    info,
    stringsAsFactors = FALSE,
    check.names = FALSE
  )
})

# Stack the per-stock results into a single table and save it.
invest_info <- do.call(rbind, invest_info)
write.csv(invest_info, "Korean_investment_info.csv", row.names = FALSE)
Please fix the code so that it runs correctly. In short, I need complete R code that scrapes the investment information for each stock and saves the scraping results to a CSV file.
library(tidyverse)
library(rvest)
# Read the market-summary page, then build a table with one row per listed
# company: its displayed name and the numeric id taken from its link.
main <- read_html(
  "https://finance.naver.com/sise/sise_market_sum.nhn",
  encoding = "euc-kr"
)
df <- tibble(
  # Visible text of each ".tltle" element.
  company = html_text2(html_elements(main, ".tltle")),
  # Link target of the same elements with everything but digits removed.
  id = str_remove_all(
    html_attr(html_elements(main, ".tltle"), "href"),
    "[^0-9]"
  )
)
#' Fetch the investment-info table for one stock id.
#'
#' Prints a progress line, reads the stock's page, parses the
#' ".aside_invest_info" element as a table, and keeps its first two columns
#' (renamed `category` and `value`) and rows 1 to 15.
#'
#' @param id Stock id appended to the page address.
#' @return A table with columns `category` and `value`.
get_info <- function(id) {
  cat("Scraping", id, "\n")
  page_url <- str_c("https://finance.naver.com/item/main.naver?code=", id)
  page <- read_html(page_url, encoding = "euc-kr")
  info_table <- html_table(html_element(page, ".aside_invest_info"))
  info_table <- select(info_table, 1:2)
  info_table <- rename(info_table, category = 1, value = 2)
  slice(info_table, 1:15)
}
# Run the scraper once per id and keep each result in a list-column.
df <- mutate(df, stock_info = map(id, get_info))
# Show the nested dataset (one row per company).
df
# Expand the list-column so every category/value pair gets its own row.
unnest(df, stock_info)
SKRT