Tags: python, r, web-scraping

How to scrape a web table with multiple pages using R or Python


I want to scrape a website to gather data for a data-mining study. The site contains a big table spread across 43 pages, and it also hides some stocks behind an expand menu on the far right-hand side.


The web page is below.

http://data.10jqka.com.cn/market/longhu/yyb/

import bs4
import requests


url = "http://data.10jqka.com.cn/market/longhu/yyb/"

response = requests.get(url)
response.raise_for_status()  # fail loudly instead of leaving content undefined
content = response.content

soup = bs4.BeautifulSoup(content, "html.parser")
table_results = soup.findAll("table", {"class": "m_table"})
for item in table_results:
    company_name = item.findAll("td", {"class": "tl"})[0].text.strip()
    detail = item.findAll("td", {"class": "tc"})[0].text.strip()
    c_rise = item.findAll("td", {"class": "c_rise"})[0].text.strip()
    c_fall = item.findAll("td", {"class": "c_fall"})[0].text.strip()
    cur = item.findAll("td", {"class": "cur"})[0].text.strip()
    lhb_stocklist = item.findAll("div", {"class": "lhb_stocklist"})[0].text.strip()
    print(company_name, detail, c_rise, c_fall, cur, lhb_stocklist)

Solution

  • A solution based on requests, BeautifulSoup, and lxml. The table is populated through an AJAX endpoint that returns JSON whose data field contains the rendered table HTML for one page, so each page can be fetched directly:

    import json
    import requests
    from bs4 import BeautifulSoup
    
    URL = 'http://data.10jqka.com.cn/interface/market/longhuyyb/stocknum/desc/%d/20'
    # configure end_page as needed, or parse http://data.10jqka.com.cn/market/longhu/yyb/ to detect it automatically (see the sketch below)
    end_page = 2
    
    result = []
    for page_idx in range(1, end_page + 1):
        print('Extracting page', page_idx)
        raw_response = requests.get(URL % page_idx)
        page_content = json.loads(raw_response.text)['data']
        html = BeautifulSoup(page_content, 'lxml')
        for row in html.tbody.find_all('tr'):
            company = row.find(class_='tl').text
            detail_link = row.find(class_='tl').a['href']
            buy = float(row.find(class_='c_rise').text)
            sell = float(row.find(class_='c_fall').text)
            stock_cnt = int(row.find(class_='cur').text)
            stocks = []
            for a in row.find(class_='lhb_stocklist_box hide').p.find_all('a'):
                stocks.append((a.text, a['href']))
            result.append({
                'company': company,
                'detail_link': detail_link,
                'buy': buy,
                'sell': sell,
                'stock_cnt': stock_cnt,
                'stocks': stocks,
            })
    
    print('Company number:', len(result))
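
    The script above hard-codes end_page. As the comment notes, you could instead parse the overview page to detect the page count. A minimal sketch of that idea, assuming the pagination links embed the page index in a form like /desc/<n>/ (the regex pattern here is a hypothetical guess and must be checked against the real markup):

    import re
    import requests

    def detect_end_page(overview_url='http://data.10jqka.com.cn/market/longhu/yyb/'):
        # Hypothetical assumption: pagination links contain '/desc/<n>/';
        # adjust the pattern to whatever the live page actually emits.
        text = requests.get(overview_url).text
        pages = [int(m) for m in re.findall(r'/desc/(\d+)/', text)]
        return max(pages) if pages else 1

    end_page = detect_end_page()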
    

    I put all the data into a list of dictionaries for easy access. You can modify the code to write directly to a CSV file or any other format.
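
    For instance, a minimal sketch with the standard csv module, assuming the result list built above (the output file name is arbitrary, and the list of (name, link) stock tuples is flattened into one text column):

    import csv

    with open('longhu.csv', 'w', newline='', encoding='utf-8') as f:
        fieldnames = ['company', 'detail_link', 'buy', 'sell', 'stock_cnt', 'stocks']
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
        for record in result:
            row = dict(record)
            # Join the stock names into a single readable cell.
            row['stocks'] = '; '.join(name for name, _ in record['stocks'])
            writer.writerow(row)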