Tags: python, r, web-scraping

How to scrape a web table with multiple pages using R or Python


I want to scrape a website to gather data for a data-mining study. The site contains a big table spread across 43 pages, and it also hides some stocks behind an expand menu on the far right-hand side.


The web page is below.

http://data.10jqka.com.cn/market/longhu/yyb/

import bs4
import requests


url = "http://data.10jqka.com.cn/market/longhu/yyb/"

response = requests.get(url)
response.raise_for_status()  # fail loudly instead of leaving content undefined
content = response.content

soup = bs4.BeautifulSoup(content, "html.parser")
table_results = soup.findAll("table", {"class": "m_table"})
for item in table_results:
    company_name = item.findAll("td", {"class": "tl"})[0].text.strip()
    detail = item.findAll("td", {"class": "tc"})[0].text.strip()
    c_rise = item.findAll("td", {"class": "c_rise"})[0].text.strip()
    c_fall = item.findAll("td", {"class": "c_fall"})[0].text.strip()
    cur = item.findAll("td", {"class": "cur"})[0].text.strip()
    lhb_stocklist = item.findAll("div", {"class": "lhb_stocklist"})[0].text.strip()
    print(company_name, detail, c_rise, c_fall, cur, lhb_stocklist)

Solution

  • A solution based on requests, BeautifulSoup, and lxml. The table is populated through an AJAX endpoint that returns JSON whose data field contains the rendered table HTML for one page, so each page can be fetched directly:

    import json
    import requests
    from bs4 import BeautifulSoup
    
    URL = 'http://data.10jqka.com.cn/interface/market/longhuyyb/stocknum/desc/%d/20'
    # configure end_page as needed, or parse http://data.10jqka.com.cn/market/longhu/yyb/ to detect it automatically (see the sketch below)
    end_page = 2
    
    result = []
    for page_idx in range(1, end_page + 1):
        print('Extracting page', page_idx)
        raw_response = requests.get(URL % page_idx)
        page_content = json.loads(raw_response.text)['data']
        html = BeautifulSoup(page_content, 'lxml')
        for row in html.tbody.find_all('tr'):
            company = row.find(class_='tl').text
            detail_link = row.find(class_='tl').a['href']
            buy = float(row.find(class_='c_rise').text)
            sell = float(row.find(class_='c_fall').text)
            stock_cnt = int(row.find(class_='cur').text)
            stocks = []
            for a in row.find(class_='lhb_stocklist_box hide').p.find_all('a'):
                stocks.append((a.text, a['href']))
            result.append({
                'company': company,
                'detail_link': detail_link,
                'buy': buy,
                'sell': sell,
                'stock_cnt': stock_cnt,
                'stocks': stocks,
            })
    
    print('Company number:', len(result))
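
    The script above hard-codes end_page. As the comment notes, you could instead parse the overview page to detect the page count. A minimal sketch of that idea, assuming the pagination links embed the page index in a form like /desc/<n>/ (the regex pattern here is a hypothetical guess and must be checked against the real markup):

    import re
    import requests

    def detect_end_page(overview_url='http://data.10jqka.com.cn/market/longhu/yyb/'):
        # Hypothetical assumption: pagination links contain '/desc/<n>/';
        # adjust the pattern to whatever the live page actually emits.
        text = requests.get(overview_url).text
        pages = [int(m) for m in re.findall(r'/desc/(\d+)/', text)]
        return max(pages) if pages else 1

    end_page = detect_end_page()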
    

    I put all the data into a list of dictionaries for easy access. You can modify the code to write directly to a CSV file or any other format.
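
    For instance, a minimal sketch with the standard csv module, assuming the result list built above (the output file name is arbitrary, and the list of (name, link) stock tuples is flattened into one text column):

    import csv

    with open('longhu.csv', 'w', newline='', encoding='utf-8') as f:
        fieldnames = ['company', 'detail_link', 'buy', 'sell', 'stock_cnt', 'stocks']
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
        for record in result:
            row = dict(record)
            # Join the stock names into a single readable cell.
            row['stocks'] = '; '.join(name for name, _ in record['stocks'])
            writer.writerow(row)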