python web-scraping post python-requests

Replicating the same POST request (inspected in DeveloperTools) fails to scrape data

I'm trying to grab the tabular data from this page without using BeautifulSoup to parse the resulting HTML: https://finance.vietstock.vn/ket-qua-giao-dich/vietnam.aspx?tab=thong-ke-gia&exchange=1&code=-19

Looking at DeveloperTools, I found the request KQGDThongKeGiaStockPaging. Apparently it's a POST request to the URL https://finance.vietstock.vn/data/KQGDThongKeGiaStockPaging (which shows nothing if I just copy and paste it into a browser).

I copied the request headers and the payload and ran the POST request, but I always get back a response whose text is the content of the URL https://finance.vietstock.vn/data/KQGDThongKeGiaStockPaging, which basically says: there's nothing here.

I tried ChatGPT and searched Stack Overflow questions. I wondered whether it fails because the data is dynamically loaded (I'm new to this, so I don't really understand the term) and whether I need a VerificationToken generated on the fly. So I tried - in one session - to make a request to the page URL to get the token, copy it into the payload and run the POST again. It still didn't work (I always get the "there's nothing here" content).

I know I can use BeautifulSoup to read the table from the HTML, or even Selenium, but I would like to keep it simple with GET/POST requests.

import requests
from bs4 import BeautifulSoup

# Function to extract __RequestVerificationToken from the page
def extract_verification_token(html):
    """Return the ``__RequestVerificationToken`` value found in *html*.

    The first ``<input>`` element whose ``name`` attribute is
    ``__RequestVerificationToken`` is located and its ``value`` attribute
    is returned.

    Args:
        html: Markup of the page to search.

    Returns:
        The token string, or ``None`` when no such input exists or the
        input has no ``value`` attribute.
    """
    soup = BeautifulSoup(html, 'html.parser')
    token_input = soup.find('input', {'name': '__RequestVerificationToken'})
    if token_input is None:
        return None
    # .get() keeps the "None means no token" contract even when the element
    # is present but lacks a value attribute (indexing would raise KeyError).
    return token_input.get('value')

# Page that embeds the verification token, and the endpoint that serves the data
page_url = "https://finance.vietstock.vn/ket-qua-giao-dich/vietnam.aspx?tab=thong-ke-gia&exchange=1&code=-19"
url = "https://finance.vietstock.vn/data/KQGDThongKeGiaStockPaging"

# Form fields for the data request; the token is added once it is known
initial_payload = {
    "page": 1,
    "pageSize": 20,
    "catID": 1,
    "stockID": -19,
    "fromDate": "2023-12-07",
    "toDate": "2024-01-07"
}

# Browser-like headers sent with both requests
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
    "Referer": page_url,
}

# Step 1: GET the HTML page. Both calls below use the module-level
# requests functions, so each one is an independent request.
page_response = requests.get(page_url, headers=headers)

if page_response.status_code != 200:
    print(f"Error fetching the page: {page_response.status_code}")
else:
    verification_token = extract_verification_token(page_response.text)
    if not verification_token:
        print("Verification token not found.")
    else:
        # Step 2: POST the payload, now carrying the token, to the data endpoint
        initial_payload['__RequestVerificationToken'] = verification_token
        api_response = requests.post(url, data=initial_payload, headers=headers)

        if api_response.status_code != 200:
            print(f"Error: {api_response.status_code}")
            print(api_response.text)
        else:
            # The body is expected to be JSON holding the tabular data
            data = api_response.json()
            print(data)

Solution

To get the data from the site you need to send the cookies set by the server (by using one session for both requests) and also the verification token taken from the page's HTML:

import requests
from bs4 import BeautifulSoup

url = "https://finance.vietstock.vn/ket-qua-giao-dich/vietnam.aspx?tab=thong-ke-gia&exchange=1&code=-19"
api_url = "https://finance.vietstock.vn/data/KQGDThongKeGiaStockPaging"

# requests waits forever by default; bound every call so the script cannot hang.
TIMEOUT = 30  # seconds

# One Session is used for both requests so that the cookies set by the page
# request are sent along with the API request, together with the form token.
with requests.Session() as s:
    s.headers.update(
        {
            "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:121.0) Gecko/20100101 Firefox/121.0"
        }
    )

    # Load the HTML page: this stores the cookies and exposes the token.
    page = s.get(url, timeout=TIMEOUT)
    page.raise_for_status()

    soup = BeautifulSoup(page.content, "html.parser")
    token_input = soup.select_one('input[name="__RequestVerificationToken"]')
    if token_input is None:
        # Without this check the failure would be an opaque
        # "'NoneType' object is not subscriptable" TypeError.
        raise RuntimeError("__RequestVerificationToken input not found in the page")
    token = token_input["value"]

    payload = {
        "page": "1",
        "pageSize": "20",
        "catID": "1",
        "stockID": "-19",
        "fromDate": "2023-12-07",
        "toDate": "2024-01-07",
        "__RequestVerificationToken": token,
    }

    response = s.post(api_url, data=payload, timeout=TIMEOUT)
    # Surface HTTP errors directly instead of as a JSON decoding failure.
    response.raise_for_status()
    data = response.json()
    print(data)

Prints:

[[{'CloseIndex': 1154.68, 'PriorIndex': 1150.72, 'Change': 3.96, 

...