Search code examples
python-3.x web-scraping urllib

How can I get a JSON response in English when it is returned in Vietnamese?


Open the link https://www.hsx.vn/Modules/Listed/Web/Symbols?fid=18b12d5d2d554559bf10eeb90304ff2e in a browser with its inspect tool open. Click the ENGLISH button in the right corner, then `listing--listing list`; the JSON response is in English.

The first element, shown below, has its content in English.

{
    "cell": [
        624,
        "AAA",
        "VN000000AAA4",
        "BBG000BB42R4",
        "An Phat Bioplastics Joint Stock Company ",
        "382,274,496.00",
        "382,274,496.00",
        "10/6/2016"
    ]
}

enter image description here

I built a request with urllib.request, based on what the Network tab of the inspect tool shows.

enter image description here

import gzip
import json
import urllib.parse
import urllib.request

# Endpoint queried by the request below; the query string is appended to it.
url_root = "https://www.hsx.vn/Modules/Listed/Web/SymbolList"
params = {
    "pageFieldName1": "Code",
    "pageFieldValue1": "",
    "pageFieldValue2": "",
    "pageFieldOperator2": "",
    "pageFieldOperator3": "",
    "pageFieldValue4": "",
    "pageFieldOperator4": "",
    "pageFieldOperator1": "eq",
    "pageFieldName2": "Sectors",
    "pageFieldName3": "Sector",
    "pageFieldValue3": "00000000-0000-0000-0000-000000000000",
    "pageFieldName4": "StartWith",
    "pageCriteriaLength": "4",
    "_search": "false",
    "rows": 10,
    "page": "1",
    "sidx": "id",
    "sord": "desc",
}
query_string = urllib.parse.urlencode(params)
url = url_root + "?" + query_string
headers = {
    # Advertise only gzip: it is the only content encoding decoded below,
    # so the server must not be invited to answer with deflate or brotli.
    "Accept-Encoding": "gzip",
    "Accept-Language": "en-GB,en-US;q=0.9,en;q=0.8",
    "Cache-Control": "max-age=0",
    "Connection": "keep-alive",
    "X-Requested-With": "XMLHttpRequest",
    "Host": "www.hsx.vn",
    "Accept": "application/json, text/javascript, */*; q=0.01",
    "Sec-Fetch-Dest": "empty",
    "Sec-Fetch-Mode": "cors",
    "Sec-Fetch-Site": "same-origin",
    "Referer": "https://www.hsx.vn/Modules/Listed/Web/Symbols?fid=18b12d5d2d554559bf10eeb90304ff2e",
    # Adjacent string literals are concatenated without pulling source
    # indentation into the header value (a backslash continuation would).
    "User-Agent": (
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/106.0.0.0 Safari/537.36"
    ),
}
req = urllib.request.Request(url=url, method='GET', headers=headers)
with urllib.request.urlopen(req) as response:
    response_text = response.read()
    # Decompress only when the server actually gzip-encoded the body;
    # an uncompressed reply would make gzip.decompress raise.
    if response.headers.get("Content-Encoding", "").lower() == "gzip":
        content = gzip.decompress(response_text)
    else:
        content = response_text
    data = content.decode('utf-8')
    data = json.loads(data)
    # Keep only the list stored under the "rows" key of the JSON document.
    data = data['rows']

The first element I got is in Vietnamese:

data[0]
{'id': 624, 'cell': [624, 'AAA', 'VN000000AAA4', 'BBG000BB42R4', 'Công ty Cổ phần Nhựa An Phát Xanh', '382.274.496,00', '382.274.496,00', '06/10/2016']}

How can I get the JSON in English?


Solution

  • You must change the language first and then request the JSON.

    Here's how to do it:

    from urllib.parse import urlencode

    import requests

    # Browser-like headers sent with both requests.
    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36 Edg/112.0.1722.34",
        "Accept-Language": "en-US,en;q=0.9",
    }

    # Query parameters appended to the symbol-list endpoint.
    payload = {
        "pageFieldName1": "Code",
        "pageFieldValue1": "",
        "pageFieldValue2": "",
        "pageFieldOperator2": "",
        "pageFieldOperator3": "",
        "pageFieldValue4": "",
        "pageFieldOperator4": "",
        "pageFieldOperator1": "eq",
        "pageFieldName2": "Sectors",
        "pageFieldName3": "Sector",
        "pageFieldValue3": "00000000-0000-0000-0000-000000000000",
        "pageFieldName4": "StartWith",
        "pageCriteriaLength": "4",
        "_search": "false",
        "rows": 30,
        "page": "1",
        "sidx": "id",
        "sord": "desc",
    }

    endpoint = "https://www.hsx.vn/Modules/Listed/Web/SymbolList?"
    en_url = "https://www.hsx.vn/Common/ChangeLanguage/9e054dac-a75b-423f-95f6-54d3f73d4e53"
    # Seconds to wait for each response, so a stalled server cannot hang the script.
    timeout = 30

    with requests.Session() as s:
        # Switch the site language first. Both requests go through the same
        # session; presumably the language choice is kept in a cookie -- confirm.
        # Fail loudly if the switch did not succeed instead of silently
        # fetching data in the wrong language.
        s.get(en_url, headers=headers, timeout=timeout).raise_for_status()

        # Build the AJAX headers as a new dict rather than mutating `headers`.
        xhr_headers = {**headers, "X-Requested-With": "XMLHttpRequest"}
        response = s.get(
            f"{endpoint}{urlencode(payload)}",
            headers=xhr_headers,
            timeout=timeout,
        )
        # Surface HTTP errors here rather than as a confusing JSON decode error.
        response.raise_for_status()
        data = response.json()["rows"]
        print(data[0]["cell"][4])
    

    This should output:

    An Phat Bioplastics Joint Stock Company