I'm scraping data from a site in French using MS XML 6.0, but some letters (é, for example) are not being recognized correctly.
Code:
' Fetch the listing page with a synchronous GET and load the returned
' markup into the body of an in-memory HTML document.
Const PAGE_URL As String = "http://www.emploi.nat.tn/fo/Fr/global.php?page=146&menu1=&FormLinks_Sorting=1&FormLinks_Sorted=&num_page=0&limit=500&numpage=1"

Dim request As XMLHTTP
Set request = New XMLHTTP
request.Open "GET", PAGE_URL, False ' third argument False: send does not return until the response has arrived
request.send

Dim page As HTMLDocument
Set page = New HTMLDocument
' responseText is the response already converted to a string by MSXML
page.body.innerHTML = request.responseText
It seems responseText is being decoded as UTF-8. Is there a workaround?
You first need to decode the page yourself, since its encoding is windows-1256. Then write the HTML directly into the document rather than into the body:
Sub UsageExample()
    ' Downloads the listing page and hands the raw response bytes to
    ' WriteDocument so they are decoded as windows-1256 before being
    ' written into an HTML document.
    Const PAGE_URL As String = "http://www.emploi.nat.tn/fo/Fr/global.php?page=146&menu1=&FormLinks_Sorting=1&FormLinks_Sorted=&num_page=0&limit=500&numpage=1"
    Const PAGE_CHARSET As String = "windows-1256"

    ' Requires a reference to: Microsoft XML, v6.0
    Dim http As MSXML2.ServerXMLHTTP60
    Set http = New MSXML2.ServerXMLHTTP60
    http.Open "GET", PAGE_URL, False ' third argument False: synchronous request
    http.Send

    ' Requires a reference to: Microsoft HTML Object Library
    Dim page As MSHTML.HTMLDocument
    Set page = New MSHTML.HTMLDocument

    ' responseBody (raw bytes) is passed instead of responseText so the
    ' charset conversion is done explicitly by WriteDocument.
    WriteDocument page, http.responseBody, PAGE_CHARSET
End Sub
Private Sub WriteDocument(document As Object, data, charset As String)
    ' Converts the bytes in data to text using the given charset and
    ' writes that text into document.
    '   document - object exposing Open / Write / Close (late bound)
    '   data     - the raw bytes to decode
    '   charset  - character set name used for the conversion
    ' Requires a reference to: Microsoft ActiveX Data Objects 6.1 Library
    Const STREAM_BINARY As Long = 1 ' stream Type value used while the bytes are written
    Const STREAM_TEXT As Long = 2   ' stream Type value used while the text is read back

    With New ADODB.stream
        .Open

        ' Load the bytes in binary mode, then rewind to the start.
        .Type = STREAM_BINARY
        .Write data
        .Position = 0

        ' Switch the same stream to text mode and choose the decoder.
        .Type = STREAM_TEXT
        .charset = charset

        ' Replace the document content with the decoded text.
        document.Open
        document.Write .ReadText
        document.Close

        .Close
    End With
End Sub