I want to scrape news from a site written in a Latin-script language, but some letters come out garbled.

This is my code:

import requests
from bs4 import BeautifulSoup

def xeberi_oxu(url):
    """Fetch one article page and append its title and body to the output file.

    The page at ``url`` is downloaded, parsed with BeautifulSoup, and the text
    of its ``h1.full-post-title`` and ``div.full-post-article`` elements is
    appended to the file named by the module-level ``output_file``.
    """
    response = requests.get(url)
    # The raw response bytes are handed to the parser undecoded.
    soup = BeautifulSoup(response.content, "html.parser")

    title_text = soup.find("h1", class_='full-post-title').text
    body_text = soup.find("div", class_='full-post-article').text

    # Append mode: every call adds one more article to the same file.
    with open(output_file, 'a', encoding='utf8') as file:
        file.write(f"Başlıq:{title_text}\n")
        file.write(f"{body_text}\n\n")

# Download the front page and collect every block of recent posts.
url = "https://xebertv.info/"
output_file = 'xeberler.txt'
front_page = requests.get(url)
front_soup = BeautifulSoup(front_page.content, "html.parser")
xeberler = front_soup.find_all("div", class_="last-posts-list")

# Lazily walk every anchor of every block, in document order, and hand
# its href to the article reader.
article_links = (
    anchor.get("href")
    for post_list in xeberler
    for anchor in post_list.find_all("a")
)
for article_url in article_links:
    xeberi_oxu(article_url)

When I run it, some letters in the output file are not displayed correctly.

I tried setting the file encoding to UTF-8, as you can see in the code, but it doesn't work.

Solution

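Try explicitly decoding the response body (the bytes in `.content`) as UTF-8 before handing it to BeautifulSoup, instead of letting the parser guess the encoding: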

import requests
from bs4 import BeautifulSoup


def xeberi_oxu(url, timeout=10):
    """Download one article page and return its title and body as text.

    The response body is decoded as UTF-8 explicitly before parsing, rather
    than passing raw bytes to BeautifulSoup and letting it guess the encoding.

    Args:
        url: Address of the article page.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A string made of the line ``Başlıq:<title>`` followed by a newline
        and the article text.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        ValueError: If the page has no ``h1.full-post-title`` or no
            ``div.full-post-article`` element.
    """
    # Without a timeout a stalled server would block the scraper forever.
    response = requests.get(url, timeout=timeout)
    # Stop on 4xx/5xx here instead of trying to parse an error page.
    response.raise_for_status()

    # <--- decode the content here; bytes that are not valid UTF-8 become
    # U+FFFD instead of aborting the whole run with UnicodeDecodeError.
    html = response.content.decode("utf-8", errors="replace")
    soup = BeautifulSoup(html, "html.parser")

    title_tag = soup.find("h1", class_="full-post-title")
    body_tag = soup.find("div", class_="full-post-article")
    # find() returns None when nothing matches; report which URL was the
    # problem rather than failing with an AttributeError on None.
    if title_tag is None or body_tag is None:
        raise ValueError(f"no article title/body found at {url!r}")

    return f"Başlıq:{title_tag.text}\n{body_tag.text}"


url = "https://xebertv.info/"
output_file = "xeberler.txt"

# Fetch the front page. The timeout stops a stalled server from hanging the
# script, and raise_for_status() stops an error page from being parsed.
response = requests.get(url, timeout=10)
response.raise_for_status()

# Decode as UTF-8 explicitly instead of letting BeautifulSoup guess the
# encoding from raw bytes; undecodable bytes are replaced, not fatal.
soup = BeautifulSoup(
    response.content.decode("utf-8", errors="replace"), "html.parser"
)
xeberler = soup.find_all("div", class_="last-posts-list")

out = []
for xeber in xeberler:
    for link in xeber.find_all("a"):
        href = link.get("href")
        # Anchors without a target cannot be fetched; skip them rather
        # than passing None to requests.
        if not href:
            continue
        out.append(xeberi_oxu(href))

# Write everything once at the end, articles separated by a blank line.
with open(output_file, "w", encoding="utf8") as f_out:
    print("\n\n".join(out), file=f_out)

The output file now contains the characters correctly:


....

Başlıq: Sabah 32 dərəcə isti olacaq  

...