Search code examples
python, web-scraping, encoding, utf-8

I want to scrape news from a site written in Latin script, but some letters come out garbled


This is my code

import requests
from bs4 import BeautifulSoup

def xeberi_oxu(url):
    """Fetch one article page and append its title and body to the output file.

    The page at ``url`` is downloaded and parsed, the text of the
    ``h1.full-post-title`` and ``div.full-post-article`` elements is
    extracted, and both are appended to the file named by the module-level
    ``output_file`` variable.
    """
    response = requests.get(url)
    page = BeautifulSoup(response.content, "html.parser")

    # Extract both pieces of text before touching the output file.
    title = page.find("h1", class_='full-post-title').text
    body = page.find("div", class_='full-post-article').text

    # Append mode: each call adds one more article to the same file.
    with open(output_file, 'a', encoding='utf8') as fh:
        fh.write(f"Başlıq:{title}\n")
        fh.write(f"{body}\n\n")

# Script entry: walk every link inside the "last-posts-list" blocks of the
# front page and hand each one to xeberi_oxu().
url = "https://xebertv.info/"
output_file = 'xeberler.txt'
front_page = BeautifulSoup(requests.get(url).content, "html.parser")

for post_list in front_page.find_all("div", class_="last-posts-list"):
    for anchor in post_list.find_all("a"):
        xeberi_oxu(anchor.get("href"))

When I run it, some letters in the output file are not displayed correctly.

I tried setting the encoding to UTF-8, as you can see in the code, but it doesn't work.


Solution

  • Try explicitly decoding request.content as UTF-8:

    import requests
    from bs4 import BeautifulSoup
    
    
    def xeberi_oxu(url):
        """Download one article page and return its title and body as text.

        The response bytes are decoded as UTF-8 explicitly instead of being
        left to BeautifulSoup's encoding detection.

        Args:
            url: Address of the article page to fetch.

        Returns:
            A string made of ``Başlıq:`` followed by the title text, a newline,
            and the article text.

        Raises:
            requests.HTTPError: If the server answers with an error status.
            AttributeError: If the page has no ``h1.full-post-title`` or
                ``div.full-post-article`` element (``find`` returns ``None``).
        """
        # Without a timeout a single unresponsive server would hang the run.
        request = requests.get(url, timeout=30)
        # Fail loudly on 4xx/5xx instead of parsing an error page as an article.
        request.raise_for_status()

        # errors="replace" keeps one malformed byte from aborting the scrape;
        # it shows up as U+FFFD in the output instead.
        html = request.content.decode("utf-8", errors="replace")    # <--- decode the content here
        bs4 = BeautifulSoup(html, "html.parser")
        title = bs4.find("h1", class_="full-post-title").text
        content = bs4.find("div", class_="full-post-article").text

        return f"Başlıq:{title}\n{content}"
    
    
    # Script entry: collect the text of every article linked from the
    # "last-posts-list" blocks of the front page, then write it all out at once.
    url = "https://xebertv.info/"
    output_file = "xeberler.txt"
    # Without a timeout an unresponsive server would hang the script forever.
    request = requests.get(url, timeout=30)
    bs4 = BeautifulSoup(request.content, "html.parser")
    xeberler = bs4.find_all("div", class_="last-posts-list")
    
    out = []
    for xeber in xeberler:
        for link in xeber.find_all("a"):
            href = link.get("href")
            # <a> tags without an href yield None, which requests.get() rejects.
            if not href:
                continue
            out.append(xeberi_oxu(href))
    
    # "w" mode: the file is rewritten on each run rather than appended to.
    with open(output_file, "w", encoding="utf8") as f_out:
        print('\n\n'.join(out), file=f_out)
    

    This writes the characters to the output file correctly:

    
    ....
    
    Başlıq: Sabah 32 dərəcə isti olacaq  
    
    ...