I am trying to web-scrape the website "http://norumors.net/?post_type=rumors?post_type=rumors" to get only the news headlines and put them in a CSV file, using BeautifulSoup and Python. This is the code I am using, after looking at the HTML source "view-source:http://norumors.net/?post_type=rumors?post_type=rumors":
import urllib.request,sys,time
from bs4 import BeautifulSoup
import requests
import pandas as pd

pagesToGet= 1
upperframe=[]
for page in range(1,pagesToGet+1):
    print('processing page :', page)
    url = 'http://norumors.net/?post_type=rumors/?page='+str(page)
    print(url)
    #an exception might be thrown, so the code should be in a try-except block
    try:
        #use the browser to get the url. This is suspicious command that might blow up.
        page=requests.get(url) # this might throw an exception if something goes wrong.
    except Exception as e: # this describes what to do if an exception is thrown
        error_type, error_obj, error_info = sys.exc_info() # get the exception information
        print('ERROR FOR LINK:',url) #print the link that caused the problem
        print(error_type, 'Line:', error_info.tb_lineno) #print error info and line that threw the exception
        continue #ignore this page. Abandon this and go back.
    time.sleep(2)
    soup=BeautifulSoup(page.text,'html.parser')
    frame=[]
    links=soup.find_all('li',attrs={'class':'o-listicle__item'})
    print(len(links))
    filename="NEWS.csv"
    f=open(filename,"w", encoding = 'utf-8')
    headers="Statement,Link\n"
    f.write(headers)
    for j in links:
        Statement = j.find("div",attrs={'class':'row d-flex'}).text.strip()
        # Link = "http://norumors.net/"
        Link += j.find("div",attrs={'class':'col-lg-4 col-md-4 col-sm-6 col-xs-6'}).find('a')['href'].strip()
        frame.append((Statement,Link))
        f.write(Statement.replace(",","^")+","+Link+","+Date.replace(",","^")+","+Source.replace(",","^")+","+Label.replace(",","^")+"\n")
    upperframe.extend(frame)
    f.close()
data=pd.DataFrame(upperframe, columns=['Statement','Link'])
data.head()
But after I run the code, I get an empty pandas DataFrame and an empty CSV file. Any suggestion why that is? Note that I want to get the text between the tags.
If I understand correctly, you want to get the text of the news headlines and the href links to those news items, and then write them into a CSV file. The problem with your code is that the loop for j in links: is never executed, because soup.find_all('li',attrs={'class':'o-listicle__item'}) returns an empty list: there are no li tags with that class on this page. You should be careful with the names and classes of the tags you are searching for. The code below gets the news texts and their links, and it also writes them to the CSV file using pd.DataFrame.
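Before fixing the selectors, it can help to check which classes actually exist in the fetched HTML. This is just a quick debugging sketch for verifying selectors against the live page, not part of the fix itself:

import requests
from bs4 import BeautifulSoup

soup = BeautifulSoup(requests.get('http://norumors.net/?post_type=rumors').text, 'html.parser')
# Collect every CSS class that appears anywhere in the page, so you can
# confirm whether 'o-listicle__item' is really there.
all_classes = {c for tag in soup.find_all(class_=True) for c in tag['class']}
print(sorted(all_classes))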
import urllib.request,sys,time
from bs4 import BeautifulSoup
import requests
import pandas as pd

pagesToGet = 1
for page in range(1,pagesToGet+1):
    print('processing page :', page)
    url = 'http://norumors.net/?post_type=rumors/?page=' + str(page)
    print(url)
    # an exception might be thrown, so the code should be in a try-except block
    try:
        # fetch the url; this might throw an exception if something goes wrong.
        response = requests.get(url)  # named 'response' so it does not shadow the loop variable 'page'
    except Exception as e:  # this describes what to do if an exception is thrown
        error_type, error_obj, error_info = sys.exc_info()  # get the exception information
        print('ERROR FOR LINK:', url)  # print the link that caused the problem
        print(error_type, 'Line:', error_info.tb_lineno)  # print error info and the line that threw the exception
        continue  # ignore this page and move on
    soup = BeautifulSoup(response.text, 'html.parser')
    texts = []
    links = []
    filename = "NEWS.csv"
    # the headlines sit in column divs inside the 'row d-flex' container
    Statement = soup.find("div", attrs={'class': 'row d-flex'})
    divs = Statement.find_all("div", attrs={'class': 'col-lg-4 col-md-4 col-sm-6 col-xs-6'})
    for div in divs:
        # the headline text is stored in the thumbnail image's alt attribute
        txt = div.find("img", attrs={'class': 'rumor__thumb'})
        texts.append(txt['alt'])
        # the link to the full item is the anchor with class 'rumor--archive'
        lnk = div.find("a", attrs={'class': 'rumor--archive'})
        links.append(lnk['href'])
    data = pd.DataFrame(list(zip(texts, links)), columns=['Statement', 'Link'])
    data.to_csv(filename, encoding='utf-8', index=False)
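Equivalently, the two find/find_all passes can be collapsed into CSS selectors with soup.select (a sketch assuming the same class names as above):

texts = [img['alt'] for img in soup.select('div.row.d-flex img.rumor__thumb')]
links = [a['href'] for a in soup.select('div.row.d-flex a.rumor--archive')]

Note also that data.to_csv(filename, ...) rewrites the file on every pass through the loop; with pagesToGet = 1 that is fine, but if you scrape several pages you should accumulate the rows across iterations (as the upperframe list in your original code did) and write the CSV once after the loop.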