Search code examples
javascript, html, beautifulsoup, pyqt4, python-3.6

How to parse the text from an anchor tag?


I want to parse this "<a href="javascript:8==99999?popDuelloDialog(2754288):popTeam(2386)">Gnistan</a>" and extract the text.

I have tried many different approaches, but none of them worked.

I don't know how to build a method for this format: the href always starts with "javascript:", but the numbers that follow are different for every link and do not repeat. So I need a method that matches only the repeating part and extracts the text in the body of the tag.

My code is here:

import sys
from PyQt4.QtGui import QApplication
from PyQt4.QtCore import QUrl
from PyQt4.QtWebKit import QWebPage
import bs4 as bs
import urllib.request
import re
from bs4 import BeautifulSoup

class Client(QWebPage):
    """WebKit page that loads a URL and blocks until the load has finished.

    The constructor starts the Qt event loop and only returns once the
    ``loadFinished`` signal has fired, so the rendered page can be read from
    ``mainFrame()`` immediately afterwards.
    """

    def __init__(self, url):
        """Load *url* and run the event loop until the page has loaded.

        Args:
            url: Address of the page to render, as a string.
        """
        # Qt permits only one QApplication per process; reuse the existing
        # one so that a second Client can be created in the same run.
        app = QApplication.instance()
        if app is None:
            app = QApplication(sys.argv)
        self.app = app

        QWebPage.__init__(self)
        self.loadFinished.connect(self.on_page_load)
        self.mainFrame().load(QUrl(url))
        # Blocks here until on_page_load() calls quit().
        self.app.exec_()

    def on_page_load(self):
        """Stop the event loop so that the constructor can return."""
        self.app.quit()

url = 'http://www.mackolik.com/Genis-Iddaa-Programi'
client_response = Client(url)
source = client_response.mainFrame().toHtml()
soup = bs.BeautifulSoup(source, 'html.parser')

# Match only the constant "javascript:" prefix of the href; the numbers that
# follow differ for every link. The filter must be passed as the ``href``
# keyword: a bare string in the second positional slot of find_all() is
# interpreted as a CSS class name, which never matches these anchors.
hreff = soup.find_all("a", href=re.compile(r"^javascript:"))

# Iterating the result yields the Tag objects themselves (not indexes), so
# read the text straight from each tag.
for anchor in hreff:
    print(anchor.get_text())

Solution

  • IIUC, all you need is to make BeautifulSoup find every anchor tag that has "javascript" in its href attribute. However, it seems that the content you want to parse is generated by JavaScript, which would require using selenium and a webdriver such as ChromeDriver. With BeautifulSoup and requests alone, we can see that the content you probably want is not present in the HTML code, but the logic for solving your issue would be this:

    from bs4 import BeautifulSoup
    import requests

    url = "http://www.mackolik.com/Genis-Iddaa-Programi"
    data = requests.get(url).text
    soup = BeautifulSoup(data, 'html.parser')

    # href=True keeps only anchors that actually carry an href attribute;
    # indexing tag['href'] on an anchor without one would raise KeyError.
    for tag in soup.find_all('a', href=True):
        if "javascript" in tag['href']:
            print(tag.text)
    

    The code above checks if the substring "javascript" is in the href attribute and prints the tag's text if true.

    With selenium and ChromeDriver the logic is pretty much the same, but we need other methods:

    from selenium import webdriver
    from selenium.webdriver.common.by import By

    url = "http://www.mackolik.com/Genis-Iddaa-Programi"
    driver = webdriver.Chrome()
    try:
        driver.get(url)

        # find_elements(By.TAG_NAME, ...) replaces the find_elements_by_*
        # helpers, which were removed in Selenium 4.
        for tag in driver.find_elements(By.TAG_NAME, "a"):
            # get_attribute() returns None for anchors without an href;
            # fall back to "" so the substring test cannot raise TypeError.
            href = tag.get_attribute("href") or ""
            if "javascript" in href:
                print(tag.text)
    finally:
        # Always close the browser, even if loading or parsing fails.
        driver.quit()