Search code examples
python selenium selenium-webdriver beautifulsoup pyvirtualdisplay

cannot print out the combined table properly from beautifulsoup


Because the table at this URL is combined, I cannot print it out as expected, and the output formatting is very strange. Thanks!

# -*- coding:UTF-8 -*-
"""Print the text of matching table cells from a page rendered by Firefox.

Firefox is started inside a pyvirtualdisplay virtual display, the page is
loaded, every ``<td>`` whose CSS class matches ``CELL_CLASS`` is collected,
and the cell texts are printed as one comma-separated line.
"""
import re

from bs4 import BeautifulSoup
from pyvirtualdisplay import Display
from selenium import webdriver

# Raw string so that ``\d`` is passed to the regex engine unchanged instead of
# being treated as an (invalid) string escape sequence.
CELL_CLASS = re.compile(r'table_eng_small_text_.\d')

display = Display(visible=0, size=(1024, 768))
display.start()
try:
    driver = webdriver.Firefox()
    try:
        driver.get("url")

        # page_source is already text, so it can be handed to the parser
        # directly without an encode/decode round trip.
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        rows = soup.find_all("td", {"class": CELL_CLASS})

        result = ','.join(r.text for r in rows)
        print(result)
    finally:
        # quit() ends the whole WebDriver session (close() only closes the
        # current window); run it even if loading or parsing failed.
        driver.quit()
finally:
    # Always shut the virtual display down, even when the browser fails.
    display.stop()

expected output:

1   10  SEASONS KING(T032)  12  7-1/4   7   4-3/4   1   1-1/4   1.09.35 25.06   22.31   21.98 
2   2   HAPPY SOUND(V107)   1   1/2 1   3/4 2   1-1/4   1.09.56 23.90   22.71   22.95 
3   14  NATURAL FRIENDSHIP(S359)    4   2-1/2   4   2-3/4   3   1-1/2   1.09.59 24.30   22.75   22.54 
4   13  LUCKY PLACE(T004)   14  10  13  6-1/2   4   3   1.09.84 25.50   22.15   22.19 
5   9   NO LAUGHING MATTER(V032)    9   5-3/4   9   5   5   3-1/2   1.09.89 24.82   22.59   22.48 
6   1   FREE NOVEMBER(T123) 5   4   5   3-3/4   6   4-1/2   1.10.07 24.54   22.67   22.86 
7   7   FRIENDS FOREVER(T079)   2   1/2 2   3/4 7   4-1/2   1.10.08 23.98   22.75   23.35 
8   5   REAL SUPREME(L247)  3   1-1/2   3   2-1/4   8   5-3/4   1.10.27 24.14   22.83   23.30 
9   6   BE THERE AHEAD(S193)    6   4-1/4   6   3-3/4   9   6-1/4   1.10.34 24.58   22.63   23.13 
10  8   GOLD PRECIOUS(P364) 13  8-1/4   11  6-1/2   10  6-3/4   1.10.41 25.22   22.43   22.76 
11  11  DUTCH WINDMILL(T288)    10  5-3/4   14  7   11  7   1.10.48 24.82   22.91   22.75 
12  3   HAPPY THREE(V162)   11  5-3/4   8   4-3/4   12  7   1.10.49 24.82   22.55   23.12 
13  4   SILVER GATSBY(T161) 8   5-1/2   12  6-1/2   13  7-1/2   1.10.56 24.78   22.87   22.91 
14  12  CHANS DELIGHT(P420) 7   4-1/2   10  5-3/4   14  9-3/4   1.10.92 24.62   22.91   23.39 

Solution

  • You could also try this solution, which just uses BeautifulSoup and requests:

    # Fetch the page with requests, pull the matching table cells out with
    # BeautifulSoup and print selected columns of every row, comma-separated.
    import re

    from bs4 import BeautifulSoup
    from requests import get

    URL = ("http://www.hkjc.com/english/racing/display_sectionaltime.asp?"
           "RaceDate=03/09/2016&Raceno=1&All=0#Race1")

    # number of non-empty cells that are grouped into one table row below
    CELLS_PER_ROW = 16

    # raw string keeps ``\d`` intact for the regex engine; importing the ``re``
    # module (instead of ``from re import compile``) avoids shadowing the
    # builtin ``compile``
    CELL_CLASS = re.compile(r'table_eng_small_text_.\d')

    # get html; a timeout stops the script from hanging on a dead connection
    # and raise_for_status() stops it from parsing an HTTP error page
    response = get(URL, timeout=30)
    response.raise_for_status()
    html = response.text
    soup = BeautifulSoup(html, 'lxml')

    # extract table cells
    rows = soup.find_all("td", {"class": CELL_CLASS})

    # get items without tabs, newlines etc.
    items = [r.text.replace('\t', '').replace('\n', '').replace('\r', '').strip()
             for r in rows]

    # remove empty items
    items = [item for item in items if item]

    # turn the flat list of cells into a list of rows
    table_rows = [items[i:i + CELLS_PER_ROW]
                  for i in range(0, len(items), CELLS_PER_ROW)]

    # format and print table contents: only the listed columns of each row
    # are kept, the others are skipped
    print('\n'.join(','.join(row[:4] + row[6:7] + row[9:10] + row[12:])
                    for row in table_rows))
    

    Which outputs:

    1,10,SEASONS KING(T032),12    7-1/4,7    4-3/4,1    1-1/4,1.09.35,25.06,22.31,21.98
    2,2,HAPPY SOUND(V107),1    1/2,1    3/4,2    1-1/4,1.09.56,23.90,22.71,22.95
    3,14,NATURAL FRIENDSHIP(S359),4    2-1/2,4    2-3/4,3    1-1/2,1.09.59,24.30,22.75,22.54
    4,13,LUCKY PLACE(T004),14    10,13    6-1/2,4    3,1.09.84,25.50,22.15,22.19
    ...