Search code examples
python python-3.x beautifulsoup webrequest dataformat

Formatting Python BeautifulSoup data and removing duplicate first-column values


I have the following snippet, which already works. However, I would like to clean up the formatting a bit by not repeating duplicate values in the first column, to make the output more readable.

from urllib.request import Request, urlopen
from bs4 import BeautifulSoup
import re, random, ctypes
import requests
from time import sleep

from urllib.parse import urljoin  # urljoin is used below and lives in urllib.parse

url = 'https://bscscan.com/tokentxns'

# Pool of browser User-Agent strings; one is picked at random per run.
# These are plain header *values*: main() wraps the chosen one in a
# {'User-Agent': ...} dict, which is the shape requests expects for `headers=`.
user_agent_list = [
    'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:86.0) Gecko/20100101 Firefox/86.0',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.1.1 Safari/605.1.15',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1667.0 Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.67 Safari/537.36',
]

# Random value between 1.0 and 3.0; computed but not used anywhere below.
pausesleep = float(random.randint(10000, 30000)) / 10000  # orig


def format_row(txnhash, value, token):
    """Return one output line: hash, two spaces, value, three spaces, token."""
    return "{}  {}   {}".format(txnhash, value, token)


def parse_row(cells):
    """Extract the fields of interest from the <td> cells of one table row.

    Returns a dict with the text of cells 1, 2, 7 and 8 under the keys
    'txnhash', 'age', 'value' and 'token', plus 'link': the href of the first
    <a> in cell 8 resolved against `url`, or None when that cell has no <a>.
    The caller must make sure `cells` has at least 9 entries.
    """
    anchor = cells[8].find('a')
    return {
        'txnhash': cells[1].text,
        'age': cells[2].text,
        'value': cells[7].text,
        'token': cells[8].text,
        'link': urljoin(url, anchor['href']) if anchor is not None else None,
    }


def main():
    """Download `url` and print one formatted line per data row of its first table."""
    headers = {'User-Agent': random.choice(user_agent_list)}
    # The header must go through the `headers=` keyword: the second positional
    # argument of requests.get() is `params` (the query string), not headers.
    response = requests.get(url, headers=headers, timeout=10)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')

    table = soup.find('table')
    if table is None:
        raise SystemExit('no <table> found in the response from ' + url)

    # The first <tr> is skipped, as before (presumably the header row).
    for row in table.find_all('tr')[1:]:
        cells = row.find_all('td')
        if len(cells) < 9:
            # Not a full data row; indices up to 8 are needed.
            continue
        record = parse_row(cells)
        print(format_row(record['txnhash'], record['value'], record['token']))


if __name__ == '__main__':
    main()

Current Output:

0x70e16e1cbcd30d1c3a2abb03a3d3c43fc324aa794c45b10cd5ef1001e9af0915  899.885819768    TrusterCoin (TSC)
0x70e16e1cbcd30d1c3a2abb03a3d3c43fc324aa794c45b10cd5ef1001e9af0915  0.62679168    Wrapped BNB (WBNB)
0x52d862d3f920370d84039f2dccb40edc7343699310d3436b71738d4176997398  388,214,984,514.909719227    WoofCoin (WOOF)
0x52d862d3f920370d84039f2dccb40edc7343699310d3436b71738d4176997398  0.003    Wrapped BNB (WBNB)
0x4fe83f2ebad772b4292e81f418a6f54572f7462934358a356787f8d777c58c8b  26.737674146727101117    Binance-Peg ... (BUSD)
0x4fe83f2ebad772b4292e81f418a6f54572f7462934358a356787f8d777c58c8b  1.251364193609566793    Binance-Peg ... (ADA)
0x4fe83f2ebad772b4292e81f418a6f54572f7462934358a356787f8d777c58c8b  0.03997685638568537    Binance-Peg ... (ADA)
0x4fe83f2ebad772b4292e81f418a6f54572f7462934358a356787f8d777c58c8b  0.041171860015645402    Binance-Peg ... (ADA)
0x4fe83f2ebad772b4292e81f418a6f54572f7462934358a356787f8d777c58c8b  0.089939749761843203    Wrapped BNB (WBNB)

Wanted Improvement:

0x70e16e1cbcd30d1c3a2abb03a3d3c43fc324aa794c45b10cd5ef1001e9af0915  899.885819768                 TrusterCoin (TSC)
                                                                    0.62679168                    Wrapped BNB (WBNB)
0x52d862d3f920370d84039f2dccb40edc7343699310d3436b71738d4176997398  388,214,984,514.909719227     WoofCoin (WOOF)
                                                                    0.003                         Wrapped BNB (WBNB)
0x4fe83f2ebad772b4292e81f418a6f54572f7462934358a356787f8d777c58c8b  26.737674146727101117         Binance-Peg ... (BUSD)
                                                                    1.251364193609566793          Binance-Peg ... (ADA)
                                                                    0.03997685638568537           Binance-Peg ... (ADA)
                                                                    0.041171860015645402          Binance-Peg ... (ADA)
                                                                    0.089939749761843203          Wrapped BNB (WBNB)

Solution

  • Try this:

    from urllib.request import Request, urlopen,urljoin
    from bs4 import BeautifulSoup
    import re, random, ctypes
    import requests
    from time import sleep
    
    url = 'https://bscscan.com/tokentxns'

    # Pool of browser User-Agent strings; one is picked at random per run.
    # These are plain header *values*: main() wraps the chosen one in a
    # {'User-Agent': ...} dict, which is the shape requests expects for `headers=`.
    user_agent_list = [
        'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:86.0) Gecko/20100101 Firefox/86.0',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.1.1 Safari/605.1.15',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36',
        'Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1667.0 Safari/537.36',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.67 Safari/537.36',
    ]

    # Random value between 1.0 and 3.0; computed but not used anywhere below.
    pausesleep = float(random.randint(10000, 30000)) / 10000


    def format_rows(records, hash_gap=2, value_gap=5):
        """Lay out (txnhash, value, token) records as aligned text lines.

        The hash column is as wide as the longest hash plus `hash_gap` spaces
        and the value column as wide as the longest value plus `value_gap`
        spaces, so the three columns line up on every row.

        A hash is shown only on the first row of a run of equal hashes; on the
        following rows of that run the column is left blank. The comparison is
        with the row directly above (not with every hash seen so far), so a
        blank cell always means "same transaction as the previous line".

        Returns a list of strings, one per record (empty list for no records).
        """
        records = list(records)
        hash_width = max((len(r[0]) for r in records), default=0) + hash_gap
        value_width = max((len(r[1]) for r in records), default=0) + value_gap

        lines = []
        previous_hash = None
        for txnhash, value, token in records:
            shown = '' if txnhash == previous_hash else txnhash
            lines.append(shown.ljust(hash_width) + value.ljust(value_width) + token)
            previous_hash = txnhash
        return lines


    def parse_row(cells):
        """Extract the fields of interest from the <td> cells of one table row.

        Returns a dict with the whitespace-stripped text of cells 1, 2, 7 and 8
        under the keys 'txnhash', 'age', 'value' and 'token', plus 'link': the
        href of the first <a> in cell 8 resolved against `url`, or None when
        that cell has no <a>. The caller must make sure `cells` has at least
        9 entries.
        """
        anchor = cells[8].find('a')
        return {
            # Stripped so stray surrounding whitespace cannot break the padding.
            'txnhash': cells[1].text.strip(),
            'age': cells[2].text.strip(),
            'value': cells[7].text.strip(),
            'token': cells[8].text.strip(),
            'link': urljoin(url, anchor['href']) if anchor is not None else None,
        }


    def main():
        """Download `url` and print its first table as aligned, de-duplicated rows."""
        headers = {'User-Agent': random.choice(user_agent_list)}
        # The header must go through the `headers=` keyword: the second positional
        # argument of requests.get() is `params` (the query string), not headers.
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')

        table = soup.find('table')
        if table is None:
            raise SystemExit('no <table> found in the response from ' + url)

        records = []
        # The first <tr> is skipped, as before (presumably the header row).
        for row in table.find_all('tr')[1:]:
            cells = row.find_all('td')
            if len(cells) < 9:
                # Not a full data row; indices up to 8 are needed.
                continue
            record = parse_row(cells)
            records.append((record['txnhash'], record['value'], record['token']))

        for line in format_rows(records):
            print(line)


    if __name__ == '__main__':
        main()
    

    We keep track of the txnhash values that have already been printed and check each new row's txnhash against them, so that a hash is printed only the first time it appears.