Search code examples
python web-scraping beautifulsoup tiktok

How to extract a link inside a <script> tag


I'm trying to get the link to the .mp3 file of a TikTok sound. The problem is that I can't extract it because it's inside a <script> tag.
I'm using pycurl instead of requests.

All I need is to extract the following from the response, and then extract the URL from "UrlList": "playUrl":{"Uri":"musically-maliva-obj/7038595527327419141.mp3","UrlList":["https://sf16-ies-music-va.tiktokcdn.com/obj/musically-maliva-obj/7038595527327419141.mp3"]}

import pycurl
from io import BytesIO
import certifi
from bs4 import BeautifulSoup


# Fetch a TikTok sound page with pycurl and try to locate the <script> tag
# that holds the sound's metadata.
# Short share link; presumably it redirects to the full sound page, which is
# why FOLLOWLOCATION is enabled below.
url = "https://vm.tiktok.com/ZML1t1vW7/"
buffer = BytesIO()  # in-memory sink for the response body
c = pycurl.Curl()
c.setopt(pycurl.CAINFO, certifi.where())
c.setopt(c.URL, url)
# NOTE(review): the next two options turn off TLS certificate and host-name
# verification, so the CA bundle configured above is effectively unused.
c.setopt(pycurl.SSL_VERIFYPEER, 0)
c.setopt(pycurl.SSL_VERIFYHOST, 0)
# Send a mobile (iPhone) User-Agent header with the request.
c.setopt(pycurl.HTTPHEADER, ["User-Agent: Mozilla/5.0 (iPhone; CPU iPhone OS 12_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148"])
c.setopt(c.WRITEDATA, buffer)
c.setopt(c.FOLLOWLOCATION, True)
c.perform()
c.close()
body = buffer.getvalue()  # raw response bytes
response = body.decode('utf-8')  # the page as text
# Earlier attempts at slicing the page by hand (left disabled):
#response = response.split('"')
#response = response[1]
#response = response.split('.html?')
#response= response[0]
a = response.split("'")  # yields a list of fragments, but I don't know how to search it; `a` is not used afterwards
soup = BeautifulSoup(response, 'html.parser')  # parse the decoded text, since response is a str
link = soup.find("script", id="sigi-persisted-data")  # I tried bs4 here but could not get a result
print(link)

Solution

  • You can extract the JSON data from the script, parse it into a dictionary, and then navigate that dictionary to get the URL (json_data["/music/*-:id"]["musicData"]["playUrl"]["UrlList"][0])

    import pycurl
    from io import BytesIO
    import certifi
    from bs4 import BeautifulSoup
    import re
    import json
    
    
    # Download a TikTok sound page and print the first .mp3 URL found in the
    # JSON assigned to window.__INIT_PROPS__ inside one of its <script> tags.
    url = "https://vm.tiktok.com/ZML1t1vW7/"
    buffer = BytesIO()
    c = pycurl.Curl()
    try:
        # Verify the server certificate against certifi's CA bundle. The
        # original disabled SSL_VERIFYPEER/SSL_VERIFYHOST, which made this
        # bundle pointless and left the request open to interception.
        c.setopt(pycurl.CAINFO, certifi.where())
        c.setopt(pycurl.SSL_VERIFYPEER, 1)
        c.setopt(pycurl.SSL_VERIFYHOST, 2)
        c.setopt(c.URL, url)
        c.setopt(pycurl.HTTPHEADER, ["User-Agent: Mozilla/5.0 (iPhone; CPU iPhone OS 12_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148"])
        c.setopt(c.WRITEDATA, buffer)
        c.setopt(c.FOLLOWLOCATION, True)
        c.perform()
    finally:
        # Release the curl handle even when the transfer raises pycurl.error.
        c.close()
    response = buffer.getvalue().decode('utf-8')
    soup = BeautifulSoup(response, 'html.parser')
    # Compiled once, outside the loop. The dot is escaped so it only matches a
    # literal ".", and the match ends right where the JSON document starts.
    init_props = re.compile(r'window\.__INIT_PROPS__\s*=\s*')
    decoder = json.JSONDecoder()
    for s in soup.find_all("script"):
        # Search the script's text rather than str(tag), so attributes on the
        # <script> tag or newlines in the payload cannot defeat the match.
        script_text = s.string or ""
        res = init_props.search(script_text)
        if not res:
            continue
        # raw_decode parses exactly one JSON value starting at the given index
        # and ignores whatever follows it (for example a trailing ";").
        json_data, _ = decoder.raw_decode(script_text, res.end())
        print(json_data["/music/*-:id"]["musicData"]["playUrl"]["UrlList"][0])