I am working on an OSS scraper for various social media platforms. I have a small issue with TikTok. I can successfully scrape the profile and get metadata. However, I also want to pull back metadata about videos associated with the profile. The video information is contained in an XHR call.
However, after loading the page, a login modal appears. I've found that if I click on the "Continue as guest" button, the modal disappears and the XHR request is executed. To make things difficult, TikTok uses a generated CSS class name for the button.
I've gotten this to work: page.click('.css-dcgpa6-DivBoxContainer');
However, the identifier changes every few minutes.
So my question is, is there a way to:
Here is my code:
def collect(self, username: str) -> dict:
    """Scrape the public profile metadata of a TikTok user.

    Loads the profile page in headless Firefox, reads the user and stats
    objects from the ``__UNIVERSAL_DATA_FOR_REHYDRATION__`` script tag,
    dismisses the login modal via its "Continue as guest" button so the
    page issues its video-list XHR requests, and debug-logs the JSON body
    of every captured XHR response whose URL contains ``"list"``.

    Args:
        username: Appended verbatim to ``TIKTOK_BASE_URL`` to build the
            profile URL.

    Returns:
        A dict of profile fields (ids, username, bio, avatar, follower and
        following counts, heart and video counts, region, language,
        verification flags).

    Raises:
        JSONDecodeError: If the rehydration script tag is missing or has
            no string content.
        KeyError: If the embedded JSON lacks the expected user/stats keys.
    """
    # Imported locally so the method stays self-contained; Playwright's
    # TimeoutError is a subclass of this Error.
    from playwright.sync_api import Error as PlaywrightError

    _xhr_calls = []
    final_url = f"{TIKTOK_BASE_URL}{username}"

    def intercept_response(response):
        """Capture all background requests and save them."""
        # We can extract details from background requests
        if response.request.resource_type == "xhr":
            logging.debug(f"Appending {response.request.url}")
            _xhr_calls.append(response)
        return response

    with sync_playwright() as pw_firefox:
        browser = pw_firefox.firefox.launch(headless=True, timeout=self.timeout)
        context = browser.new_context(
            viewport={"width": 1920, "height": 1080},
            strict_selectors=False,
        )
        page = context.new_page()

        # Block cruft (presumably aborts requests the scraper does not
        # need -- see AsyncUtils.intercept_route).
        page.route("**/*", AsyncUtils.intercept_route)

        # Enable background request intercepting:
        page.on("response", intercept_response)

        # Navigate to the profile page
        page.goto(final_url, referer=final_url)
        page.wait_for_timeout(1500)

        # Get the page content
        html = page.content()

        # Parse it.
        soup = BeautifulSoup(html, 'html.parser')

        # The user info is contained in a large JS object called __UNIVERSAL_DATA_FOR_REHYDRATION__.
        tt_script = soup.find('script', attrs={'id': "__UNIVERSAL_DATA_FOR_REHYDRATION__"})

        try:
            raw_json = json.loads(tt_script.string)
        except (AttributeError, TypeError) as exc:
            # AttributeError: the tag was not found (tt_script is None).
            # TypeError: the tag exists but has no string content.
            # JSONDecodeError requires (msg, doc, pos); calling it with the
            # message alone raises TypeError instead of the intended error.
            raise JSONDecodeError(
                f"ScrapeOMatic was unable to parse the data from TikTok user {username}. Please try again.\n {exc}",
                "",
                0,
            ) from exc

        user_data = raw_json['__DEFAULT_SCOPE__']['webapp.user-detail']['userInfo']['user']
        stats_data = raw_json['__DEFAULT_SCOPE__']['webapp.user-detail']['userInfo']['stats']

        # Dismiss the login modal. The button's CSS class is generated and
        # changes frequently, so locate it by its visible text instead.
        guest_button = page.get_by_text("Continue as guest")
        try:
            # count() does not wait, so a page without the modal is not delayed.
            if guest_button.count() > 0:
                logging.debug("Clicking button.")
                guest_button.first.click()
                # Give the XHR requests triggered by the click time to finish.
                page.wait_for_timeout(1500)
        except PlaywrightError as exc:
            # Best effort: the profile data is already parsed, so a failed
            # click must not abort the scrape.
            logging.debug("Unable to dismiss the login modal: %s", exc)

        data_calls = [f for f in _xhr_calls if "list" in f.url]
        for call in data_calls:
            try:
                logging.debug(call.json())
            except (PlaywrightError, ValueError) as exc:
                # Debug-only output: an empty or non-JSON body should not
                # fail the whole collection.
                logging.debug("Could not decode response from %s: %s", call.url, exc)

        profile_data = {
            'sec_id': user_data['secUid'],
            'id': user_data['id'],
            'is_secret': user_data['secret'],
            'username': user_data['uniqueId'],
            'bio': emoji.demojize(user_data['signature'], delimiters=("", "")),
            'avatar_image': user_data['avatarMedium'],
            'following': stats_data['followingCount'],
            'followers': stats_data['followerCount'],
            'language': user_data['language'],
            'nickname': emoji.demojize(user_data['nickname'], delimiters=("", "")),
            'hearts': stats_data['heart'],
            'region': user_data['region'],
            'verified': user_data['verified'],
            'heart_count': stats_data['heartCount'],
            'video_count': stats_data['videoCount'],
            # Duplicate of 'verified'; kept so existing callers keep working.
            'is_verified': user_data['verified'],
            # 'videos': videos,
            # 'hashtags': self.hashtags
        }

        return profile_data
Any help would be greatly appreciated. Also here is a link to the GitHub repo: https://github.com/geniza-ai/scrapeomatic
Thanks!!
While I am no Python developer (and won't embarrass myself by trying to dust off my old Python knowledge), I can provide a pseudocode-style way of achieving what you need.
Your question says:
2.Click on this button using playwright?
If you already know the button's text, you don't need its class. You have the text, so find the button via that identifier:
`await expect(page.get_by_text("continue as guest")).to_be_visible()`
Then just click it.