This is my code; as you can see, I am using Playwright and selectolax to scrape the website. Whenever I execute the script, it extracts data from the table on the website up to aria-rowindex 29 and then execution stops cleanly with no error, but I want the script to extract up to aria-rowindex 2509.
from playwright.sync_api import sync_playwright
from selectolax.parser import HTMLParser
import time
import pandas as pd

def extract_full_body_html(url):
    TIMEOUT = 30000  # Reduced timeout to prevent long waits
    with sync_playwright() as p:
        browser = p.chromium.launch(headless=True)
        page = browser.new_page()
        # Maximize the window
        page.set_viewport_size({'width': 1920, 'height': 1080})
        page.goto(url, wait_until='networkidle')
        # Wait for the initial dynamic content to load
        page.wait_for_selector('div[role="gridcell"]', timeout=TIMEOUT)  # Adjusted selector

        # Scroll down and periodically check for new content
        def load_more_content():
            last_row_index = 0
            while True:
                page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
                time.sleep(10)  # Wait for the page to load more content
                # Check for new elements based on the aria-rowindex attribute
                new_last_row_index = int(page.evaluate('''() => {
                    const rows = document.querySelectorAll('div[role="gridcell"][aria-rowindex]');
                    return rows[rows.length - 1].getAttribute("aria-rowindex");
                }'''))
                if new_last_row_index <= last_row_index:
                    break  # No new data loaded, stop the process
                last_row_index = new_last_row_index
                # Small delay to ensure all data is loaded for the new rows
                time.sleep(2)

        load_more_content()
        return page.inner_html('body')

def extraction(html):
    tree = HTMLParser(html)
    data = []
    # Adjust the range if you expect more or fewer rows
    for i in range(1, 2510):  # Extract data up to aria row index 2509
        row_selector = f'div[role="gridcell"][aria-rowindex="{i}"]'
        company_div = tree.css_first(f'{row_selector}[aria-colindex="1"]')
        if company_div is None:
            break  # Exit if no more rows are found
        # Extracting data for each column in the row
        row_data = {
            'Company': company_div.text(deep=True, separator=' '),
            'Emails': tree.css_first(f'{row_selector}[aria-colindex="2"]').text(deep=True, separator=' '),
            'Addresses': tree.css_first(f'{row_selector}[aria-colindex="3"]').text(deep=True, separator=' '),
            'Urls': tree.css_first(f'{row_selector}[aria-colindex="4"]').text(deep=True, separator=' '),
            'Description': tree.css_first(f'{row_selector}[aria-colindex="5"]').text(deep=True, separator=' '),
            'Stage': tree.css_first(f'{row_selector}[aria-colindex="6"]').text(deep=True, separator=' '),
            'Number of Portfolio Organizations': tree.css_first(f'{row_selector}[aria-colindex="7"]').text(deep=True, separator=' '),
            'Number of Investments': tree.css_first(f'{row_selector}[aria-colindex="8"]').text(deep=True, separator=' '),
            'Accelerator Duration (in weeks)': tree.css_first(f'{row_selector}[aria-colindex="9"]').text(deep=True, separator=' '),
            'Number of Exits': tree.css_first(f'{row_selector}[aria-colindex="10"]').text(deep=True, separator=' '),
            'Linkedin': tree.css_first(f'{row_selector}[aria-colindex="11"]').text(deep=True, separator=' '),
            'Founders': tree.css_first(f'{row_selector}[aria-colindex="12"]').text(deep=True, separator=' '),
            'Twitter': tree.css_first(f'{row_selector}[aria-colindex="13"]').text(deep=True, separator=' ')
        }
        data.append(row_data)
    return data

if __name__ == '__main__':
    url = 'https://app.folk.app/shared/All-accelerators-rw0kuUNqtzl6j6dDQquoZTYF6MFKIQHo'
    html = extract_full_body_html(url)
    data = extraction(html)
    df = pd.DataFrame(data)
    df.to_excel('output.xlsx', index=False)
In my script, I think the HTML content of the page is not fully available to be scraped, or as the script executes further, the HTML of the page is no longer loaded or visible to be scraped.
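One way to check this is to compare how many of these elements actually exist in the DOM with the highest aria-rowindex rendered. A minimal diagnostic sketch, reusing the same URL and gridcell selector from my script above (if the grid is virtualized, both numbers should stay near the ~29 rows I get, even though the table has 2509 rows):

from playwright.sync_api import sync_playwright

# Diagnostic sketch: how many gridcells are really in the DOM, and what is
# the highest aria-rowindex among them? For a virtualized grid both numbers
# stay small no matter how many logical rows the table has.
with sync_playwright() as p:
    browser = p.chromium.launch(headless=True)
    page = browser.new_page()
    page.goto('https://app.folk.app/shared/All-accelerators-rw0kuUNqtzl6j6dDQquoZTYF6MFKIQHo')
    page.wait_for_selector('div[role="gridcell"]')
    cells = page.locator('div[role="gridcell"][aria-rowindex]')
    print(cells.count())                              # cells actually rendered
    print(cells.last.get_attribute('aria-rowindex'))  # highest row index in the DOM
    browser.close()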
I think this is more or less what you want:
import time
from playwright.sync_api import sync_playwright

with sync_playwright() as p:
    browser = p.chromium.launch(headless=False)
    context = browser.new_context()
    page = context.new_page()
    page.goto('https://app.folk.app/shared/All-accelerators-rw0kuUNqtzl6j6dDQquoZTYF6MFKIQHo')

    # We click on the table (otherwise we cannot scroll it)
    page.locator("//div[@data-testid='contact-table']").click()

    # We scroll to the end of the page
    for i in range(5):  # make the range as long as needed
        page.mouse.wheel(0, 150000)
        time.sleep(1)

    # We get the aria-rowindex of the last row of the table
    print(page.locator("//div[@role='row'][last()]").get_attribute('aria-rowindex'))
    num_rows = page.locator("//div[@role='row'][last()]").get_attribute('aria-rowindex')

    # We scroll back to the top of the page
    for i in range(5):  # make the range as long as needed
        page.mouse.wheel(0, -150000)
        time.sleep(1)

    # We iterate over all the data using the number of rows we took above
    for i in range(1, int(num_rows) + 1):
        page.locator(f"//div[@class='c-klyBnI c-klyBnI-inIPuL-css']/div[@aria-rowindex='{i}']").scroll_into_view_if_needed()
        company = page.locator(f"//div[@class='c-klyBnI c-klyBnI-inIPuL-css']/div[@aria-rowindex='{i}']//span[2]").inner_text()
        email = page.locator(f"//div[@role='row' and @aria-rowindex='{i}']//div[@aria-colindex='2']/span").inner_text()
        print(f"{i} - {company} - {email}")
I left some comments in the code to explain what it is doing.
Basically, as you said, the page is rendered by JavaScript, so I think the key is to read the last row's aria-rowindex first and then scroll row by row until we get all the data.
I just extracted a couple of columns, but it should be easy for you to take the rest of them; a sketch of one way to do it follows.
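For example, something along these lines could collect every column into a DataFrame and write it to Excel, as your original script does. This is only a sketch that continues from the loop above (it reuses page and num_rows): the column names and aria-colindex values 1-13 come from your extraction() function, and I am assuming each cell's text can be read directly from the aria-colindex div.

import pandas as pd

# Sketch: same row-by-row scroll, but collecting all 13 columns.
# Column names / indexes are taken from the question's extraction() function.
COLUMNS = {
    1: 'Company', 2: 'Emails', 3: 'Addresses', 4: 'Urls', 5: 'Description',
    6: 'Stage', 7: 'Number of Portfolio Organizations',
    8: 'Number of Investments', 9: 'Accelerator Duration (in weeks)',
    10: 'Number of Exits', 11: 'Linkedin', 12: 'Founders', 13: 'Twitter',
}

data = []
for i in range(1, int(num_rows) + 1):
    row = f"//div[@role='row' and @aria-rowindex='{i}']"
    page.locator(row).scroll_into_view_if_needed()
    record = {}
    for col, name in COLUMNS.items():
        cell = page.locator(f"{row}//div[@aria-colindex='{col}']")
        # inner_text() waits and eventually raises on a missing cell,
        # so guard with count() to keep empty cells from stalling the loop
        record[name] = cell.inner_text() if cell.count() else ''
    data.append(record)

pd.DataFrame(data).to_excel('output.xlsx', index=False)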
Good luck!