I'm having trouble scraping a website: my script only extracts up to aria-rowindex 29, but I need it to go up to aria-rowindex 2509

Asked on 2025-04-12 09:59

Here is my code; as you can see, I'm using Playwright and selectolax to scrape data from the website. Whenever I run this script, it extracts data from the site's table up to row 29 and then stops, without raising any error, but I need the script to keep going until row 2509.

from playwright.sync_api import sync_playwright
from selectolax.parser import HTMLParser
import time
import pandas as pd


def extract_full_body_html(url):
    TIMEOUT = 30000  # Reduced timeout to prevent long waits

    with sync_playwright() as p:
        browser = p.chromium.launch(headless=True)
        page = browser.new_page()

        # Use a large (1920x1080) viewport so more of the grid is visible
        page.set_viewport_size({'width': 1920, 'height': 1080})

        page.goto(url, wait_until='networkidle')

        # Wait for the initial dynamic content to load
        page.wait_for_selector('div[role="gridcell"]', timeout=TIMEOUT)  # Adjusted selector

        # Scroll down and periodically check for new content
        def load_more_content():
            last_row_index = 0
            while True:
                page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
                time.sleep(10)  # Wait for the page to load more content

                # Check for new elements based on the aria-rowindex attribute
                new_last_row_index = int(page.evaluate('''() => {
                    const rows = document.querySelectorAll('div[role="gridcell"][aria-rowindex]');
                    return rows[rows.length - 1].getAttribute("aria-rowindex");
                }'''))

                if new_last_row_index <= last_row_index:
                    break  # No new data loaded, stop the process
                last_row_index = new_last_row_index

                # Small delay to ensure all data is loaded for the new rows
                time.sleep(2)

        load_more_content()

        return page.inner_html('body')

def extraction(html):
    tree = HTMLParser(html)
    data = []

    # Adjust the range if you expect more or fewer rows
    for i in range(1, 2510):  # Extract data up to aria row index 2509
        row_selector = f'div[role="gridcell"][aria-rowindex="{i}"]'
        company_div = tree.css_first(f'{row_selector}[aria-colindex="1"]')
        if company_div is None:
            break  # Exit if no more rows are found

        # Extracting data for each column in the row
        row_data = {
            'Company': company_div.text(deep=True, separator=' '),
            'Emails': tree.css_first(f'{row_selector}[aria-colindex="2"]').text(deep=True, separator=' '),
            'Addresses': tree.css_first(f'{row_selector}[aria-colindex="3"]').text(deep=True, separator=' '),
            'Urls': tree.css_first(f'{row_selector}[aria-colindex="4"]').text(deep=True, separator=' '),
            'Description': tree.css_first(f'{row_selector}[aria-colindex="5"]').text(deep=True, separator=' '),
            'Stage': tree.css_first(f'{row_selector}[aria-colindex="6"]').text(deep=True, separator=' '),
            'Number of Portfolio Organizations': tree.css_first(f'{row_selector}[aria-colindex="7"]').text(deep=True, separator=' '),
            'Number of Investments': tree.css_first(f'{row_selector}[aria-colindex="8"]').text(deep=True, separator=' '),
            'Accelerator Duration (in weeks)': tree.css_first(f'{row_selector}[aria-colindex="9"]').text(deep=True, separator=' '),
            'Number of Exits': tree.css_first(f'{row_selector}[aria-colindex="10"]').text(deep=True, separator=' '),
            'Linkedin': tree.css_first(f'{row_selector}[aria-colindex="11"]').text(deep=True, separator=' '),
            'Founders': tree.css_first(f'{row_selector}[aria-colindex="12"]').text(deep=True, separator=' '),
            'Twitter': tree.css_first(f'{row_selector}[aria-colindex="13"]').text(deep=True, separator=' ')

        }
        data.append(row_data)

    return data

if __name__ == '__main__':
    url = 'https://app.folk.app/shared/All-accelerators-rw0kuUNqtzl6j6dDQquoZTYF6MFKIQHo'
    html = extract_full_body_html(url)
    data = extraction(html)
    df = pd.DataFrame(data)
    df.to_excel('output.xlsx', index=False)

In my script, I suspect the page's HTML content never fully loads, or that the HTML for the later rows is never loaded or rendered by the time the script moves on, so those rows can't be scraped.
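A quick way to test that hypothesis is to count how many distinct aria-rowindex values actually make it into the HTML snapshot. Below is a minimal sketch, assuming html is the string returned by extract_full_body_html:

from selectolax.parser import HTMLParser

def count_rendered_rows(html: str) -> int:
    # Count distinct aria-rowindex values present in the snapshot. If this stays
    # around 29 no matter how far the page was scrolled, the grid is virtualized
    # and only the currently visible rows ever exist in the DOM.
    tree = HTMLParser(html)
    indexes = {
        node.attributes.get('aria-rowindex')
        for node in tree.css('div[role="gridcell"][aria-rowindex]')
    }
    return len(indexes)

print(count_rendered_rows(html))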

1 Answer


I think this is roughly what you're looking for:

import time
from playwright.sync_api import sync_playwright


with sync_playwright() as p:
    browser = p.chromium.launch(headless=False)
    context = browser.new_context()
    page = context.new_page()
    page.goto('https://app.folk.app/shared/All-accelerators-rw0kuUNqtzl6j6dDQquoZTYF6MFKIQHo')

    # Click inside the table first (otherwise the mouse-wheel scroll has no effect)
    page.locator("//div[@data-testid='contact-table']").click()

    # Scroll down to the bottom of the table
    for i in range(5):  # make the range as long as needed
        page.mouse.wheel(0, 150000)
        time.sleep(1)

    # Get the aria-rowindex of the last row of the table
    num_rows = page.locator("//div[@role='row'][last()]").get_attribute('aria-rowindex')
    print(num_rows)

    # Scroll back up to the top of the table
    for i in range(5):  # make the range as long as needed
        page.mouse.wheel(0, -150000)
        time.sleep(1)

    # Iterate over all the rows, using the row count obtained above
    for i in range(1, int(num_rows)+1):
        page.locator(f"//div[@class='c-klyBnI c-klyBnI-inIPuL-css']/div[@aria-rowindex='{i}']").scroll_into_view_if_needed()
        company = page.locator(f"//div[@class='c-klyBnI c-klyBnI-inIPuL-css']/div[@aria-rowindex='{i}']//span[2]").inner_text()
        email = page.locator(f"//div[@role='row' and  @aria-rowindex='{i}']//div[@aria-colindex='2']/span").inner_text()
        print(f"{i} - {company} - {email}")

I added some comments in the code to explain what it is doing.

Basically, as you said, the page is loaded via JavaScript, so I think the key is to first find the last row and then scroll row by row until you have captured all the data.

I only extracted a couple of columns, but I think it should be straightforward for you to get the rest; see the sketch below.
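For completeness, here is a rough sketch of how the same loop could be extended to all thirteen columns and written to Excel. It reuses the aria-rowindex/aria-colindex pattern from the selectors above and the column names from the question's script; whether every column really exposes an aria-colindex cell on the live page is an assumption, so the selectors may need adjusting:

import time
import pandas as pd
from playwright.sync_api import sync_playwright

# Column names taken from the question's script, assumed to match
# aria-colindex 1..13 in order.
COLUMNS = [
    'Company', 'Emails', 'Addresses', 'Urls', 'Description', 'Stage',
    'Number of Portfolio Organizations', 'Number of Investments',
    'Accelerator Duration (in weeks)', 'Number of Exits',
    'Linkedin', 'Founders', 'Twitter',
]

with sync_playwright() as p:
    browser = p.chromium.launch(headless=False)
    page = browser.new_context().new_page()
    page.goto('https://app.folk.app/shared/All-accelerators-rw0kuUNqtzl6j6dDQquoZTYF6MFKIQHo')

    # Click inside the table, then scroll to the bottom to learn the total row count.
    page.locator("//div[@data-testid='contact-table']").click()
    for _ in range(5):
        page.mouse.wheel(0, 150000)
        time.sleep(1)
    num_rows = int(page.locator("//div[@role='row'][last()]").get_attribute('aria-rowindex'))

    # Scroll back to the top before walking the rows one by one.
    for _ in range(5):
        page.mouse.wheel(0, -150000)
        time.sleep(1)

    data = []
    for i in range(1, num_rows + 1):
        # Bring the row into view so the virtualized grid renders its cells.
        row = page.locator(f"//div[@role='row' and @aria-rowindex='{i}']")
        row.scroll_into_view_if_needed()
        record = {}
        for col, name in enumerate(COLUMNS, start=1):
            cell = row.locator(f"xpath=.//div[@aria-colindex='{col}']")
            record[name] = cell.inner_text() if cell.count() else ''
        data.append(record)

    pd.DataFrame(data).to_excel('output.xlsx', index=False)
    browser.close()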

Good luck!
