diff --git a/backend/onyx/connectors/web/connector.py b/backend/onyx/connectors/web/connector.py
index cd27fb4096b..d98dedcd22c 100644
--- a/backend/onyx/connectors/web/connector.py
+++ b/backend/onyx/connectors/web/connector.py
@@ -18,6 +18,8 @@
 from playwright.sync_api import BrowserContext
 from playwright.sync_api import Playwright
 from playwright.sync_api import sync_playwright
+from playwright.sync_api import Route
+from playwright.sync_api import Request
 from requests_oauthlib import OAuth2Session  # type:ignore
 from urllib3.exceptions import MaxRetryError
 
@@ -328,6 +330,13 @@ def start_playwright() -> Tuple[Playwright, BrowserContext]:
     return playwright, context
 
 
+def abort_unnecessary_resources(route: Route, request: Request) -> None:
+    if request.resource_type in ["image", "stylesheet", "font", "media", "websocket", "manifest", "other"]:
+        route.abort()
+    else:
+        route.continue_()
+
+
 def extract_urls_from_sitemap(sitemap_url: str) -> list[str]:
     try:
         response = requests.get(
@@ -465,6 +474,7 @@ def __init__(
         batch_size: int = INDEX_BATCH_SIZE,
         scroll_before_scraping: bool = False,
         remove_by_selector: list = [],
+        timeout: int = 30000,
         **kwargs: Any,
     ) -> None:
         self.mintlify_cleanup = mintlify_cleanup
@@ -472,6 +482,7 @@ def __init__(
         self.recursive = False
         self.scroll_before_scraping = scroll_before_scraping
         self.remove_by_selector = remove_by_selector or []
+        self.timeout = timeout
         self.web_connector_type = web_connector_type
 
         if not isinstance(self.remove_by_selector, list):
@@ -556,13 +567,21 @@ def _do_scrape(
             return result
 
         page = session_ctx.playwright_context.new_page()
+        page.route("**/*", abort_unnecessary_resources)
         try:
-            # Can't use wait_until="networkidle" because it interferes with the scrolling behavior
             page_response = page.goto(
                 initial_url,
-                timeout=30000,  # 30 seconds
-                wait_until="domcontentloaded",  # Wait for DOM to be ready
+                timeout=self.timeout,  # milliseconds, defaults to 30000 (30 seconds)
+                wait_until="commit",
             )
+            # DOM parsed (at least "interactive"); avoid === 'interactive', which hangs if the page is already complete
+            page.wait_for_function("document.readyState !== 'loading'", timeout=self.timeout)
+            page.evaluate("""
+                () => {
+                    const images = document.querySelectorAll('img');
+                    images.forEach(img => img.remove());
+                }
+            """)
+            page.wait_for_function("document.readyState === 'complete'", timeout=self.timeout)  # full page load, not just domcontentloaded
 
             last_modified = page_response.header_value(
                 "Last-Modified") if page_response else None
@@ -588,7 +607,7 @@
                 page.evaluate(
                     "window.scrollTo(0, document.body.scrollHeight)")
                 # wait for the content to load if we scrolled
-                page.wait_for_load_state("networkidle", timeout=30000)
+                page.wait_for_load_state("networkidle", timeout=self.timeout)
                 time.sleep(0.5)  # let javascript run
 
                 new_height = page.evaluate("document.body.scrollHeight")
diff --git a/web/src/lib/connectors/connectors.tsx b/web/src/lib/connectors/connectors.tsx
index 19d51848b55..46517258021 100644
--- a/web/src/lib/connectors/connectors.tsx
+++ b/web/src/lib/connectors/connectors.tsx
@@ -186,6 +186,15 @@ export const connectorConfigs: Record<
         name: "remove_by_selector",
         optional: true
       },
+      {
+        type: "number",
+        query: "Timeout (milliseconds):",
+        label: "Timeout (milliseconds)",
+        description:
+          "Maximum time in milliseconds to wait for a page to load its content (defaults to 30000)",
+        name: "timeout",
+        optional: true,
+      },
     ],
     overrideDefaultFreq: 60 * 60 * 24,
   },
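
Reviewer note (illustrative, not part of the patch): the core technique above is Playwright request interception — a `page.route("**/*", ...)` handler that aborts heavy resource types before the code waits on `document.readyState`. Below is a minimal, self-contained sketch of that pattern using Playwright's sync API; the target URL and the `BLOCKED_RESOURCE_TYPES` constant are placeholders, not names taken from the connector.

```python
# Minimal sketch (not the connector itself) of the request-interception pattern
# used in this PR: block heavy resource types, then wait on document.readyState.
# The URL and the BLOCKED_RESOURCE_TYPES name are illustrative placeholders.
from playwright.sync_api import Request, Route, sync_playwright

BLOCKED_RESOURCE_TYPES = {"image", "stylesheet", "font", "media", "websocket", "manifest", "other"}


def abort_unnecessary_resources(route: Route, request: Request) -> None:
    # Abort requests for resources the scraper never reads; let the rest continue.
    if request.resource_type in BLOCKED_RESOURCE_TYPES:
        route.abort()
    else:
        route.continue_()


with sync_playwright() as p:
    browser = p.chromium.launch()
    page = browser.new_page()
    page.route("**/*", abort_unnecessary_resources)  # intercept every request
    page.goto("https://example.com", timeout=30000, wait_until="commit")
    # "commit" returns as soon as the response starts streaming; wait until the DOM is parsed.
    page.wait_for_function("document.readyState !== 'loading'")
    print(page.title())
    browser.close()
```

Blocking images, fonts, media, and stylesheets keeps the load and `networkidle` waits from stalling on assets the scraper never reads, which is why the new configurable `timeout` can stay modest even on asset-heavy pages.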