From 2f89c171b3cd3778c90c9ebd26649c850ad7b811 Mon Sep 17 00:00:00 2001 From: Miu Razvan Date: Wed, 30 Jul 2025 16:05:19 +0300 Subject: [PATCH 1/3] Remove unnecessary resource requests --- backend/onyx/connectors/web/connector.py | 23 +++++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) diff --git a/backend/onyx/connectors/web/connector.py b/backend/onyx/connectors/web/connector.py index 0b5cf5dfbe0..2bb979baf71 100644 --- a/backend/onyx/connectors/web/connector.py +++ b/backend/onyx/connectors/web/connector.py @@ -18,6 +18,8 @@ from playwright.sync_api import BrowserContext from playwright.sync_api import Playwright from playwright.sync_api import sync_playwright +from playwright.sync_api import Route +from playwright.sync_api import Request from requests_oauthlib import OAuth2Session # type:ignore from urllib3.exceptions import MaxRetryError @@ -327,6 +329,13 @@ def start_playwright() -> Tuple[Playwright, BrowserContext]: return playwright, context +def abort_unnecessary_resources(route: Route, request: Request) -> None: + if request.resource_type in ["image", "stylesheet", "font", "media", "websocket", "manifest", "other"]: + route.abort() + else: + route.continue_() + + def extract_urls_from_sitemap(sitemap_url: str) -> list[str]: try: response = requests.get( @@ -538,13 +547,23 @@ def _do_scrape( return result page = session_ctx.playwright_context.new_page() + page.route("**/*", abort_unnecessary_resources) try: - # Can't use wait_until="networkidle" because it interferes with the scrolling behavior + from remote_pdb import RemotePdb + RemotePdb('127.0.0.1', 4444).set_trace() page_response = page.goto( initial_url, timeout=30000, # 30 seconds - wait_until="domcontentloaded", # Wait for DOM to be ready + wait_until="commit", ) + page.wait_for_function("document.readyState === 'interactive'") + page.evaluate(""" + () => { + const images = document.querySelectorAll('img'); + images.forEach(img => img.remove()); + } + """) + 
page.wait_for_function("document.readyState === 'complete'") # wait for the document to finish loading (load event; 'interactive' above covers DOMContentLoaded) last_modified = page_response.header_value( "Last-Modified") if page_response else None From 83e7982cad61b0f579f297624a8c6903e832aab4 Mon Sep 17 00:00:00 2001 From: Miu Razvan Date: Wed, 30 Jul 2025 16:05:42 +0300 Subject: [PATCH 2/3] Remove leftover remote_pdb debugging code --- backend/onyx/connectors/web/connector.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/backend/onyx/connectors/web/connector.py b/backend/onyx/connectors/web/connector.py index 2bb979baf71..fcf2f4c9d07 100644 --- a/backend/onyx/connectors/web/connector.py +++ b/backend/onyx/connectors/web/connector.py @@ -549,8 +549,6 @@ def _do_scrape( page = session_ctx.playwright_context.new_page() page.route("**/*", abort_unnecessary_resources) try: - from remote_pdb import RemotePdb - RemotePdb('127.0.0.1', 4444).set_trace() page_response = page.goto( initial_url, timeout=30000, # 30 seconds From 8475eeed8e63dd3eedef8f100540842c708b656e Mon Sep 17 00:00:00 2001 From: Miu Razvan Date: Tue, 19 Aug 2025 14:56:11 +0300 Subject: [PATCH 3/3] Add timeout option --- backend/onyx/connectors/web/connector.py | 6 ++++-- web/src/lib/connectors/connectors.tsx | 9 +++++++++ 2 files changed, 13 insertions(+), 2 deletions(-) diff --git a/backend/onyx/connectors/web/connector.py b/backend/onyx/connectors/web/connector.py index fcf2f4c9d07..2b9e33e34c8 100644 --- a/backend/onyx/connectors/web/connector.py +++ b/backend/onyx/connectors/web/connector.py @@ -465,12 +465,14 @@ def __init__( mintlify_cleanup: bool = True, # Mostly ok to apply to other websites as well batch_size: int = INDEX_BATCH_SIZE, scroll_before_scraping: bool = False, + timeout: int = 30000, **kwargs: Any, ) -> None: self.mintlify_cleanup = mintlify_cleanup self.batch_size = batch_size self.recursive = False self.scroll_before_scraping = scroll_before_scraping + self.timeout = timeout self.web_connector_type = web_connector_type if web_connector_type == 
WEB_CONNECTOR_VALID_SETTINGS.RECURSIVE.value: self.recursive = True @@ -551,7 +553,7 @@ def _do_scrape( try: page_response = page.goto( initial_url, - timeout=30000, # 30 seconds + timeout=self.timeout, # milliseconds (default 30000) wait_until="commit", ) page.wait_for_function("document.readyState === 'interactive'") @@ -587,7 +589,7 @@ def _do_scrape( page.evaluate( "window.scrollTo(0, document.body.scrollHeight)") # wait for the content to load if we scrolled - page.wait_for_load_state("networkidle", timeout=30000) + page.wait_for_load_state("networkidle", timeout=self.timeout) time.sleep(0.5) # let javascript run new_height = page.evaluate("document.body.scrollHeight") diff --git a/web/src/lib/connectors/connectors.tsx b/web/src/lib/connectors/connectors.tsx index 3fc6a3ed7f1..3f80ae642a3 100644 --- a/web/src/lib/connectors/connectors.tsx +++ b/web/src/lib/connectors/connectors.tsx @@ -177,6 +177,15 @@ export const connectorConfigs: Record< name: "scroll_before_scraping", optional: true, }, + { + type: "number", + query: "Timeout (milliseconds):", + label: "Timeout (milliseconds)", + description: + "Maximum time, in milliseconds, to wait for a page to load its content (default: 30000)", + name: "timeout", + optional: true, + }, ], overrideDefaultFreq: 60 * 60 * 24, },