
Commit d7821e9

fix: CrawlResultContainer and more tests
- Fix how CrawlResultContainer is used, ensuring all paths return one, and unwrap nested containers in deep crawls so there is a single container rather than one per result.
- Move CrawlResultContainer into models and extend CrawlResult directly so type hinting works as expected for the single-result case.
- Pass the crawler into crawl_url instead of storing it on a field, simplifying the code and removing the need for None checks.
- Add more examples to AsyncWebCrawler and fix their formatting so they display correctly in VS Code.
- Add data checks to the google_search crawler.
- Correct the abstract method declarations so they match implementations that rely on yield.
- Fix use of infinity in an integer context.
- Fix use of the js parameter where js_code was intended.
- Apply various fixes to tests, ensuring repeatability and validating with assert instead of just prints.
- Allow pytest flags to be specified on the command line when running a test directly with python.
1 parent 5d0df99 commit d7821e9
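
As an illustration of the container pattern described above, here is a minimal sketch (not the actual crawl4ai implementation; the field names and constructor are assumptions): a container that subclasses CrawlResult and mirrors its first result, so single-result type hints and attribute access keep working while deep crawls can still iterate every result.

# Illustrative sketch only -- field names and constructor are assumptions,
# not the actual crawl4ai.models implementation.
from dataclasses import dataclass
from typing import Iterator, List, Optional


@dataclass
class CrawlResult:
    url: str
    html: str = ""
    success: bool = True
    error_message: Optional[str] = None


class CrawlResultContainer(CrawlResult):
    """Behaves like its first CrawlResult while also holding every result."""

    def __init__(self, *results: CrawlResult):
        self._results: List[CrawlResult] = list(results)
        first = self._results[0]
        # Mirror the first result so single-result code and type hints keep working.
        super().__init__(first.url, first.html, first.success, first.error_message)

    def __iter__(self) -> Iterator[CrawlResult]:
        return iter(self._results)

    def __len__(self) -> int:
        return len(self._results)

Because the container is itself a CrawlResult, a crawl call can be annotated with one return type whether it produced one page or many, and a deep crawl can collapse nested containers into a single one, matching the "one, not one per result" behaviour described above.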

70 files changed, +720 −505 lines changed


crawl4ai/async_dispatcher.py

+27 −25
@@ -2,13 +2,16 @@
 from .async_configs import CrawlerRunConfig
 from .models import (
     CrawlResult,
+    CrawlResultContainer,
     CrawlerTaskResult,
     CrawlStatus,
     DisplayMode,
     CrawlStats,
     DomainState,
 )
 
+from .types import AsyncWebCrawler
+
 from rich.live import Live
 from rich.table import Table
 from rich.console import Console
@@ -289,7 +292,6 @@ def __init__(
         rate_limiter: Optional[RateLimiter] = None,
         monitor: Optional[CrawlerMonitor] = None,
     ):
-        self.crawler = None
         self._domain_last_hit: Dict[str, float] = {}
         self.concurrent_sessions = 0
         self.rate_limiter = rate_limiter
@@ -298,6 +300,7 @@
     @abstractmethod
     async def crawl_url(
         self,
+        crawler: AsyncWebCrawler,
         url: str,
         config: CrawlerRunConfig,
         task_id: str,
@@ -308,7 +311,7 @@ async def crawl_url(
     async def run_urls(
         self,
         urls: List[str],
-        crawler: "AsyncWebCrawler",  # noqa: F821 # pyright: ignore[reportUndefinedVariable]
+        crawler: AsyncWebCrawler,
         config: CrawlerRunConfig,
     ) -> List[CrawlerTaskResult]:
         pass
@@ -317,7 +320,7 @@ async def run_urls(
     async def run_urls_stream(
         self,
         urls: List[str],
-        crawler: "AsyncWebCrawler",  # noqa: F821 # pyright: ignore[reportUndefinedVariable]
+        crawler: AsyncWebCrawler,
         config: CrawlerRunConfig,
     ) -> AsyncGenerator[CrawlerTaskResult, None]:
         yield NotImplemented
@@ -342,6 +345,7 @@ def __init__(
 
     async def crawl_url(
         self,
+        crawler: AsyncWebCrawler,
         url: str,
         config: CrawlerRunConfig,
         task_id: str,
@@ -362,7 +366,7 @@ async def crawl_url(
 
             process = psutil.Process()
             start_memory = process.memory_info().rss / (1024 * 1024)
-            result = await self.crawler.arun(url, config=config, session_id=task_id)
+            result: CrawlResultContainer = await crawler.arun(url, config=config, session_id=task_id)
             end_memory = process.memory_info().rss / (1024 * 1024)
 
             memory_usage = peak_memory = end_memory - start_memory
@@ -372,7 +376,7 @@
                     error_message = f"Rate limit retry count exceeded for domain {urlparse(url).netloc}"
                     if self.monitor:
                         self.monitor.update_task(task_id, status=CrawlStatus.FAILED)
-                    result = CrawlerTaskResult(
+                    task_result: CrawlerTaskResult = CrawlerTaskResult(
                         task_id=task_id,
                         url=url,
                         result=result,
@@ -382,8 +386,8 @@
                         end_time=time.time(),
                         error_message=error_message,
                     )
-                    await self.result_queue.put(result)
-                    return result
+                    await self.result_queue.put(task_result)
+                    return task_result
 
             if not result.success:
                 error_message = result.error_message
@@ -396,8 +400,10 @@
             error_message = str(e)
             if self.monitor:
                 self.monitor.update_task(task_id, status=CrawlStatus.FAILED)
-            result = CrawlResult(
-                url=url, html="", metadata={}, success=False, error_message=str(e)
+            result = CrawlResultContainer(
+                CrawlResult(
+                    url=url, html="", metadata={}, success=False, error_message=str(e)
+                )
             )
 
         finally:
@@ -420,17 +426,15 @@ async def crawl_url(
             peak_memory=peak_memory,
             start_time=start_time,
             end_time=end_time,
-            error_message=error_message,
+            error_message=error_message or "",
         )
 
     async def run_urls(
         self,
         urls: List[str],
-        crawler: "AsyncWebCrawler",  # noqa: F821 # pyright: ignore[reportUndefinedVariable]
+        crawler: AsyncWebCrawler,
        config: CrawlerRunConfig,
     ) -> List[CrawlerTaskResult]:
-        self.crawler = crawler
-
         if self.monitor:
             self.monitor.start()
 
@@ -458,7 +462,7 @@ async def run_urls(
                 continue
 
             url, task_id = task_queue.pop(0)
-            task = asyncio.create_task(self.crawl_url(url, config, task_id))
+            task = asyncio.create_task(self.crawl_url(crawler, url, config, task_id))
             active_tasks.append(task)
 
             if not active_tasks:
@@ -480,10 +484,9 @@
     async def run_urls_stream(
         self,
         urls: List[str],
-        crawler: "AsyncWebCrawler",  # noqa: F821 # pyright: ignore[reportUndefinedVariable]
+        crawler: AsyncWebCrawler,
         config: CrawlerRunConfig,
     ) -> AsyncGenerator[CrawlerTaskResult, None]:
-        self.crawler = crawler
         if self.monitor:
             self.monitor.start()
 
@@ -508,7 +511,7 @@ async def run_urls_stream(
                 continue
 
             url, task_id = task_queue.pop(0)
-            task = asyncio.create_task(self.crawl_url(url, config, task_id))
+            task = asyncio.create_task(self.crawl_url(crawler, url, config, task_id))
             active_tasks.append(task)
 
             if not active_tasks and not task_queue:
@@ -546,6 +549,7 @@ def __init__(
 
     async def crawl_url(
         self,
+        crawler: AsyncWebCrawler,
         url: str,
         config: CrawlerRunConfig,
         task_id: str,
@@ -570,7 +574,7 @@ async def crawl_url(
             async with semaphore:
                 process = psutil.Process()
                 start_memory = process.memory_info().rss / (1024 * 1024)
-                result = await self.crawler.arun(url, config=config, session_id=task_id)
+                result: CrawlResultContainer = await crawler.arun(url, config=config, session_id=task_id)
                 end_memory = process.memory_info().rss / (1024 * 1024)
 
                 memory_usage = peak_memory = end_memory - start_memory
@@ -625,16 +629,15 @@ async def crawl_url(
             peak_memory=peak_memory,
             start_time=start_time,
             end_time=end_time,
-            error_message=error_message,
+            error_message=error_message or "",
        )
 
     async def run_urls(
         self,
         urls: List[str],
-        crawler: "AsyncWebCrawler",  # noqa: F821 # pyright: ignore[reportUndefinedVariable]
+        crawler: AsyncWebCrawler,
         config: CrawlerRunConfig,
     ) -> List[CrawlerTaskResult]:
-        self.crawler = crawler
         if self.monitor:
             self.monitor.start()
 
@@ -647,7 +650,7 @@ async def run_urls(
             if self.monitor:
                 self.monitor.add_task(task_id, url)
             task = asyncio.create_task(
-                self.crawl_url(url, config, task_id, semaphore)
+                self.crawl_url(crawler, url, config, task_id, semaphore)
             )
             tasks.append(task)
 
@@ -659,10 +662,9 @@
     async def run_urls_stream(
         self,
         urls: List[str],
-        crawler: "AsyncWebCrawler",  # noqa: F821 # pyright: ignore[reportUndefinedVariable]
+        crawler: AsyncWebCrawler,
         config: CrawlerRunConfig,
     ) -> AsyncGenerator[CrawlerTaskResult, None]:
-        self.crawler = crawler
         if self.monitor:
             self.monitor.start()
 
@@ -675,7 +677,7 @@ async def run_urls_stream(
             if self.monitor:
                 self.monitor.add_task(task_id, url)
             task = asyncio.create_task(
-                self.crawl_url(url, config, task_id, semaphore)
+                self.crawl_url(crawler, url, config, task_id, semaphore)
             )
             tasks.append(task)
 