@@ -2,13 +2,16 @@
 from .async_configs import CrawlerRunConfig
 from .models import (
     CrawlResult,
+    CrawlResultContainer,
     CrawlerTaskResult,
     CrawlStatus,
     DisplayMode,
     CrawlStats,
     DomainState,
 )

+from .types import AsyncWebCrawler
+
 from rich.live import Live
 from rich.table import Table
 from rich.console import Console
@@ -289,7 +292,6 @@ def __init__(
         rate_limiter: Optional[RateLimiter] = None,
         monitor: Optional[CrawlerMonitor] = None,
     ):
-        self.crawler = None
         self._domain_last_hit: Dict[str, float] = {}
         self.concurrent_sessions = 0
         self.rate_limiter = rate_limiter
@@ -298,6 +300,7 @@ def __init__(
     @abstractmethod
     async def crawl_url(
         self,
+        crawler: AsyncWebCrawler,
         url: str,
         config: CrawlerRunConfig,
         task_id: str,
@@ -308,7 +311,7 @@ async def crawl_url(
     async def run_urls(
         self,
         urls: List[str],
-        crawler: "AsyncWebCrawler",  # noqa: F821 # pyright: ignore[reportUndefinedVariable]
+        crawler: AsyncWebCrawler,
         config: CrawlerRunConfig,
     ) -> List[CrawlerTaskResult]:
         pass
@@ -317,7 +320,7 @@ async def run_urls(
     async def run_urls_stream(
         self,
         urls: List[str],
-        crawler: "AsyncWebCrawler",  # noqa: F821 # pyright: ignore[reportUndefinedVariable]
+        crawler: AsyncWebCrawler,
         config: CrawlerRunConfig,
     ) -> AsyncGenerator[CrawlerTaskResult, None]:
         yield NotImplemented
@@ -342,6 +345,7 @@ def __init__(

     async def crawl_url(
         self,
+        crawler: AsyncWebCrawler,
         url: str,
         config: CrawlerRunConfig,
         task_id: str,
@@ -362,7 +366,7 @@ async def crawl_url(

             process = psutil.Process()
             start_memory = process.memory_info().rss / (1024 * 1024)
-            result = await self.crawler.arun(url, config=config, session_id=task_id)
+            result: CrawlResultContainer = await crawler.arun(url, config=config, session_id=task_id)
             end_memory = process.memory_info().rss / (1024 * 1024)

             memory_usage = peak_memory = end_memory - start_memory
@@ -372,7 +376,7 @@ async def crawl_url(
                     error_message = f"Rate limit retry count exceeded for domain {urlparse(url).netloc}"
                     if self.monitor:
                         self.monitor.update_task(task_id, status=CrawlStatus.FAILED)
-                    result = CrawlerTaskResult(
+                    task_result: CrawlerTaskResult = CrawlerTaskResult(
                         task_id=task_id,
                         url=url,
                         result=result,
@@ -382,8 +386,8 @@ async def crawl_url(
                         end_time=time.time(),
                         error_message=error_message,
                     )
-                    await self.result_queue.put(result)
-                    return result
+                    await self.result_queue.put(task_result)
+                    return task_result

             if not result.success:
                 error_message = result.error_message
@@ -396,8 +400,10 @@ async def crawl_url(
             error_message = str(e)
             if self.monitor:
                 self.monitor.update_task(task_id, status=CrawlStatus.FAILED)
-            result = CrawlResult(
-                url=url, html="", metadata={}, success=False, error_message=str(e)
+            result = CrawlResultContainer(
+                CrawlResult(
+                    url=url, html="", metadata={}, success=False, error_message=str(e)
+                )
             )

         finally:
@@ -420,17 +426,15 @@ async def crawl_url(
                 peak_memory=peak_memory,
                 start_time=start_time,
                 end_time=end_time,
-                error_message=error_message,
+                error_message=error_message or "",
             )

     async def run_urls(
         self,
         urls: List[str],
-        crawler: "AsyncWebCrawler",  # noqa: F821 # pyright: ignore[reportUndefinedVariable]
+        crawler: AsyncWebCrawler,
         config: CrawlerRunConfig,
     ) -> List[CrawlerTaskResult]:
-        self.crawler = crawler
-
         if self.monitor:
             self.monitor.start()

@@ -458,7 +462,7 @@ async def run_urls(
                     continue

                 url, task_id = task_queue.pop(0)
-                task = asyncio.create_task(self.crawl_url(url, config, task_id))
+                task = asyncio.create_task(self.crawl_url(crawler, url, config, task_id))
                 active_tasks.append(task)

             if not active_tasks:
@@ -480,10 +484,9 @@ async def run_urls(
     async def run_urls_stream(
         self,
         urls: List[str],
-        crawler: "AsyncWebCrawler",  # noqa: F821 # pyright: ignore[reportUndefinedVariable]
+        crawler: AsyncWebCrawler,
         config: CrawlerRunConfig,
     ) -> AsyncGenerator[CrawlerTaskResult, None]:
-        self.crawler = crawler
         if self.monitor:
             self.monitor.start()

@@ -508,7 +511,7 @@ async def run_urls_stream(
                     continue

                 url, task_id = task_queue.pop(0)
-                task = asyncio.create_task(self.crawl_url(url, config, task_id))
+                task = asyncio.create_task(self.crawl_url(crawler, url, config, task_id))
                 active_tasks.append(task)

             if not active_tasks and not task_queue:
@@ -546,6 +549,7 @@ def __init__(

     async def crawl_url(
         self,
+        crawler: AsyncWebCrawler,
         url: str,
         config: CrawlerRunConfig,
         task_id: str,
@@ -570,7 +574,7 @@ async def crawl_url(
             async with semaphore:
                 process = psutil.Process()
                 start_memory = process.memory_info().rss / (1024 * 1024)
-                result = await self.crawler.arun(url, config=config, session_id=task_id)
+                result: CrawlResultContainer = await crawler.arun(url, config=config, session_id=task_id)
                 end_memory = process.memory_info().rss / (1024 * 1024)

                 memory_usage = peak_memory = end_memory - start_memory
@@ -625,16 +629,15 @@ async def crawl_url(
                 peak_memory=peak_memory,
                 start_time=start_time,
                 end_time=end_time,
-                error_message=error_message,
+                error_message=error_message or "",
             )

     async def run_urls(
         self,
         urls: List[str],
-        crawler: "AsyncWebCrawler",  # noqa: F821 # pyright: ignore[reportUndefinedVariable]
+        crawler: AsyncWebCrawler,
         config: CrawlerRunConfig,
     ) -> List[CrawlerTaskResult]:
-        self.crawler = crawler
         if self.monitor:
             self.monitor.start()

@@ -647,7 +650,7 @@ async def run_urls(
                 if self.monitor:
                     self.monitor.add_task(task_id, url)
                 task = asyncio.create_task(
-                    self.crawl_url(url, config, task_id, semaphore)
+                    self.crawl_url(crawler, url, config, task_id, semaphore)
                 )
                 tasks.append(task)

@@ -659,10 +662,9 @@ async def run_urls(
     async def run_urls_stream(
         self,
         urls: List[str],
-        crawler: "AsyncWebCrawler",  # noqa: F821 # pyright: ignore[reportUndefinedVariable]
+        crawler: AsyncWebCrawler,
         config: CrawlerRunConfig,
     ) -> AsyncGenerator[CrawlerTaskResult, None]:
-        self.crawler = crawler
         if self.monitor:
             self.monitor.start()

@@ -675,7 +677,7 @@ async def run_urls_stream(
                 if self.monitor:
                     self.monitor.add_task(task_id, url)
                 task = asyncio.create_task(
-                    self.crawl_url(url, config, task_id, semaphore)
+                    self.crawl_url(crawler, url, config, task_id, semaphore)
                 )
                 tasks.append(task)

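For context, a minimal usage sketch of how a dispatcher is driven after this change: the AsyncWebCrawler is passed explicitly into run_urls / crawl_url instead of being stashed on self.crawler, so one dispatcher can serve any crawler instance. The top-level import paths, the MemoryAdaptiveDispatcher class name, and the attribute access on the returned results are assumptions about the crawl4ai package layout, not taken from this diff.

# Hypothetical usage sketch (assumed import paths and class names).
import asyncio

from crawl4ai import AsyncWebCrawler, CrawlerRunConfig          # assumed public exports
from crawl4ai.async_dispatcher import MemoryAdaptiveDispatcher  # assumed module/class name


async def main() -> None:
    config = CrawlerRunConfig()
    dispatcher = MemoryAdaptiveDispatcher()  # rate_limiter / monitor omitted for brevity

    async with AsyncWebCrawler() as crawler:
        # The crawler is handed in per call; the dispatcher keeps no reference to it.
        results = await dispatcher.run_urls(
            urls=["https://example.com", "https://example.org"],
            crawler=crawler,
            config=config,
        )
        for task_result in results:
            # task_result.result is the CrawlResultContainer produced by crawler.arun().
            print(task_result.url, task_result.error_message or "ok")


if __name__ == "__main__":
    asyncio.run(main())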