crawler.py
"""爬虫"""
import asyncio
from crawl4ai import AsyncWebCrawler, CacheMode
from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig


async def crawl(url: str):
    """Crawl a single URL and return its content as markdown."""
    browser_config = BrowserConfig()  # Default browser configuration
    run_config = CrawlerRunConfig()  # Default crawl run configuration
    async with AsyncWebCrawler(config=browser_config) as crawler:
        result = await crawler.arun(
            url=url,
            config=run_config
        )
        return result.markdown


async def crawl_batch(urls: list[str]):
    """Crawl multiple URLs and return the successful results as markdown."""
    browser_config = BrowserConfig(headless=True, verbose=False)
    run_config = CrawlerRunConfig(
        cache_mode=CacheMode.BYPASS,  # Skip the cache and always fetch fresh pages
        stream=False  # Default: collect all results before returning
    )
    async with AsyncWebCrawler(config=browser_config) as crawler:
        # Get all results at once
        results = await crawler.arun_many(
            urls=urls,
            config=run_config,
            # dispatcher=dispatcher  # Optional concurrency control; see the sketch below
        )
        res = []
        for result in results:
            if result.success:
                res.append({
                    "url": result.redirected_url,
                    "markdown": result.markdown,
                })
        return res
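

# A minimal sketch of the optional `dispatcher` argument left commented out in
# crawl_batch above, assuming crawl4ai's MemoryAdaptiveDispatcher is available
# in the installed version; it throttles concurrent crawls based on system
# memory pressure. The threshold and session-limit values are illustrative,
# not from the original code.
#
# from crawl4ai.async_dispatcher import MemoryAdaptiveDispatcher
#
# dispatcher = MemoryAdaptiveDispatcher(
#     memory_threshold_percent=80.0,  # pause new crawls above this memory usage
#     max_session_permit=10,          # cap on concurrent crawler sessions
# )
# results = await crawler.arun_many(urls=urls, config=run_config, dispatcher=dispatcher)
#
# If stream=True were set in CrawlerRunConfig instead of the default above,
# results could be consumed as each crawl completes (assumed API shape in
# recent crawl4ai versions):
#
# async for result in await crawler.arun_many(urls=urls, config=run_config):
#     ...  # handle each result as it arrives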


if __name__ == "__main__":
    # Single-URL example:
    # asyncio.run(crawl('https://baijiahao.baidu.com/s?id=1822380969840180762'))
    asyncio.run(crawl_batch(
        ['https://baijiahao.baidu.com/s?id=1822380969840180762']
    ))