crawler.py
"""爬虫"""
import asyncio
from crawl4ai import AsyncWebCrawler, CacheMode
from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig


async def crawl(url: str):
    """Crawl a single URL and return its content as markdown."""
    browser_config = BrowserConfig()  # Default browser configuration
    run_config = CrawlerRunConfig()  # Default crawl run configuration
    async with AsyncWebCrawler(config=browser_config) as crawler:
        result = await crawler.arun(
            url=url,
            config=run_config
        )
        return result.markdown


async def crawl_batch(urls: list[str]):
    """Crawl multiple URLs and return the successful results as markdown."""
    browser_config = BrowserConfig(headless=True, verbose=False)
    run_config = CrawlerRunConfig(
        cache_mode=CacheMode.BYPASS,  # Skip the cache and always fetch fresh pages
        stream=False  # Default: collect all results before returning
    )
    async with AsyncWebCrawler(config=browser_config) as crawler:
        # Get all results at once
        results = await crawler.arun_many(
            urls=urls,
            config=run_config,
            # dispatcher=dispatcher  # Optional concurrency control; see the sketch below
        )
        res = []
        for result in results:
            if result.success:
                res.append({
                    "url": result.redirected_url,
                    "markdown": result.markdown,
                })
        return res
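

# A minimal sketch of the optional `dispatcher` argument left commented out in
# crawl_batch above, assuming crawl4ai's MemoryAdaptiveDispatcher is available
# in the installed version; it throttles concurrent crawls based on system
# memory pressure. The threshold and session-limit values are illustrative,
# not from the original code.
#
# from crawl4ai.async_dispatcher import MemoryAdaptiveDispatcher
#
# dispatcher = MemoryAdaptiveDispatcher(
#     memory_threshold_percent=80.0,  # pause new crawls above this memory usage
#     max_session_permit=10,          # cap on concurrent crawler sessions
# )
# results = await crawler.arun_many(urls=urls, config=run_config, dispatcher=dispatcher)
#
# If stream=True were set in CrawlerRunConfig instead of the default above,
# results could be consumed as each crawl completes (assumed API shape in
# recent crawl4ai versions):
#
# async for result in await crawler.arun_many(urls=urls, config=run_config):
#     ...  # handle each result as it arrives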


if __name__ == "__main__":
    # Single-URL example:
    # asyncio.run(crawl('https://baijiahao.baidu.com/s?id=1822380969840180762'))
    asyncio.run(crawl_batch(
        ['https://baijiahao.baidu.com/s?id=1822380969840180762']
    ))