import asyncio
from typing import List

from langchain_core.documents import Document

from deepsearcher.loader.web_crawler.base import BaseCrawler
from deepsearcher.utils import log


class Crawl4AICrawler(BaseCrawler):
    """
    Web crawler using the Crawl4AI library.

    This crawler uses the Crawl4AI library to crawl web pages asynchronously and convert them
    into markdown format for further processing. It supports both single-page crawling
    and batch crawling of multiple pages.
    """

    def __init__(self, **kwargs):
        """
        Initialize the Crawl4AICrawler.

        Args:
            **kwargs: Optional keyword arguments.
                browser_config: Configuration for the browser used by Crawl4AI.
        """
        super().__init__(**kwargs)
        self.crawler = None  # Lazy init
        self.browser_config = kwargs.get("browser_config", None)
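
    # The browser_config value is expected to be a dict of BrowserConfig options
    # (e.g. {"headless": True}); it is passed to BrowserConfig.from_kwargs in
    # _lazy_init the first time a crawl runs. The accepted keys depend on the
    # installed crawl4ai version.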

    def _lazy_init(self):
        """
        Initialize the crawler lazily when needed.

        This method creates the AsyncWebCrawler instance with the provided browser configuration
        only when it's first needed, to avoid unnecessary initialization.
        """
        from crawl4ai import AsyncWebCrawler, BrowserConfig

        if self.crawler is None:
            config = BrowserConfig.from_kwargs(self.browser_config) if self.browser_config else None
            self.crawler = AsyncWebCrawler(config=config)

    async def _async_crawl(self, url: str) -> Document:
        """
        Asynchronously crawl a single URL.

        Args:
            url: The URL to crawl.

        Returns:
            A Document object with the markdown content and metadata from the URL.
        """
        if self.crawler is None:
            self._lazy_init()

        async with self.crawler as crawler:
            result = await crawler.arun(url)

            markdown_content = result.markdown or ""

            metadata = {
                "reference": url,
                "success": result.success,
                "status_code": result.status_code,
                "media": result.media,
                "links": result.links,
            }

            if hasattr(result, "metadata") and result.metadata:
                metadata["title"] = result.metadata.get("title", "")
                metadata["author"] = result.metadata.get("author", "")

            return Document(page_content=markdown_content, metadata=metadata)

    def crawl_url(self, url: str) -> List[Document]:
        """
        Crawl a single URL.

        Args:
            url: The URL to crawl.

        Returns:
            A list containing a single Document object with the markdown content and metadata,
            or an empty list if an error occurs.
        """
        try:
            document = asyncio.run(self._async_crawl(url))
            return [document]
        except Exception as e:
            log.error(f"Error during crawling {url}: {e}")
            return []
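
    # NOTE: crawl_url and crawl_urls rely on asyncio.run(), which starts a new event
    # loop and therefore cannot be called from code that is already running inside an
    # event loop; in that case, await _async_crawl / _async_crawl_many directly.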

    async def _async_crawl_many(self, urls: List[str]) -> List[Document]:
        """
        Asynchronously crawl multiple URLs.

        Args:
            urls: A list of URLs to crawl.

        Returns:
            A list of Document objects with the markdown content and metadata from all URLs.
        """
        if self.crawler is None:
            self._lazy_init()

        async with self.crawler as crawler:
            results = await crawler.arun_many(urls)
            documents = []
            for result in results:
                markdown_content = result.markdown or ""
                metadata = {
                    "reference": result.url,
                    "success": result.success,
                    "status_code": result.status_code,
                    "media": result.media,
                    "links": result.links,
                }
                if hasattr(result, "metadata") and result.metadata:
                    metadata["title"] = result.metadata.get("title", "")
                    metadata["author"] = result.metadata.get("author", "")
                documents.append(Document(page_content=markdown_content, metadata=metadata))
            return documents

    def crawl_urls(self, urls: List[str], **crawl_kwargs) -> List[Document]:
        """
        Crawl multiple URLs.

        Args:
            urls: A list of URLs to crawl.
            **crawl_kwargs: Optional keyword arguments for the crawling process.

        Returns:
            A list of Document objects with the markdown content and metadata from all URLs,
            or an empty list if an error occurs.
        """
        try:
            return asyncio.run(self._async_crawl_many(urls))
        except Exception as e:
            log.error(f"Error during crawling {urls}: {e}")
            return []
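

# Minimal usage sketch, assuming crawl4ai and langchain_core are installed; the URL
# and the browser_config keys below are illustrative placeholders, and the exact
# BrowserConfig options depend on the installed crawl4ai version.
if __name__ == "__main__":
    crawler = Crawl4AICrawler(browser_config={"headless": True})
    docs = crawler.crawl_url("https://example.com")
    for doc in docs:
        print(doc.metadata.get("reference"), doc.metadata.get("status_code"), len(doc.page_content))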