from abc import ABC, abstractmethod
from typing import List

from langchain_core.documents import Document


class BaseCrawler(ABC):
    """
    Abstract base class for web crawlers.

    This class defines the interface for crawling web pages and converting them
    into Document objects for further processing.
    """

    def __init__(self, **kwargs):
        """
        Initialize the crawler with optional keyword arguments.

        Args:
            **kwargs: Optional keyword arguments for specific crawler implementations.
        """
        pass

    @abstractmethod
    def crawl_url(self, url: str, **crawl_kwargs) -> List[Document]:
        """
        Crawl a single URL and convert it to Document objects.

        Args:
            url: The URL to crawl.
            **crawl_kwargs: Optional keyword arguments for the crawling process.

        Returns:
            A list of Document objects containing the content and metadata from the URL.

        Note:
            Implementations should include the URL reference in the metadata,
            e.g. return [Document(page_content=..., metadata={"reference": "www.abc.com/page1.html"})]
        """
        pass

    def crawl_urls(self, urls: List[str], **crawl_kwargs) -> List[Document]:
        """
        Crawl multiple URLs and return a list of Document objects.

        Args:
            urls: A list of URLs to crawl.
            **crawl_kwargs: Optional keyword arguments for the crawling process.

        Returns:
            A list of Document objects containing the content and metadata from all URLs.
        """
        documents = []
        for url in urls:
            documents.extend(self.crawl_url(url, **crawl_kwargs))
        return documents
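

# --- Illustrative sketch (not part of the original module) ---
# A minimal concrete crawler, assuming the third-party `requests` package is
# installed. The class name `SimpleCrawler` and the raw-HTML handling are
# hypothetical, for illustration only; a real implementation would parse the
# page before wrapping it in a Document.


class SimpleCrawler(BaseCrawler):
    """Hypothetical example: fetches a page and wraps it in one Document."""

    def crawl_url(self, url: str, **crawl_kwargs) -> List[Document]:
        import requests  # assumed dependency, imported locally for the sketch

        # Fetch the page body; crawl_kwargs may carry a per-request timeout.
        response = requests.get(url, timeout=crawl_kwargs.get("timeout", 10))
        response.raise_for_status()
        # Keep the source URL in the metadata, as the base-class Note asks.
        return [Document(page_content=response.text, metadata={"reference": url})]


# Usage: SimpleCrawler().crawl_urls(["https://example.com"]) returns one
# Document per URL via the shared crawl_urls loop defined above.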