from abc import ABC, abstractmethod
from typing import List

from langchain_core.documents import Document


class BaseCrawler(ABC):
    """
    Abstract base class for web crawlers.

    This class defines the interface for crawling web pages and converting
    them into Document objects for further processing.
    """

    def __init__(self, **kwargs):
        """
        Initialize the crawler with optional keyword arguments.

        Args:
            **kwargs: Optional keyword arguments for specific crawler
                implementations.
        """

    @abstractmethod
    def crawl_url(self, url: str, **crawl_kwargs) -> List[Document]:
        """
        Crawl a single URL and convert it to Document objects.

        Args:
            url: The URL to crawl.
            **crawl_kwargs: Optional keyword arguments for the crawling
                process.

        Returns:
            A list of Document objects containing the content and metadata
            from the URL.

        Note:
            Implementations should include the URL reference in the metadata,
            e.g.:
            return [Document(page_content=...,
                             metadata={"reference": "www.abc.com/page1.html"})]
        """

    def crawl_urls(self, urls: List[str], **crawl_kwargs) -> List[Document]:
        """
        Crawl multiple URLs and return a list of Document objects.

        Args:
            urls: A list of URLs to crawl.
            **crawl_kwargs: Optional keyword arguments for the crawling
                process.

        Returns:
            A list of Document objects containing the content and metadata
            from all URLs.
        """
        documents = []
        for url in urls:
            # Delegate to the subclass's crawl_url for each URL and collect
            # the results into a single flat list of documents.
            documents.extend(self.crawl_url(url, **crawl_kwargs))
        return documents
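

# --- Example ---------------------------------------------------------------
# A minimal sketch of a concrete crawler, for illustration only. It assumes
# the third-party `requests` library is installed; `RequestsCrawler` is a
# hypothetical name, not part of this module's public API.


class RequestsCrawler(BaseCrawler):
    """Fetch raw page bodies over HTTP using `requests` (illustrative)."""

    def crawl_url(self, url: str, **crawl_kwargs) -> List[Document]:
        # Imported locally so the abstract module itself does not require
        # `requests` to be installed.
        import requests

        # Forward crawl_kwargs (e.g. timeout=10) straight to requests.get.
        response = requests.get(url, **crawl_kwargs)
        response.raise_for_status()
        # Include the URL reference in the metadata, as the BaseCrawler
        # contract requires.
        return [Document(page_content=response.text,
                         metadata={"reference": url})]


# Usage (hypothetical):
#     crawler = RequestsCrawler()
#     docs = crawler.crawl_urls(["https://example.com"], timeout=10)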