from abc import ABC, abstractmethod
from typing import List

from langchain_core.documents import Document


class BaseCrawler(ABC):
    """
    Abstract base class for web crawlers.

    This class defines the interface for crawling web pages and converting them
    into Document objects for further processing.
    """

    def __init__(self, **kwargs):
        """
        Initialize the crawler with optional keyword arguments.

        Args:
            **kwargs: Optional keyword arguments for specific crawler implementations.
        """
        pass

    @abstractmethod
    def crawl_url(self, url: str, **crawl_kwargs) -> List[Document]:
        """
        Crawl a single URL and convert it to Document objects.

        Args:
            url: The URL to crawl.
            **crawl_kwargs: Optional keyword arguments for the crawling process.

        Returns:
            A list of Document objects containing the content and metadata from the URL.

        Note:
            Implementations should include the URL reference in the metadata,
            e.g. return [Document(page_content=..., metadata={"reference": "www.abc.com/page1.html"})]
        """
        pass

    def crawl_urls(self, urls: List[str], **crawl_kwargs) -> List[Document]:
        """
        Crawl multiple URLs and return a list of Document objects.

        Args:
            urls: A list of URLs to crawl.
            **crawl_kwargs: Optional keyword arguments for the crawling process.

        Returns:
            A list of Document objects containing the content and metadata from all URLs.
        """
        documents = []
        for url in urls:
            documents.extend(self.crawl_url(url, **crawl_kwargs))
        return documents
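

# ---------------------------------------------------------------------------
# Illustrative sketch, not part of the original interface: a minimal concrete
# crawler showing how crawl_url might be implemented. SimpleHTTPCrawler and
# the use of the third-party `requests` library are hypothetical choices for
# demonstration only; any HTTP client and HTML-to-text step could substitute.
# ---------------------------------------------------------------------------
class SimpleHTTPCrawler(BaseCrawler):
    """Hypothetical example subclass that fetches raw page text over HTTP."""

    def crawl_url(self, url: str, **crawl_kwargs) -> List[Document]:
        import requests  # assumed dependency, imported lazily for the sketch

        # Fetch the page; a real crawler would also parse the HTML and strip
        # markup before storing page_content.
        response = requests.get(url, timeout=crawl_kwargs.get("timeout", 10))
        response.raise_for_status()
        # Include the URL reference in the metadata, as the base class notes.
        return [Document(page_content=response.text, metadata={"reference": url})]


if __name__ == "__main__":
    # Example usage: crawl_urls delegates to crawl_url for each URL and
    # flattens the results into a single list of Documents.
    crawler = SimpleHTTPCrawler()
    docs = crawler.crawl_urls(["https://example.com"])
    print(len(docs), "document(s) crawled")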