You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
112 lines
4.0 KiB
112 lines
4.0 KiB
import unittest
|
|
import os
|
|
from unittest.mock import patch, MagicMock
|
|
|
|
import requests
|
|
from langchain_core.documents import Document
|
|
|
|
from deepsearcher.loader.web_crawler import JinaCrawler
|
|
|
|
|
|
class TestJinaCrawler(unittest.TestCase):
|
|
"""Tests for the JinaCrawler class."""
|
|
|
|
@patch.dict(os.environ, {"JINA_API_TOKEN": "fake-token"})
|
|
def test_init_with_token(self):
|
|
"""Test initialization with API token in environment."""
|
|
crawler = JinaCrawler()
|
|
self.assertEqual(crawler.jina_api_token, "fake-token")
|
|
|
|
@patch.dict(os.environ, {"JINAAI_API_KEY": "fake-key"})
|
|
def test_init_with_alternative_key(self):
|
|
"""Test initialization with alternative API key in environment."""
|
|
crawler = JinaCrawler()
|
|
self.assertEqual(crawler.jina_api_token, "fake-key")
|
|
|
|
@patch.dict(os.environ, {}, clear=True)
|
|
def test_init_without_token(self):
|
|
"""Test initialization without API token raises ValueError."""
|
|
with self.assertRaises(ValueError):
|
|
JinaCrawler()
|
|
|
|
@patch.dict(os.environ, {"JINA_API_TOKEN": "fake-token"})
|
|
@patch("requests.get")
|
|
def test_crawl_url(self, mock_get):
|
|
"""Test crawling a URL."""
|
|
# Set up the mock response
|
|
mock_response = MagicMock()
|
|
mock_response.text = "# Markdown Content\nThis is a test."
|
|
mock_response.status_code = 200
|
|
mock_response.headers = {"Content-Type": "text/markdown"}
|
|
mock_get.return_value = mock_response
|
|
|
|
# Create the crawler and crawl a test URL
|
|
crawler = JinaCrawler()
|
|
url = "https://example.com"
|
|
documents = crawler.crawl_url(url)
|
|
|
|
# Check that requests.get was called correctly
|
|
mock_get.assert_called_once_with(
|
|
f"https://r.jina.ai/{url}",
|
|
headers={
|
|
"Authorization": "Bearer fake-token",
|
|
"X-Return-Format": "markdown",
|
|
}
|
|
)
|
|
|
|
# Check the results
|
|
self.assertEqual(len(documents), 1)
|
|
document = documents[0]
|
|
|
|
# Check the content
|
|
self.assertEqual(document.page_content, mock_response.text)
|
|
|
|
# Check the metadata
|
|
self.assertEqual(document.metadata["reference"], url)
|
|
self.assertEqual(document.metadata["status_code"], 200)
|
|
self.assertEqual(document.metadata["headers"], {"Content-Type": "text/markdown"})
|
|
|
|
@patch.dict(os.environ, {"JINA_API_TOKEN": "fake-token"})
|
|
@patch("requests.get")
|
|
def test_crawl_url_http_error(self, mock_get):
|
|
"""Test handling of HTTP errors."""
|
|
# Set up the mock response to raise an HTTPError
|
|
mock_get.side_effect = requests.exceptions.HTTPError("404 Client Error")
|
|
|
|
# Create the crawler
|
|
crawler = JinaCrawler()
|
|
|
|
# Crawl a URL and check that the error is propagated
|
|
with self.assertRaises(requests.exceptions.HTTPError):
|
|
crawler.crawl_url("https://example.com")
|
|
|
|
@patch.dict(os.environ, {"JINA_API_TOKEN": "fake-token"})
|
|
@patch("requests.get")
|
|
def test_crawl_urls(self, mock_get):
|
|
"""Test crawling multiple URLs."""
|
|
# Set up the mock response
|
|
mock_response = MagicMock()
|
|
mock_response.text = "# Markdown Content\nThis is a test."
|
|
mock_response.status_code = 200
|
|
mock_response.headers = {"Content-Type": "text/markdown"}
|
|
mock_get.return_value = mock_response
|
|
|
|
# Create the crawler and crawl multiple URLs
|
|
crawler = JinaCrawler()
|
|
urls = ["https://example.com", "https://example.org"]
|
|
documents = crawler.crawl_urls(urls)
|
|
|
|
# Check that requests.get was called twice
|
|
self.assertEqual(mock_get.call_count, 2)
|
|
|
|
# Check the results
|
|
self.assertEqual(len(documents), 2)
|
|
|
|
# Check that each document has the correct reference
|
|
references = [doc.metadata["reference"] for doc in documents]
|
|
for url in urls:
|
|
self.assertIn(url, references)
|
|
|
|
|
|
if __name__ == "__main__":
|
|
unittest.main()
|