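"""
Example script: load local files with DoclingLoader and crawl web pages with
DoclingCrawler, store the results in Milvus, then query the loaded data with
DeepSearcher.
"""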
import logging
import os

from deepsearcher.configuration import Configuration, init_config
from deepsearcher.offline_loading import load_from_local_files, load_from_website
from deepsearcher.online_query import query

# Suppress unnecessary logging from third-party libraries
logging.getLogger("httpx").setLevel(logging.WARNING)

def main():
    # Step 1: Initialize configuration
    config = Configuration()

    # Configure Vector Database and Docling providers
    config.set_provider_config("vector_db", "Milvus", {})
    config.set_provider_config("file_loader", "DoclingLoader", {})
    config.set_provider_config("web_crawler", "DoclingCrawler", {})

    # Apply the configuration
    init_config(config)

    # Step 2a: Load data from a local file using DoclingLoader
    local_file = "your_local_file_or_directory"
    local_collection_name = "DoclingLocalFiles"
    local_collection_description = "Milvus Documents loaded using DoclingLoader"

    print("\n=== Loading local files using DoclingLoader ===")
    try:
        load_from_local_files(
            paths_or_directory=local_file,
            collection_name=local_collection_name,
            collection_description=local_collection_description,
            force_new_collection=True,
        )
        print(f"Successfully loaded: {local_file}")
    except ValueError as e:
        print(f"Validation error: {str(e)}")
    except Exception as e:
        print(f"Error: {str(e)}")

    print("Successfully loaded all local files")

    # Step 2b: Crawl URLs using DoclingCrawler
    urls = [
        # Markdown documentation files
        "https://milvus.io/docs/quickstart.md",
        "https://milvus.io/docs/overview.md",
        # PDF example - can handle various URL formats
        "https://arxiv.org/pdf/2408.09869",
    ]
    web_collection_name = "DoclingWebCrawl"
    web_collection_description = "Milvus Documentation crawled using DoclingCrawler"

    print("\n=== Crawling web pages using DoclingCrawler ===")
    load_from_website(
        urls=urls,
        collection_name=web_collection_name,
        collection_description=web_collection_description,
        force_new_collection=True,
    )
    print("Successfully crawled all URLs")

    # Step 3: Query the loaded data
    question = "What is Milvus?"
    result = query(question)
    print(result)


if __name__ == "__main__":
    main()