@ -0,0 +1,247 @@
import hashlib
from typing import List

import numpy as np

from deepsearcher.loader.splitter import Chunk
from deepsearcher.vector_db.base import BaseVectorDB, RetrievalResult


def calculate_text_hash(text: str) -> str:
    """
    Compute the hash of a text, used as its primary key.

    Args:
        text (str): Input text.

    Returns:
        str: MD5 hash of the text.
    """
    return hashlib.md5(text.encode('utf-8')).hexdigest()


class DeduplicatedVectorDB:
    """
    Vector database wrapper with deduplication support.

    This class adds the following capabilities on top of an existing vector database:
    1. Use the text hash as the primary key.
    2. Avoid inserting duplicate data.
    3. Provide a method for cleaning up duplicate records.
    """

    def __init__(self, db: BaseVectorDB):
        """
        Initialize the deduplicated vector database.

        Args:
            db (BaseVectorDB): The underlying vector database instance.
        """
        self.db = db

    def init_collection(self, dim: int, collection: str, description: str,
                        force_new_collection=False, *args, **kwargs):
        """
        Initialize a collection.

        Args:
            dim (int): Vector dimension.
            collection (str): Collection name.
            description (str): Collection description.
            force_new_collection (bool): Whether to force the creation of a new collection.
            *args: Additional positional arguments.
            **kwargs: Additional keyword arguments.
        """
        return self.db.init_collection(dim, collection, description,
                                       force_new_collection, *args, **kwargs)

    def insert_data(self, collection: str, chunks: List[Chunk],
                    batch_size: int = 256, *args, **kwargs):
        """
        Insert data while avoiding duplicates.

        This method first checks whether the database already contains entries
        with the same text hash, and only inserts the chunks that are not present.

        Args:
            collection (str): Collection name.
            chunks (List[Chunk]): Chunks to insert.
            batch_size (int): Batch size.
            *args: Additional positional arguments.
            **kwargs: Additional keyword arguments.
        """
        # Compute the hash of each chunk and store it in the metadata.
        for chunk in chunks:
            if 'hash' not in chunk.metadata:
                chunk.metadata['hash'] = calculate_text_hash(chunk.text)

        # Filter out chunks that already exist in the database.
        filtered_chunks = self._filter_duplicate_chunks(collection, chunks)

        # Insert the remaining chunks.
        if filtered_chunks:
            return self.db.insert_data(collection, filtered_chunks, batch_size, *args, **kwargs)

    def _filter_duplicate_chunks(self, collection: str, chunks: List[Chunk]) -> List[Chunk]:
        """
        Filter out chunks that already exist in the database.

        Args:
            collection (str): Collection name.
            chunks (List[Chunk]): Chunks to filter.

        Returns:
            List[Chunk]: The filtered chunks.
        """
        # Note: this depends on the concrete database implementation.
        # In production, a database-specific query should be used to detect duplicates.
        # This generic (but inefficient) default performs no filtering.
        return chunks

    def search_data(self, collection: str, vector: np.ndarray, *args, **kwargs) -> List[RetrievalResult]:
        """
        Search for data.

        Args:
            collection (str): Collection name.
            vector (np.ndarray): Query vector.
            *args: Additional positional arguments.
            **kwargs: Additional keyword arguments.

        Returns:
            List[RetrievalResult]: Search results.
        """
        return self.db.search_data(collection, vector, *args, **kwargs)

    def list_collections(self, *args, **kwargs):
        """
        List all collections.

        Args:
            *args: Additional positional arguments.
            **kwargs: Additional keyword arguments.

        Returns:
            The collection info list.
        """
        return self.db.list_collections(*args, **kwargs)

    def clear_db(self, collection: str = None, *args, **kwargs):
        """
        Clear a collection in the database.

        Args:
            collection (str): Collection name.
            *args: Additional positional arguments.
            **kwargs: Additional keyword arguments.
        """
        return self.db.clear_db(collection, *args, **kwargs)

    def remove_duplicate_data(self, collection: str = None) -> int:
        """
        Remove duplicate records from the database.

        Note: this is a generic interface; concrete databases may offer more
        efficient implementations.

        Args:
            collection (str): Collection name.

        Returns:
            int: Number of duplicate records removed.
        """
        # The concrete implementation depends on the database type;
        # subclasses must provide the deduplication logic.
        raise NotImplementedError("Deduplication must be implemented for the specific database backend.")


# Deduplication helper specialized for Qdrant
class QdrantDeduplicatedVectorDB(DeduplicatedVectorDB):
    """
    Qdrant-specific deduplicated vector database implementation.
    """

    def _filter_duplicate_chunks(self, collection: str, chunks: List[Chunk]) -> List[Chunk]:
        """
        Filter out chunks that already exist in the Qdrant database.

        Args:
            collection (str): Collection name.
            chunks (List[Chunk]): Chunks to filter.

        Returns:
            List[Chunk]: The filtered chunks.
        """
        try:
            # Get the Qdrant client.
            qdrant_client = self.db.client

            # Collect the hashes to check.
            hashes = [calculate_text_hash(chunk.text) for chunk in chunks]

            # Query for existing records.
            # Note: this requires Qdrant payload filtering; the exact implementation may need adjustment.
            filtered_chunks = []
            for chunk, hash_value in zip(chunks, hashes):
                # The Qdrant query API should be used here to check whether this hash already exists,
                # e.g. via the scroll or search API with a payload filter.
                filtered_chunks.append(chunk)

            return filtered_chunks
        except Exception:
            # On error, return all chunks (no filtering).
            return chunks

    def remove_duplicate_data(self, collection: str = None) -> int:
        """
        Remove duplicate records from the Qdrant database.

        Args:
            collection (str): Collection name.

        Returns:
            int: Number of duplicate records removed.
        """
        # Qdrant deduplication would require retrieving all points, identifying
        # duplicates by hash, and deleting them; not implemented here.
        return 0


# Deduplication helper specialized for Milvus
class MilvusDeduplicatedVectorDB(DeduplicatedVectorDB):
    """
    Milvus-specific deduplicated vector database implementation.
    """

    def insert_data(self, collection: str, chunks: List[Chunk],
                    batch_size: int = 256, *args, **kwargs):
        """
        Insert data into Milvus, using the text hash as the primary key.

        Args:
            collection (str): Collection name.
            chunks (List[Chunk]): Chunks to insert.
            batch_size (int): Batch size.
            *args: Additional positional arguments.
            **kwargs: Additional keyword arguments.
        """
        # Use the text hash as the ID of each chunk.
        for chunk in chunks:
            chunk.metadata['id'] = calculate_text_hash(chunk.text)

        # Delegate the insertion to the parent class.
        return super().insert_data(collection, chunks, batch_size, *args, **kwargs)

    def remove_duplicate_data(self, collection: str = None) -> int:
        """
        Remove duplicate records from the Milvus database.

        Args:
            collection (str): Collection name.

        Returns:
            int: Number of duplicate records removed.
        """
        # Milvus deduplication could query all records, find entries with
        # duplicate IDs, and delete them; not implemented here.
        return 0
@ -0,0 +1,7 @@ |
|||
reviewers: |
|||
- czs007 |
|||
- xiaofan-luan |
|||
- scsven |
|||
|
|||
approvers: |
|||
- maintainers |
@ -0,0 +1,44 @@ |
|||
# MEP(Milvus Enhancement Proposals) Templates |
|||
|
|||
Current state: [One of "Under Discussion", "Accepted", "Rejected"] |
|||
|
|||
ISSUE: link to the GitHub issue |
|||
|
|||
Keywords: list keywords about this MEP |
|||
|
|||
Released: <Milvus Release Version> |
|||
|
|||
## Summary(required) |
|||
|
|||
What are we going to do? |
|||
|
|||
## Motivation(required) |
|||
|
|||
Why are we doing this? |
|||
|
|||
## Public Interfaces(optional) |
|||
|
|||
Briefly list any new interfaces that will be introduced as part of this proposal or any existing interfaces that will be removed or changed. |
|||
|
|||
## Design Details(required) |
|||
|
|||
Describe the new thing you want to do in appropriate detail. This may be fairly extensive and have large subsections of its own. Or it may be a few sentences. Use judgment based on the scope of the change. |
|||
|
|||
## Compatibility, Deprecation, and Migration Plan(optional) |
|||
|
|||
- What impact (if any) will there be on existing users? |
|||
- If we are changing behaviors how will we phase out the older behavior? |
|||
- If we need special migration tools, describe them here. |
|||
- When will we remove the existing behavior? |
|||
|
|||
## Test Plan(required) |
|||
|
|||
Describe in a few sentences how the MEP will be tested. We are mostly interested in system tests (since unit tests are specific to implementation details). How will we know that the implementation works as expected? How will we know nothing broke? |
|||
|
|||
## Rejected Alternatives(optional) |
|||
|
|||
If there are alternative ways of accomplishing the same thing, what were they? The purpose of this section is to motivate why the design is the way it is and not some other ways. |
|||
|
|||
## References(optional) |
|||
|
|||
Briefly list all references |
@ -0,0 +1,164 @@ |
|||
# DataNode Recovery Design |
|||
|
|||
update: 5.21.2021, by [Goose](https://github.com/XuanYang-cn) |
|||
update: 6.03.2021, by [Goose](https://github.com/XuanYang-cn) |
|||
update: 6.21.2021, by [Goose](https://github.com/XuanYang-cn) |
|||
|
|||
## What's DataNode? |
|||
|
|||
DataNode processes insert data and persists insert data into storage. |
|||
|
|||
DataNode is based on flowgraphs; each flowgraph serves exactly one vchannel. A vchannel is a FIFO log stream that carries data definition language (DDL) messages,
data manipulation language (DML) messages, and timetick messages.
|||
|
|||
One vchannel only contains DML messages of one collection. A collection consists of many segments, hence |
|||
a vchannel contains DML messages of many segments. **Most importantly, the DML messages of the same segment can appear anywhere in vchannel.** |
|||
|
|||
## What is the real meaning of DataNode recovery? |
|||
|
|||
DataNode is stateless, but vchannel has states. DataNode's statelessness is guaranteed by DataCoord, which |
|||
means the vchannel's state is maintained by DataCoord. So DataNode recovery is no different from starting. |
|||
|
|||
So what's DataNode's starting procedure? |
|||
|
|||
## Objectives |
|||
|
|||
### 1. Service Registration |
|||
|
|||
DataNode registers itself to etcd after its gRPC server starts, in the *INITIALIZING* state.
|||
|
|||
### 2. Service Discovery |
|||
|
|||
DataNode discovers DataCoord and RootCoord, in *HEALTHY* and *IDLE* state. |
|||
|
|||
### 3. Flowgraph Recovery |
|||
|
|||
The detailed design can be found at [datanode flowgraph recovery design](20210604-datanode_flowgraph_recovery_design.md). |
|||
|
|||
After DataNode subscribes to a stateful vchannel, DataNode starts to work, or more specifically, flowgraph starts to work. |
|||
|
|||
A vchannel is stateful because we don't want to process what has already been processed twice; a "processed" message means it has
already been persisted. In DataNode's terminology, a message is processed once it has been flushed.
|||
|
|||
DataCoord tells DataNode stateful vchannel info through RPC `WatchDmChannels` so that DataNode won't process |
|||
the same messages over and over again. So flowgraph needs the ability to consume messages in the middle of a vchannel. |
|||
|
|||
DataNode tells DataCoord vchannel states after each flush through RPC `SaveBinlogPaths`, so that DataCoord |
|||
keeps the vchannel states updated. |
|||
|
|||
|
|||
## Some interface/proto designs below are outdated, will be updated soon |
|||
|
|||
### 1. DataNode no longer interacts with etcd except service registering |
|||
|
|||
#### DataCoord rather than DataNode saves binlog paths into etcd |
|||
|
|||
 |
|||
|
|||
|
|||
##### DataCoord RPC Design |
|||
|
|||
```proto |
|||
rpc SaveBinlogPaths(SaveBinlogPathsRequest) returns (common.Status){} |
|||
message ID2PathList { |
|||
int64 ID = 1; |
|||
repeated string Paths = 2; |
|||
} |
|||
|
|||
message CheckPoint { |
|||
int64 segmentID = 1; |
|||
msgpb.MsgPosition position = 2; |
|||
int64 num_of_rows = 3; |
|||
} |
|||
|
|||
message SaveBinlogPathsRequest { |
|||
common.MsgBase base = 1; |
|||
int64 segmentID = 2; |
|||
int64 collectionID = 3; |
|||
repeated ID2PathList field2BinlogPaths = 4; |
|||
  repeated CheckPoint checkPoints = 5;
|||
repeated SegmentStartPosition start_positions = 6; |
|||
bool flushed = 7; |
|||
} |
|||
``` |
|||
|
|||
### 4. DataNode with collection with flowgraph with vchannel designs |
|||
|
|||
#### The winner |
|||
 |
|||
|
|||
 |
|||
|
|||
**O4-1.** DataNode scales flowgraph 2 Day |
|||
|
|||
Change `WatchDmChannelsRequest` proto. |
|||
|
|||
``` proto |
|||
message VchannelInfo { |
|||
int64 collectionID = 1; |
|||
string channelName = 2; |
|||
msgpb.MsgPosition seek_position = 3; |
|||
repeated SegmentInfo unflushedSegments = 4; |
|||
repeated int64 flushedSegments = 5; |
|||
} |
|||
|
|||
message WatchDmChannelsRequest { |
|||
common.MsgBase base = 1; |
|||
repeated VchannelInfo vchannels = 2; |
|||
} |
|||
``` |
|||
|
|||
DataNode consists of multiple DataSyncService, each service controls one flowgraph. |
|||
|
|||
```go |
|||
// DataNode |
|||
type DataNode struct { |
|||
... |
|||
vchan2Sync map[string]*dataSyncService |
|||
vchan2FlushCh map[string]chan<- *flushMsg |
|||
|
|||
clearSignal chan UniqueID |
|||
... |
|||
} |
|||
|
|||
// DataSyncService |
|||
type dataSyncService struct { |
|||
ctx context.Context |
|||
fg *flowgraph.TimeTickedFlowGraph |
|||
flushChan <-chan *flushMsg |
|||
replica Replica |
|||
idAllocator allocatorInterface |
|||
msFactory msgstream.Factory |
|||
collectionID UniqueID |
|||
} |
|||
``` |
|||
|
|||
DataNode Init -> Register to etcd -> Discover data service -> Discover master service -> IDLE
|||
|
|||
WatchDmChannels -> new dataSyncService -> HEALTH |
|||
|
|||
`WatchDmChannels:` |
|||
|
|||
1. If `DataNode.vchan2Sync` is empty, DataNode is in IDLE; `WatchDmChannels` will create a new dataSyncService for every unique vchannel, after which DataNode is in HEALTH (see the sketch below).
|||
2. If vchannel name of `VchannelPair` is not in `DataNode.vchan2Sync`, create a new dataSyncService. |
|||
3. If vchannel name of `VchannelPair` is in `DataNode.vchan2Sync`, ignore. |
|||
|
|||
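
A self-contained Go sketch of the dispatch rules listed above, with simplified stand-ins for the real types: a dataSyncService is created only for vchannels that are not yet in `vchan2Sync`, and vchannels that are already watched are ignored.

```go
package main

import "fmt"

type flushMsg struct{ segmentID int64 }

type dataSyncService struct{ vchannel string }

type DataNode struct {
	vchan2Sync    map[string]*dataSyncService
	vchan2FlushCh map[string]chan<- *flushMsg
}

// WatchDmChannels applies the rules above: one dataSyncService per unique
// vchannel; vchannels that are already being watched are ignored.
func (node *DataNode) WatchDmChannels(vchannels []string) {
	for _, name := range vchannels {
		if _, ok := node.vchan2Sync[name]; ok {
			continue // already watched, ignore
		}
		ch := make(chan *flushMsg, 1)
		node.vchan2Sync[name] = &dataSyncService{vchannel: name}
		node.vchan2FlushCh[name] = ch
	}
}

func main() {
	node := &DataNode{
		vchan2Sync:    map[string]*dataSyncService{},
		vchan2FlushCh: map[string]chan<- *flushMsg{},
	}
	node.WatchDmChannels([]string{"vchan-0", "vchan-1", "vchan-0"})
	fmt.Println("flowgraphs:", len(node.vchan2Sync)) // 2
}
```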
|||
|
|||
#### The boring design |
|||
|
|||
• If collection:flowgraph = 1 : 1, datanode must have the ability to scale flowgraph. |
|||
|
|||
 |
|||
|
|||
• **[Winner]** If collection:flowgraph = 1 : n, flowgraph:vchannel = 1 : 1
|||
|
|||
 |
|||
|
|||
• If collection:flowgraph = n : 1, in the blue cases, datanode must have the ability to scale flowgraph. In the brown cases, flowgraph must be able to scale channels. |
|||
|
|||
 |
|||
|
|||
• If collection:flowgraph = n : n , load balancing on vchannels. |
|||
|
|||
 |
@ -0,0 +1,86 @@ |
|||
# DataNode Flowgraph Recovery Design |
|||
|
|||
update: 6.4.2021, by [Goose](https://github.com/XuanYang-cn) |
|||
update: 6.21.2021, by [Goose](https://github.com/XuanYang-cn) |
|||
|
|||
## 1. Common Sense |
|||
|
|||
A. One message stream maps to one vchannel, so there is one start and one end position in each message pack.
|||
|
|||
B. DataNode updates every segment's position only when it flushes.
   An optimization: update only the positions of

1. the segment currently being flushed;
2. the StartPosition of segments that have never been flushed.
|||
|
|||
C. DataNode auto-flush is a valid flush. |
|||
|
|||
D. DDL messages are now in DML Vchannels. |
|||
|
|||
## 2. Segments in Flowgraph |
|||
|
|||
 |
|||
|
|||
## 3. Flowgraph Recovery |
|||
|
|||
### A. Save checkpoints |
|||
|
|||
When a flowgraph flushes a segment, we need to save these things: |
|||
|
|||
- current segment's binlog paths. |
|||
- current segment positions. |
|||
- all other segments' current positions from the replica (If a segment hasn't been flushed, save the position when DataNode first meets it). |
|||
|
|||
Depending on whether the save succeeds:

- If it succeeds, the flowgraph updates all segments' positions in the replica.
- If not:
  - For a gRPC failure (which only surfaces after many internal retries), the DataNode crashes itself.
  - For a normal failure, retry the save 10 times; if it still fails, the DataNode crashes itself.
|||
|
|||
### B. Recovery from a set of checkpoints |
|||
|
|||
1. We need all positions of all segments in this vchannel `p1, p2, ... pn`. |
|||
|
|||
Proto design for WatchDmChannelReq: |
|||
|
|||
```proto |
|||
message VchannelInfo { |
|||
int64 collectionID = 1; |
|||
string channelName = 2; |
|||
msgpb.MsgPosition seek_position = 3; |
|||
repeated SegmentInfo unflushedSegments = 4; |
|||
repeated int64 flushedSegments = 5; |
|||
} |
|||
|
|||
message WatchDmChannelsRequest { |
|||
common.MsgBase base = 1; |
|||
repeated VchannelInfo vchannels = 2; |
|||
} |
|||
``` |
|||
|
|||
2. We want to filter msgPacks based on these positions. |
|||
|
|||
 |
|||
|
|||
Supposing we have segments `s1, s2, s3`, corresponding positions `p1, p2, p3` |
|||
|
|||
- Sort positions in reverse order `p3, p2, p1` |
|||
- Get segments dup range time: `s3 ( p3 > mp_px > p1)`, `s2 (p2 > mp_px > p1)`, `s1(zero)` |
|||
- Seek from the earliest, in this example `p1` |
|||
- Then for every msgPack after seeking `p1`, the pseudocode: |
|||
|
|||
```go |
|||
const filter_threshold = recovery_time |
|||
// mp means msgPack |
|||
for mp := seeking(p1) { |
|||
if mp.position.endtime < filter_threshold { |
|||
if mp.position < p3 { |
|||
filter s3 |
|||
} |
|||
if mp.position < p2 { |
|||
filter s2 |
|||
} |
|||
} |
|||
} |
|||
``` |
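
The following self-contained Go sketch implements the filtering rule from the pseudocode above (omitting the `filter_threshold` time bound for brevity): after seeking from the earliest position `p1`, a message is dropped if its segment's saved position is at or after the message, since that data has already been persisted. Types and field names are simplified for illustration.

```go
package main

import "fmt"

// msg is a simplified DML message carried by the vchannel.
type msg struct {
	segmentID int64
	position  uint64 // timestamp-like position in the vchannel
}

// filterFlushed drops messages that are at or before the saved position of
// their segment (p1, p2, p3 in the example above); everything newer passes.
func filterFlushed(packs []msg, checkpoints map[int64]uint64) []msg {
	var out []msg
	for _, m := range packs {
		if cp, ok := checkpoints[m.segmentID]; ok && m.position <= cp {
			continue // already persisted before the checkpoint, skip
		}
		out = append(out, m)
	}
	return out
}

func main() {
	checkpoints := map[int64]uint64{1: 10, 2: 20, 3: 30} // s1:p1, s2:p2, s3:p3
	packs := []msg{{1, 11}, {2, 15}, {3, 25}, {3, 31}}
	fmt.Println(filterFlushed(packs, checkpoints)) // keeps {1 11} and {3 31}
}
```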
@ -0,0 +1,199 @@ |
|||
# 8. IndexCoord Design |
|||
|
|||
update: 7.31.2021, by [Cai.Zhang](https://github.com/xiaocai2333) |
|||
|
|||
## 8.0 Component Description |
|||
|
|||
IndexCoord is a component responsible for scheduling index construction tasks and maintaining index status. IndexCoord accepts requests from rootCoord to build indexes, delete indexes, and query index information. IndexCoord is responsible for assigning IndexBuildID to the request to build the index, and forwarding the request to build the index to IndexNode. IndexCoord records the status of the index, and the index file. |
|||
|
|||
The following figure shows the design of the indexCoord component: |
|||
|
|||
 |
|||
|
|||
## 8.1 Use etcd as a reliable service |
|||
|
|||
IndexCoord, like the other Milvus components, relies on etcd to implement |
|||
service discovery. IndexCoord relies on the lease mechanism of etcd to sense the online and offline news of IndexNode. |
|||
|
|||
In addition to service discovery, Milvus also uses etcd as a reliable meta storage and writes all |
|||
persistent status information to etcd. The purpose is to restore a certain Milvus component to its original |
|||
state after power off and restart. |
|||
|
|||
## 8.2 Receive requests about index from RootCoord |
|||
|
|||
IndexCoord receives requests from RootCoord to build an index, delete an index, and query the status of an index. |
|||
|
|||
In Milvus, index building is performed asynchronously. When IndexCoord receives a request to build an index from |
|||
RootCoord, it will first check whether the same index has been created according to the index parameters. If yes, it would |
|||
return the IndexBuildID of the existing task. Otherwise, it would assign a globally unique IndexBuildID to the task, |
|||
record the task in the MetaTable, write the MetaTable to etcd, and then return the IndexBuildID to RootCoord. |
|||
RootCoord confirms that the index building is generated successfully by the IndexBuildID. At this time, the index construction |
|||
is not completed yet. IndexCoord starts a background process to find all the index tasks that need to be |
|||
allocated periodically and then allocates them to IndexNode for actual execution. |
|||
|
|||
When IndexCoord receives a request to delete an index from RootCoord, IndexCoord traverses the MetaTable, |
|||
marks the corresponding index task as deleted, and returns. It is not deleted from the MetaTable at this time. |
|||
IndexCoord has another background process that periodically queries the index tasks that need to be deleted. |
|||
When the index task is marked as deleted, and the index status is complete, the corresponding index task is actually |
|||
deleted from the MetaTable. |
|||
|
|||
When IndexCoord receives a query index status request from other components, it will first check whether the corresponding |
|||
index task is marked for deletion in the MetaTable. If marked for deletion, it returns that index does not exist, otherwise, |
|||
it returns the index information. |
|||
|
|||
## 8.3 Feature Design |
|||
|
|||
IndexCoord has two main structures, NodeManager and MetaTable. NodeManager is used to manage IndexNode node information, |
|||
and MetaTable is used to maintain index-related information. |
|||
|
|||
IndexCoord mainly has these functions: |
|||
|
|||
`watchNodeLoop` is mainly responsible for monitoring the changes of IndexNode nodes; |
|||
|
|||
`watchMetaLoop` is mainly responsible for monitoring the changes of Meta; |
|||
|
|||
`assignTaskLoop` is mainly responsible for assigning index building tasks; |
|||
|
|||
`recycleUnusedIndexFiles` is mainly responsible for cleaning up useless index files and deleted index records; |
|||
|
|||
### 8.3.1 The relationship between IndexCoord and IndexNode |
|||
|
|||
IndexCoord is responsible for assigning index construction tasks and maintaining index status. |
|||
|
|||
IndexNode is a node that executes index building tasks. |
|||
|
|||
### 8.3.2 NodeManager |
|||
|
|||
NodeManager is responsible for managing the node information of IndexNode, and contains a priority queue to save the load information of each IndexNode. |
|||
The load information of IndexNode is based on the number of tasks executed. When the IndexCoord service starts, it first obtains the node information of all current IndexNodes from etcd, |
|||
and then adds the node information to the NodeManager. After that, the online and offline information of IndexNode node is obtained from watchNodeLoop. |
|||
Then it will traverse the entire MetaTable, get the load information corresponding to each IndexNode node, and update the priority queue in the NodeManager. |
|||
When an index building task needs to be allocated, the IndexNode with the lowest load will be selected according to the priority queue to execute the task. |
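
As a concrete illustration of this selection rule, here is a minimal Go sketch that picks the least-loaded IndexNode. The real NodeManager keeps a priority queue; a linear scan over a load map is enough to show the idea, and the types here are stand-ins.

```go
package main

import "fmt"

// pickLeastLoaded returns the IndexNode with the fewest in-flight tasks,
// which is the selection rule NodeManager's priority queue implements.
func pickLeastLoaded(load map[int64]int) (nodeID int64, ok bool) {
	best := -1
	for id, n := range load {
		if best < 0 || n < best {
			nodeID, best, ok = id, n, true
		}
	}
	return nodeID, ok
}

func main() {
	load := map[int64]int{101: 3, 102: 1, 103: 2} // IndexNode ID -> task count
	id, _ := pickLeastLoaded(load)
	fmt.Println("assign next task to IndexNode", id) // 102
}
```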
|||
|
|||
### 8.3.3 MetaTable |
|||
|
|||
To maintain the status information of the index, we introduced MetaTable to record the status information |
|||
of the index. In order to ensure that the MetaTable information is not lost after IndexCoord is powered off and |
|||
restarted, we write the MetaTable information into etcd. When the IndexCoord service starts, it will first load the |
|||
existing Meta information from etcd, and then monitor the changes of Meta through watchNodeLoop. In order to distinguish |
|||
whether the modification of Meta was initiated by IndexCoord or IndexNode, the revision was introduced in Meta. |
|||
When watchMetaLoop detects that the Meta in etcd is updated, compare the revision in Meta with the Event.Kv.Version |
|||
of the etcd event. If the revision equals to Event.Kv.Version, it means that the update was initiated by IndexCoord. |
|||
If the revision is less than Event.Kv.Version, it means that this Meta update was initiated by IndexNode, and IndexCoord |
|||
needs to update Meta. There will be no situation where revision is greater than Event.Kv.Version. |
|||
|
|||
To prevent an IndexNode from falling into a "suspended animation" (zombie) state unnoticed, a Version is introduced. When IndexCoord
finds that an IndexNode is offline, it reassigns the unfinished tasks that the IndexNode was responsible for to other IndexNodes
and increases the Version by 1. If the original IndexNode later completes the task and finds that the Version recorded for the task is
already larger than the Version of the task it was executing, the Meta is not updated.
|||
|
|||
### 8.3.4 watchNodeLoop |
|||
|
|||
`watchNodeLoop` is used to monitor IndexNode going online and offline. When IndexNode goes online and offline, |
|||
IndexCoord adds or deletes the corresponding IndexNode information in NodeManager. |
|||
|
|||
### 8.3.5 watchMetaLoop |
|||
|
|||
`watchMetaLoop` is used to monitor whether the Meta in etcd has been changed. When the Meta in etcd is monitored, |
|||
the result of the Meta update is obtained from etcd, and the `Event.Kv.Version` of the update event is compared |
|||
with the `revision` in the MetaTable. If the `Event.Kv.Version` is greater than the `revision` in the MetaTable, |
|||
it means that this update is initiated by IndexNode, and then updates the MetaTable in IndexCoord. Since this update |
|||
is initiated by IndexNode, it indicates that this IndexNode has completed this task, so update the load of this |
|||
IndexNode in NodeManager, and the task amount is reduced by one. |
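
A minimal sketch of the comparison rule used by `watchMetaLoop`, as described above. Only the `revision` vs `Event.Kv.Version` decision is taken from the text; everything else is illustrative.

```go
package main

import "fmt"

// metaUpdateSource tells whether an IndexMeta update observed in etcd was
// written by IndexCoord itself or by an IndexNode, following the rule above:
// equal versions -> IndexCoord, event version greater -> IndexNode.
func metaUpdateSource(metaRevision, eventKvVersion int64) string {
	switch {
	case eventKvVersion == metaRevision:
		return "IndexCoord" // our own write, nothing to do
	case eventKvVersion > metaRevision:
		return "IndexNode" // a node finished a task: reload Meta, decrease its load by one
	default:
		return "unexpected" // the text states this case cannot happen
	}
}

func main() {
	fmt.Println(metaUpdateSource(3, 3)) // IndexCoord
	fmt.Println(metaUpdateSource(3, 4)) // IndexNode
}
```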
|||
|
|||
### 8.3.6 assignTaskLoop |
|||
|
|||
`assignTaskLoop` is used to assign index construction tasks. A timer regularly traverses the MetaTable
to filter out the tasks that need to be allocated, including unallocated tasks and tasks that failed because an
IndexNode crashed. The tasks are then sorted by Version, and tasks with a smaller
Version are assigned first. The purpose is to prevent certain tasks from occupying resources indefinitely while never
executing successfully. When a task is assigned, its Version is increased by one. The task is then sent to an
IndexNode for execution, and the index status is updated in the MetaTable.
|||
|
|||
### 8.3.7 recycleUnusedIndexFiles |
|||
|
|||
Delete useless index files, including lower-version index files and the index files of deleted indexes.
To track whether the low-version index files of an index have been cleaned up, a `recycled` flag is
introduced as a mark. Only after the index task is completed are the lower-version index files cleaned up,
and the index is then marked as recycled.
|||
|
|||
This is also a timer, which periodically traverses the MetaTable to obtain the index corresponding to the index file |
|||
that need to be cleaned up. If the index is marked as deleted, the information corresponding to the index is deleted |
|||
in the MetaTable. Otherwise, only the lower version index file is cleaned up. |
|||
|
|||
## 8.4 IndexNode Create Index |
|||
|
|||
IndexNode is the execution node of index building tasks, and all index building tasks are forwarded to IndexNode by |
|||
IndexCoord for execution. When IndexNode executes an index build request, it first reads IndexMeta information |
|||
from etcd, and checks whether the index task is marked for deletion when IndexCoord is forwarded to IndexNode. |
|||
If it is marked as deleted, then there is no need to actually build the index, just mark the index task status as |
|||
completed, and then write it to etcd. When IndexCoord perceives that the status corresponding to the index is |
|||
complete, it deletes the index task from the MetaTable. If it is checked that the index is not marked for deletion, |
|||
then the index needs to be built. The original data must be loaded first when building the index. The original data |
|||
is stored in MinIO/S3, and the storage path is notified by RootCoord in the index build request. After loading the |
|||
original data, the data is deserialized into data blocks, and then cgo is called to build the index. When the index is |
|||
built, the index data is serialized into data blocks, and then written into the file. The directory organization of the |
|||
index file is "indexBuildID/IndexTaskVersion/partitionID/segmentID/key", where key corresponds to the serialized key |
|||
of the index data. After the index is built, the index file directory is recorded in IndexMeta, which is then written to etcd.
|||
|
|||
## 8.5 API |
|||
|
|||
### 8.5.1 BuildIndex |
|||
|
|||
Index building is asynchronous, so when an index building request comes, an IndexBuildID is assigned to the task, and |
|||
the task is recorded in Meta. The background process assignTaskLoop will find this task and assign it to IndexNode for |
|||
execution. |
|||
|
|||
The following figure shows the state machine of IndexTask during execution: |
|||
|
|||
 |
|||
|
|||
### 8.5.2 DropIndex |
|||
|
|||
DropIndex deletes an index based on IndexID. One IndexID corresponds to the index of an entire column. A column is |
|||
divided into many segments, and each segment corresponds to an IndexBuildID. IndexCoord uses IndexBuildID to record |
|||
index tasks. Therefore, when DropIndex, delete all tasks corresponding to IndexBuildID corresponding to IndexID. |
|||
|
|||
## 8.6 Key Term |
|||
|
|||
### 8.6.1 Meta |
|||
|
|||
```go |
|||
type Meta struct { |
|||
indexMeta *indexpb.IndexMeta |
|||
revision int64 |
|||
} |
|||
``` |
|||
|
|||
Meta is used to record the state of the index. |
|||
|
|||
- Revision: The number of times IndexMeta has been changed in etcd. It's the same as Event.Kv.Version in etcd. |
|||
  When IndexCoord watches an IndexMeta change in etcd, it can compare `revision` with `Event.Kv.Version` to determine whether
  the modification of IndexMeta was made by IndexCoord or by IndexNode. If it was made by IndexNode, the Meta in
  IndexCoord must be updated.
|||
|
|||
### 8.6.2 IndexMeta |
|||
|
|||
```ProtoBuf |
|||
message IndexMeta { |
|||
int64 indexBuildID = 1; |
|||
common.IndexState state = 2; |
|||
string fail_reason = 3; |
|||
BuildIndexRequest req = 4; |
|||
repeated string index_file_paths = 5; |
|||
bool mark_deleted = 6; |
|||
int64 nodeID = 7; |
|||
int64 version = 8; |
|||
bool recycled = 9; |
|||
} |
|||
``` |
|||
|
|||
- indexBuildID: ID of the index task. |
|||
- state: The state of the index. |
|||
- fail_reason: The reason why the index build failed. |
|||
- req: The request for the building index. |
|||
- index_file_paths: The paths of index files. |
|||
- mark_deleted: Mark whether the index has been deleted. |
|||
- nodeID: ID of the IndexNode that built the index. |
|||
- version: Number of retries for the index. |
|||
- recycled: Mark whether the unused files of the index have been cleaned up. |
@ -0,0 +1,163 @@ |
|||
# Flush Collection |
|||
The `Flush` operation is used to make sure that inserted data will be written into persistent storage. This document will introduce how the `Flush` operation works in `Milvus 2.0`. The following figure shows the execution flow of `Flush`. |
|||
|
|||
 |
|||
|
|||
1. Firstly, `SDK` sends a `Flush` request to `Proxy` via `Grpc`, the `proto` is defined as follows: |
|||
```proto |
|||
service MilvusService { |
|||
... |
|||
rpc Flush(FlushRequest) returns (FlushResponse) {} |
|||
... |
|||
} |
|||
|
|||
message FlushRequest { |
|||
common.MsgBase base = 1; |
|||
string db_name = 2; |
|||
repeated string collection_names = 3; |
|||
} |
|||
|
|||
message FlushResponse{ |
|||
common.Status status = 1; |
|||
string db_name = 2; |
|||
map<string, schema.LongArray> coll_segIDs = 3; |
|||
} |
|||
``` |
|||
|
|||
|
|||
2. When `Proxy` receives a `Flush` request, it would wrap this request into a `FlushTask`, and push this task into the `DdTaskQueue`. After that, `Proxy` would call `WaitToFinish` to wait until the task is finished.
|||
```go |
|||
type task interface { |
|||
TraceCtx() context.Context |
|||
ID() UniqueID // return ReqID |
|||
SetID(uid UniqueID) // set ReqID |
|||
Name() string |
|||
Type() commonpb.MsgType |
|||
BeginTs() Timestamp |
|||
EndTs() Timestamp |
|||
SetTs(ts Timestamp) |
|||
OnEnqueue() error |
|||
PreExecute(ctx context.Context) error |
|||
Execute(ctx context.Context) error |
|||
PostExecute(ctx context.Context) error |
|||
WaitToFinish() error |
|||
Notify(err error) |
|||
} |
|||
|
|||
type FlushTask struct { |
|||
Condition |
|||
*milvuspb.FlushRequest |
|||
ctx context.Context |
|||
dataCoord types.DataCoord |
|||
result *milvuspb.FlushResponse |
|||
} |
|||
``` |
|||
|
|||
3. There is a background service in `Proxy`. This service gets `FlushTask` from `DdTaskQueue`, and executes in three phases: |
|||
- `PreExecute` |
|||
|
|||
`FlushTask` does nothing at this phase, and returns directly |
|||
|
|||
- `Execute` |
|||
|
|||
`Proxy` sends a `Flush` request to `DataCoord` via `Grpc`, and waits for the response, the `proto` is defined as follows: |
|||
```proto |
|||
service DataCoord { |
|||
... |
|||
rpc Flush(FlushRequest) returns (FlushResponse) {} |
|||
... |
|||
} |
|||
|
|||
message FlushRequest { |
|||
common.MsgBase base = 1; |
|||
int64 dbID = 2; |
|||
int64 collectionID = 4; |
|||
} |
|||
|
|||
message FlushResponse { |
|||
common.Status status = 1; |
|||
int64 dbID = 2; |
|||
int64 collectionID = 3; |
|||
repeated int64 segmentIDs = 4; |
|||
} |
|||
``` |
|||
- `PostExecute` |
|||
|
|||
`FlushTask` does nothing at this phase, and returns directly |
|||
|
|||
4. After receiving a `Flush` request from `Proxy`, `DataCoord` would call `SealAllSegments` to seal all the growing segments belonging to this `Collection`, and would not allocate new `ID`s for these segments anymore. After that, `DataCoord` would send a response to `Proxy`, which contains all the sealed segment `ID`s. |
|||
|
|||
5. In `Milvus 2.0`, `Flush` is an asynchronous operation. So when `SDK` receives the response of `Flush`, it only means that the `DataCoord` has sealed these segments. There are 2 problems that we have to solve. |
|||
- The sealed segments might still be in memory and not yet written into persistent storage.
- `DataCoord` would no longer allocate new `ID`s for these sealed segments, but how can we make sure that all the allocated `ID`s have been consumed by `DataNode`?
|||
|
|||
|
|||
6. For the first problem, `SDK` should send `GetSegmentInfo` requests to `DataCoord` periodically until all sealed segments are in the `Flushed` state (a sketch of such a polling loop follows the `proto` below). The `proto` is defined as follows.
|||
```proto |
|||
service DataCoord { |
|||
... |
|||
rpc GetSegmentInfo(GetSegmentInfoRequest) returns (GetSegmentInfoResponse) {} |
|||
... |
|||
} |
|||
|
|||
message GetSegmentInfoRequest { |
|||
common.MsgBase base = 1; |
|||
repeated int64 segmentIDs = 2; |
|||
} |
|||
|
|||
message GetSegmentInfoResponse { |
|||
common.Status status = 1; |
|||
repeated SegmentInfo infos = 2; |
|||
} |
|||
|
|||
message SegmentInfo { |
|||
int64 ID = 1; |
|||
int64 collectionID = 2; |
|||
int64 partitionID = 3; |
|||
string insert_channel = 4; |
|||
int64 num_of_rows = 5; |
|||
common.SegmentState state = 6; |
|||
msgpb.MsgPosition dml_position = 7; |
|||
int64 max_row_num = 8; |
|||
uint64 last_expire_time = 9; |
|||
msgpb.MsgPosition start_position = 10; |
|||
} |
|||
|
|||
enum SegmentState { |
|||
SegmentStateNone = 0; |
|||
NotExist = 1; |
|||
Growing = 2; |
|||
Sealed = 3; |
|||
Flushed = 4; |
|||
Flushing = 5; |
|||
} |
|||
|
|||
``` |
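
A sketch of the client-side polling loop implied by step 6. The `getSegmentInfo` callback is a hypothetical stand-in for the `GetSegmentInfo` RPC; the loop simply waits until every sealed segment reports the `Flushed` state.

```go
package main

import (
	"errors"
	"fmt"
	"time"
)

// SegmentState mirrors the enum above; only the value we care about is listed.
type SegmentState int

const Flushed SegmentState = 4

// waitForFlushed polls until every sealed segment is in the Flushed state.
func waitForFlushed(segmentIDs []int64,
	getSegmentInfo func(ids []int64) (map[int64]SegmentState, error),
	interval, timeout time.Duration) error {

	deadline := time.Now().Add(timeout)
	for time.Now().Before(deadline) {
		states, err := getSegmentInfo(segmentIDs)
		if err != nil {
			return err
		}
		done := true
		for _, id := range segmentIDs {
			if states[id] != Flushed {
				done = false
				break
			}
		}
		if done {
			return nil
		}
		time.Sleep(interval)
	}
	return errors.New("flush did not complete before the timeout")
}

func main() {
	// A fake RPC that reports every segment as Flushed immediately.
	fake := func(ids []int64) (map[int64]SegmentState, error) {
		states := map[int64]SegmentState{}
		for _, id := range ids {
			states[id] = Flushed
		}
		return states, nil
	}
	fmt.Println(waitForFlushed([]int64{1, 2}, fake, 100*time.Millisecond, time.Second)) // <nil>
}
```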
|||
|
|||
7. For the second problem, `DataNode` would report a timestamp to `DataCoord` every time it consumes a package from `MsgStream`, the `proto` is defined as follows. |
|||
|
|||
```proto |
|||
message DataNodeTtMsg { |
|||
common.MsgBase base = 1; |
|||
string channel_name = 2; |
|||
uint64 timestamp = 3; |
|||
} |
|||
``` |
|||
|
|||
8. There is a background service, `startDataNodeTsLoop`, in `DataCoord` to process the message of `DataNodeTtMsg`. |
|||
   - Firstly, `DataCoord` would extract `channel_name` from `DataNodeTtMsg`, and filter out all sealed segments that are attached to this `channel_name`
   - Compare the timestamp at which the segment entered the `Sealed` state with `DataNodeTtMsg.timestamp`. If `DataNodeTtMsg.timestamp` is greater, it means that all `ID`s belonging to that segment have been consumed by `DataNode`, so it's safe to notify `DataNode` to write that segment into persistent storage (see the sketch after the `proto` below). The `proto` is defined as follows:
|||
```proto |
|||
service DataNode { |
|||
... |
|||
rpc FlushSegments(FlushSegmentsRequest) returns(common.Status) {} |
|||
... |
|||
} |
|||
|
|||
message FlushSegmentsRequest { |
|||
common.MsgBase base = 1; |
|||
int64 dbID = 2; |
|||
int64 collectionID = 3; |
|||
repeated int64 segmentIDs = 4; |
|||
} |
|||
``` |
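
A minimal, self-contained sketch of the decision rule in `startDataNodeTsLoop`: a sealed segment on the reported channel may be flushed once the DataNode's reported timetick has passed the segment's seal time. All types here are simplified stand-ins.

```go
package main

import "fmt"

// sealedSegment records when DataCoord sealed a segment on a given channel.
type sealedSegment struct {
	ID       int64
	Channel  string
	SealedTs uint64
}

// segmentsReadyToFlush returns the sealed segments on the reported channel whose
// seal timestamp is smaller than the DataNode's reported timetick, i.e. whose
// IDs have all been consumed from the MsgStream.
func segmentsReadyToFlush(segments []sealedSegment, channel string, timetick uint64) []int64 {
	var ready []int64
	for _, s := range segments {
		if s.Channel == channel && timetick > s.SealedTs {
			ready = append(ready, s.ID)
		}
	}
	return ready
}

func main() {
	segments := []sealedSegment{
		{ID: 1, Channel: "ch-0", SealedTs: 100},
		{ID: 2, Channel: "ch-0", SealedTs: 300},
		{ID: 3, Channel: "ch-1", SealedTs: 50},
	}
	// DataNodeTtMsg{channel_name: "ch-0", timestamp: 200} arrives:
	fmt.Println(segmentsReadyToFlush(segments, "ch-0", 200)) // [1]
}
```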
@ -0,0 +1,206 @@ |
|||
# Drop Collection |
|||
|
|||
`Milvus 2.0` uses `Collection` to represent a set of data, like `Table` in traditional database. Users can create or drop `Collection`. |
|||
This article introduces the execution path of `Drop Collection`. At the end of this article, you should know which components are involved in `Drop Collection`. |
|||
|
|||
The execution flow of `Drop Collection` is shown in the following figure: |
|||
|
|||
 |
|||
|
|||
1. Firstly, `SDK` sends a `DropCollection` request to `Proxy` via `Grpc`, the `proto` is defined as follows: |
|||
|
|||
```proto |
|||
service MilvusService { |
|||
... |
|||
|
|||
rpc DropCollection(DropCollectionRequest) returns (common.Status) {} |
|||
|
|||
... |
|||
} |
|||
|
|||
message DropCollectionRequest { |
|||
// Not useful for now |
|||
common.MsgBase base = 1; |
|||
// Not useful for now |
|||
string db_name = 2; |
|||
// Required, the collection name in milvus |
|||
string collection_name = 3; |
|||
} |
|||
``` |
|||
|
|||
2. Once the `DropCollection` request is received, the `Proxy` would wrap this request into `DropCollectionTask`, and push this task into `DdTaskQueue` queue. After that, `Proxy` would call `WaitToFinish` method to wait until the task is finished. |
|||
|
|||
```go |
|||
type task interface { |
|||
TraceCtx() context.Context |
|||
ID() UniqueID // return ReqID |
|||
SetID(uid UniqueID) // set ReqID |
|||
Name() string |
|||
Type() commonpb.MsgType |
|||
BeginTs() Timestamp |
|||
EndTs() Timestamp |
|||
SetTs(ts Timestamp) |
|||
OnEnqueue() error |
|||
PreExecute(ctx context.Context) error |
|||
Execute(ctx context.Context) error |
|||
PostExecute(ctx context.Context) error |
|||
WaitToFinish() error |
|||
Notify(err error) |
|||
} |
|||
|
|||
type DropCollectionTask struct { |
|||
Condition |
|||
*milvuspb.DropCollectionRequest |
|||
ctx context.Context |
|||
rootCoord types.RootCoord |
|||
result *commonpb.Status |
|||
chMgr channelsMgr |
|||
chTicker channelsTimeTicker |
|||
} |
|||
``` |
|||
|
|||
3. There is a background service in `Proxy`, this service would get the `DropCollectionTask` from `DdTaskQueue`, and execute it in three phases: |
|||
|
|||
- `PreExecute`, do some static checking at this phase, such as check if `Collection Name` is legal etc. |
|||
   - `Execute`, at this phase, `Proxy` would send a `DropCollection` request to `RootCoord` via `Grpc` and wait for the response; the `proto` is defined as below:
|||
|
|||
```proto |
|||
service RootCoord { |
|||
... |
|||
|
|||
rpc DropCollection(milvus.DropCollectionRequest) returns (common.Status) {} |
|||
|
|||
... |
|||
} |
|||
``` |
|||
|
|||
- `PostExecute`, `Proxy` would delete `Collection`'s meta from global meta table at this phase. |
|||
|
|||
4. `RootCoord` would wrap the `DropCollection` request into `DropCollectionReqTask`, and then call function `executeTask`. `executeTask` would not return until the `context` is done or `DropCollectionReqTask.Execute` returns (a sketch of this wait pattern follows the code below).
|||
|
|||
```go |
|||
type reqTask interface { |
|||
Ctx() context.Context |
|||
Type() commonpb.MsgType |
|||
Execute(ctx context.Context) error |
|||
Core() *Core |
|||
} |
|||
|
|||
type DropCollectionReqTask struct { |
|||
baseReqTask |
|||
Req *milvuspb.DropCollectionRequest |
|||
} |
|||
``` |
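
A sketch of how `executeTask` can wait for either the task to finish or the context to be done, matching the behavior described in step 4. This is a generic Go pattern with illustrative types, not the actual RootCoord code.

```go
package main

import (
	"context"
	"errors"
	"fmt"
	"time"
)

type reqTask interface {
	Ctx() context.Context
	Execute(ctx context.Context) error
}

// executeTask runs t.Execute in a goroutine and returns as soon as either the
// task finishes or the task's context is done, whichever happens first.
func executeTask(t reqTask) error {
	errCh := make(chan error, 1)
	go func() { errCh <- t.Execute(t.Ctx()) }()
	select {
	case err := <-errCh:
		return err
	case <-t.Ctx().Done():
		return errors.New("executeTask: context done before task finished")
	}
}

type dropCollectionReqTask struct{ ctx context.Context }

func (t *dropCollectionReqTask) Ctx() context.Context { return t.ctx }
func (t *dropCollectionReqTask) Execute(ctx context.Context) error {
	time.Sleep(10 * time.Millisecond) // pretend to delete meta, send messages, ...
	return nil
}

func main() {
	ctx, cancel := context.WithTimeout(context.Background(), time.Second)
	defer cancel()
	fmt.Println(executeTask(&dropCollectionReqTask{ctx: ctx})) // <nil>
}
```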
|||
|
|||
5. Firstly, `RootCoord` would delete `Collection`'s meta from `metaTable`, including `schema`,`partition`, `segment`,`index`. All of these delete operations are committed in one transaction. |
|||
|
|||
6. After `Collection`'s meta has been deleted from `metaTable`, `Milvus` would consider this collection has been deleted successfully. |
|||
|
|||
7. `RootCoord` would alloc a timestamp from `TSO` before deleting `Collection`'s meta from `metaTable`. This timestamp is considered as the point when the collection was deleted. |
|||
|
|||
8. `RootCoord` would send a message of `DropCollectionRequest` into `MsgStream`. Thus other components, who have subscribed to the `MsgStream`, would be notified. The `Proto` of `DropCollectionRequest` is defined as below: |
|||
|
|||
```proto |
|||
message DropCollectionRequest { |
|||
common.MsgBase base = 1; |
|||
string db_name = 2; |
|||
string collectionName = 3; |
|||
int64 dbID = 4; |
|||
int64 collectionID = 5; |
|||
} |
|||
|
|||
``` |
|||
|
|||
9. After these operations, `RootCoord` would update internal timestamp. |
|||
|
|||
10. Then `RootCoord` would send a `ReleaseCollection` request to `QueryCoord` via `Grpc`, notifying `QueryCoord` to release all resources related to this `Collection`. This `Grpc` request is issued in another `goroutine`, so it does not block the main thread. The `proto` is defined as follows:
|||
|
|||
```proto |
|||
service QueryCoord { |
|||
... |
|||
|
|||
rpc ReleaseCollection(ReleaseCollectionRequest) returns (common.Status) {} |
|||
|
|||
... |
|||
} |
|||
|
|||
message ReleaseCollectionRequest { |
|||
common.MsgBase base = 1; |
|||
int64 dbID = 2; |
|||
int64 collectionID = 3; |
|||
int64 nodeID = 4; |
|||
} |
|||
``` |
|||
|
|||
11. At last, `RootCoord` would send `InvalidateCollectionMetaCache` request to each `Proxy`, notify `Proxy` to remove `Collection`'s meta. The `proto` is defined as follows: |
|||
|
|||
```proto |
|||
service Proxy { |
|||
... |
|||
|
|||
rpc InvalidateCollectionMetaCache(InvalidateCollMetaCacheRequest) returns (common.Status) {} |
|||
|
|||
... |
|||
} |
|||
|
|||
message InvalidateCollMetaCacheRequest { |
|||
common.MsgBase base = 1; |
|||
string db_name = 2; |
|||
string collection_name = 3; |
|||
} |
|||
``` |
|||
|
|||
12. The execution flow of `QueryCoord.ReleaseCollection` is shown in the following figure: |
|||
|
|||
 |
|||
|
|||
13. `QueryCoord` would wrap `ReleaseCollection` into `ReleaseCollectionTask`, and push the task into `TaskScheduler` |
|||
|
|||
14. There is a background service in `QueryCoord`. This service would get the `ReleaseCollectionTask` from `TaskScheduler`, and execute it in three phases: |
|||
|
|||
- `PreExecute`, `ReleaseCollectionTask` would only print debug log at this phase. |
|||
- `Execute`, there are two jobs at this phase: |
|||
|
|||
     - send a `ReleaseDQLMessageStream` request to `RootCoord` via `Grpc`; `RootCoord` would redirect the `ReleaseDQLMessageStream` request to each `Proxy`, notifying the `Proxy` to stop processing any messages of this `Collection`. The `proto` is defined as follows:
|||
|
|||
```proto |
|||
message ReleaseDQLMessageStreamRequest { |
|||
common.MsgBase base = 1; |
|||
int64 dbID = 2; |
|||
int64 collectionID = 3; |
|||
} |
|||
``` |
|||
|
|||
     - send a `ReleaseCollection` request to each `QueryNode` via `Grpc`, notifying the `QueryNode` to release all the resources related to this `Collection`, including `Index`, `Segment`, `FlowGraph`, etc. `QueryNode` would no longer read any messages from this `Collection`'s `MsgStream`
|||
|
|||
```proto |
|||
service QueryNode { |
|||
... |
|||
|
|||
rpc ReleaseCollection(ReleaseCollectionRequest) returns (common.Status) {} |
|||
|
|||
... |
|||
} |
|||
|
|||
message ReleaseCollectionRequest { |
|||
common.MsgBase base = 1; |
|||
int64 dbID = 2; |
|||
int64 collectionID = 3; |
|||
int64 nodeID = 4; |
|||
} |
|||
``` |
|||
|
|||
- `PostExecute`, `ReleaseCollectionTask` would only print debug log at this phase. |
|||
|
|||
15. After these operations, `QueryCoord` would send `ReleaseCollection`'s response to `RootCoord`. |
|||
|
|||
16. At `Step 8`, `RootCoord` has sent a message of `DropCollectionRequest` into `MsgStream`. `DataNode` would subscribe this `MsgStream`, so that it would be notified to release related resources. The execution flow is shown in the following figure. |
|||
|
|||
 |
|||
|
|||
17. In `DataNode`, each `MsgStream` will have a `FlowGraph`, which processes all messages. When the `DataNode` receives the message of `DropCollectionRequest`, `DataNode` would notify `BackGroundGC`, which is a background service on `DataNode`, to release resources. |
|||
|
|||
_Notes_: |
|||
|
|||
1. Currently, `DataCoord` does not react to `DropCollection`. So the `Collection`'s `segment meta` still exists in `DataCoord`'s `metaTable`, and the `Binlog` files belonging to this `Collection` still exist in persistent storage.
2. Currently, `IndexCoord` does not react to `DropCollection`. So the `Collection`'s `index file`s still exist in persistent storage.
@ -0,0 +1,69 @@ |
|||
# Hybrid Timestamp in Milvus |
|||
|
|||
In chapter [Milvus TimeSync Mechanism](./milvus_timesync_en.md), we have already seen why we need TSO in Milvus. Milvus
adopts [TiKV's](https://github.com/tikv/tikv) TSO implementation, so if you are interested in how TSO is
implemented, you can look into the official documentation of TiKV.
|||
|
|||
This chapter will only introduce two points: |
|||
|
|||
1. the organization of hybrid TSO in Milvus; |
|||
2. how should we parse the hybrid TSO; |
|||
|
|||
## The Organization of TSO |
|||
|
|||
The type of TSO is `uint64`. As shown in the figure below, a TSO is organized in two parts:
|||
|
|||
1. physical part; |
|||
2. logical part; |
|||
|
|||
The first 46 bits are the physical part, and the last 18 bits are the logical part.

Note: the physical part is the UTC time in milliseconds.
|||
|
|||
 |
|||
|
|||
Some users, such as DBAs, want to sort the operations and list them in UTC time order.
|||
|
|||
Actually, we can use the TSO order to sort the `Insert` operations or `Delete` operations. |
|||
|
|||
So the question becomes how we get the UTC time from TSO. |
|||
|
|||
As described above, the physical part occupies the first 46 bits of the TSO.

So given a TSO returned by `Insert` or `Delete`, we can shift it right by 18 bits to drop the logical part and get the UTC time in milliseconds.
|||
|
|||
For example in Golang: |
|||
|
|||
```go |
|||
const ( |
|||
logicalBits = 18 |
|||
logicalBitsMask = (1 << logicalBits) - 1 |
|||
) |
|||
|
|||
// ParseTS parses the ts to (physical,logical). |
|||
func ParseTS(ts uint64) (time.Time, uint64) { |
|||
logical := ts & logicalBitsMask |
|||
physical := ts >> logicalBits |
|||
physicalTime := time.Unix(int64(physical/1000), int64(physical)%1000*time.Millisecond.Nanoseconds()) |
|||
return physicalTime, logical |
|||
} |
|||
``` |
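
Going the other way, a timestamp can be composed from a UTC time in milliseconds and a logical counter. This is a sketch assuming the same 46/18-bit layout and reusing the constants from the Go snippet above; `ComposeTS` is illustrative and not part of any API shown in this chapter.

```go
// ComposeTS is the inverse of ParseTS: it packs a physical time (UTC in
// milliseconds) and a logical counter back into a hybrid timestamp.
func ComposeTS(physicalMs, logical int64) uint64 {
	return uint64(physicalMs)<<logicalBits | uint64(logical)&logicalBitsMask
}
```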
|||
|
|||
In Python: |
|||
|
|||
```python |
|||
>>> import datetime |
|||
>>> LOGICAL_BITS = 18 |
|||
>>> LOGICAL_BITS_MASK = (1 << LOGICAL_BITS) - 1 |
|||
>>> def parse_ts(ts): |
|||
... logical = ts & LOGICAL_BITS_MASK |
|||
... physical = ts >> LOGICAL_BITS |
|||
... return physical, logical |
|||
... |
|||
>>> ts = 429164525386203142 |
|||
>>> utc_ts_in_milliseconds, _ = parse_ts(ts) |
|||
>>> d = datetime.datetime.fromtimestamp(utc_ts_in_milliseconds / 1000.0) |
|||
>>> d.strftime('%Y-%m-%d %H:%M:%S') |
|||
'2021-11-17 15:05:41' |
|||
>>> |
|||
``` |
@ -0,0 +1,132 @@ |
|||
# Timesync -- All The things you should know |
|||
|
|||
`Time Synchronization` is the kernel part of Milvus 2.0; it affects all components of the system. This document describes the detailed design of `Time Synchronization`. |
|||
|
|||
There are 2 kinds of events in Milvus 2.0: |
|||
|
|||
- DDL events |
|||
- create collection |
|||
- drop collection |
|||
- create partition |
|||
- drop partition |
|||
- DML events |
|||
- insert |
|||
- search |
|||
- etc |
|||
|
|||
All events have a `Timestamp` to indicate when this event occurs. |
|||
|
|||
Suppose there are two users, `u1` and `u2`. They connect to Milvus and do the following operations at the respective timestamps. |
|||
|
|||
| ts | u1 | u2 | |
|||
| --- | -------------------- | ------------ | |
|||
| t0 | create Collection C0 | - | |
|||
| t2 | - | search on C0 | |
|||
| t5 | insert A1 into C0 | - | |
|||
| t7 | - | search on C0 | |
|||
| t10 | insert A2 | - | |
|||
| t12 | - | search on C0 | |
|||
| t15 | delete A1 from C0 | - | |
|||
| t17 | - | search on C0 | |
|||
|
|||
Ideally, `u2` expects `C0` to be empty at `t2`, to see only `A1` at `t7`, to see both `A1` and `A2` at `t12`, and to see only `A2` at `t17`.
|||
|
|||
It's easy to achieve this in a `single-node` database. But for a `Distributed System`, such as `Milvus`, it's a little difficult; the following problems need to be solved: |
|||
|
|||
1. `u1` and `u2` may be on different nodes whose clocks are not synchronized. To give an extreme example, suppose the clock of `u2` is 24 hours behind that of `u1`; then all the operations of `u1` are invisible to `u2` until the next day.
|||
2. Network latency. If `u2` starts the `Search on C0` at `t17`, then how can it be guaranteed that all the `events` before `t17` have been processed? If the events of `delete A1 from C0` have been delayed due to the network latency, then it would lead to an incorrect state: `u2` would see both `A1` and `A2` at `t17`. |
|||
|
|||
`Time synchronization system` is used to solve the above problems. |
|||
|
|||
## Timestamp Oracle(TSO) |
|||
|
|||
Like [TiKV](https://github.com/tikv/tikv), Milvus 2.0 provides a `TSO` service. All events must allocate timestamps from `TSO` rather than from the local clock, which solves the first problem.

`TSO` is provided by the `RootCoord` component. Clients can allocate one or more timestamps in a single request; the `proto` is defined as follows.
|||
|
|||
```proto |
|||
service RootCoord { |
|||
... |
|||
rpc AllocTimestamp(AllocTimestampRequest) returns (AllocTimestampResponse) {} |
|||
... |
|||
} |
|||
|
|||
message AllocTimestampRequest { |
|||
common.MsgBase base = 1; |
|||
uint32 count = 3; |
|||
} |
|||
|
|||
message AllocTimestampResponse { |
|||
common.Status status = 1; |
|||
uint64 timestamp = 2; |
|||
uint32 count = 3; |
|||
} |
|||
``` |
|||
|
|||
`Timestamp` is of type `uint64`, and contains physical and logical parts. |
|||
|
|||
This is the format of `Timestamp` |
|||
|
|||
 |
|||
|
|||
In an `AllocTimestamp` request, if `AllocTimestampRequest.count` is greater than `1`, `AllocTimestampResponse.timestamp` indicates the first available timestamp in the response. |
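
A small sketch of how a caller might expand such a response into individual timestamps, assuming the allocated timestamps form a consecutive range starting at `AllocTimestampResponse.timestamp`. The struct mirrors the message above and is for illustration only.

```go
package main

import "fmt"

// allocTimestampResponse mirrors AllocTimestampResponse above.
type allocTimestampResponse struct {
	Timestamp uint64 // first allocated timestamp
	Count     uint32
}

// expand lists all timestamps granted by one AllocTimestamp call, assuming
// the allocation is a consecutive range starting at Timestamp.
func expand(resp allocTimestampResponse) []uint64 {
	out := make([]uint64, resp.Count)
	for i := uint32(0); i < resp.Count; i++ {
		out[i] = resp.Timestamp + uint64(i)
	}
	return out
}

func main() {
	fmt.Println(expand(allocTimestampResponse{Timestamp: 429164525386203142, Count: 3}))
}
```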
|||
|
|||
## Time Synchronization |
|||
|
|||
To better understand `Time Synchronization`, let's introduce the data operation of Milvus 2.0 briefly. |
|||
Take `Insert Operation` as an example. |
|||
|
|||
- In `Milvus 2.0`, users can deploy multiple `Proxy` instances to achieve load balancing
- Users can use the `SDK` to connect to any `Proxy`
- When a `Proxy` receives an `Insert` request from the `SDK`, it splits the `InsertMsg`s into different `MsgStream`s according to the hash value of the `Primary Key`
- Each `InsertMsg` is assigned a `Timestamp` before being sent to the `MsgStream`
|||
|
|||
>*Note: `MsgStream` is the wrapper of message queue, the default message queue in `Milvus 2.0` is `pulsar`* |
|||
|
|||
 |
|||
|
|||
Based on the above information, we know that the `MsgStream` has the following characteristics: |
|||
|
|||
- Within a `MsgStream`, the `InsertMsg`s from the same `Proxy` have monotonically increasing timestamps
- Within a `MsgStream`, the `InsertMsg`s from different `Proxy`s have no timestamp relationship to each other
|||
|
|||
The following figure shows an example of `InsertMsg`s in a `MsgStream`. The snippet contains 5 `InsertMsg`s, 3 of them from `Proxy1` and the others from `Proxy2`.

The 3 `InsertMsg`s from `Proxy1` have increasing timestamps, and so do the 2 `InsertMsg`s from `Proxy2`, but there is no timestamp relationship between `Proxy1` and `Proxy2`.
|||
|
|||
 |
|||
|
|||
So the second problem has turned into this: after reading a message from `MsgStream`, how can we make sure that all the messages with smaller timestamps have been consumed?

For example, the consumer may read a message with timestamp `110` produced by `Proxy2` while a message with timestamp `80` produced by `Proxy1` is still in the `MsgStream`. How should this situation be handled?
|||
|
|||
The following graph shows the core logic of `Time Synchronization System` in `Milvus 2.0`; it should solve the second problem. |
|||
|
|||
- Each `Proxy` will periodically report its latest timestamp of every `MsgStream` to `RootCoord`; the default interval is `200ms` |
|||
- For each `MsgStream`, `RootCoord` finds the minimum of the timestamps reported by all `Proxy`s on this `MsgStream`, and inserts this minimum timestamp into the `MsgStream`
|||
- When the consumer reads the timestamp inserted by the `RootCoord` on the `MsgStream`, it indicates that all messages with smaller timestamp have been consumed, so all actions that depend on this timestamp can be executed safely |
|||
- The message inserted by `RootCoord` into `MsgStream` is of type `TimeTick` |
|||
|
|||
 |
|||
|
|||
This is the `Proto` that is used by `Proxy` to report timestamp to `RootCoord`: |
|||
|
|||
```proto |
|||
service RootCoord { |
|||
... |
|||
rpc UpdateChannelTimeTick(internal.ChannelTimeTickMsg) returns (common.Status) {} |
|||
... |
|||
} |
|||
|
|||
message ChannelTimeTickMsg { |
|||
common.MsgBase base = 1; |
|||
repeated string channelNames = 2; |
|||
repeated uint64 timestamps = 3; |
|||
uint64 default_timestamp = 4; |
|||
} |
|||
``` |
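
A self-contained sketch of the RootCoord-side rule described above: track the latest timestamp each `Proxy` reports per channel, and the `TimeTick` that may be inserted into the channel is the minimum across proxies. Names are illustrative.

```go
package main

import "fmt"

// channelTimeTick tracks, per channel, the latest timestamp reported by each proxy.
type channelTimeTick struct {
	reported map[string]map[string]uint64 // channel -> proxyID -> last reported ts
}

// report records one ChannelTimeTickMsg entry from a proxy.
func (c *channelTimeTick) report(proxyID, channel string, ts uint64) {
	if c.reported[channel] == nil {
		c.reported[channel] = map[string]uint64{}
	}
	c.reported[channel][proxyID] = ts
}

// timeTick returns the timestamp RootCoord may safely insert into the channel:
// the minimum of the latest timestamps reported by all proxies on that channel.
func (c *channelTimeTick) timeTick(channel string) uint64 {
	var min uint64
	first := true
	for _, ts := range c.reported[channel] {
		if first || ts < min {
			min, first = ts, false
		}
	}
	return min
}

func main() {
	c := &channelTimeTick{reported: map[string]map[string]uint64{}}
	c.report("proxy1", "ch-0", 80)
	c.report("proxy2", "ch-0", 110)
	fmt.Println(c.timeTick("ch-0")) // 80: all messages below 80 have been consumed
}
```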
|||
|
|||
After inserting `Timetick`, the `Msgstream` should look like this: |
|||
 |
|||
|
|||
`MsgStream` will process the messages in batches according to `TimeTick`, and ensure that the output messages meet the requirements of timestamp. For more details, please refer to the `MsgStream` design details. |
@ -0,0 +1,140 @@ |
|||
# Create Collection |
|||
|
|||
`Milvus 2.0` uses `Collection` to represent a set of data, like `Table` in a traditional database. User can create or drop `Collection`. |
|||
This article introduces the execution path of `CreateCollection`, at the end of this article, you should know which components are involved in `CreateCollection`. |
|||
|
|||
The execution flow of `CreateCollection` is shown in the following figure: |
|||
|
|||
 |
|||
|
|||
1. Firstly, `SDK` starts a `CreateCollection` request to `Proxy` via `Grpc`, the `proto` is defined as follows: |
|||
|
|||
```proto |
|||
service MilvusService { |
|||
... |
|||
|
|||
rpc CreateCollection(CreateCollectionRequest) returns (common.Status) {} |
|||
|
|||
... |
|||
} |
|||
|
|||
message CreateCollectionRequest { |
|||
// Not useful for now |
|||
common.MsgBase base = 1; |
|||
// Not useful for now |
|||
string db_name = 2; |
|||
// The unique collection name in milvus.(Required) |
|||
string collection_name = 3; |
|||
// The serialized `schema.CollectionSchema`(Required) |
|||
bytes schema = 4; |
|||
// Once set, no modification is allowed (Optional) |
|||
// https://github.com/milvus-io/milvus/issues/6690 |
|||
int32 shards_num = 5; |
|||
} |
|||
|
|||
message CollectionSchema { |
|||
string name = 1; |
|||
string description = 2; |
|||
bool autoID = 3; // deprecated later, keep compatible with c++ part now |
|||
repeated FieldSchema fields = 4; |
|||
} |
|||
|
|||
``` |
|||
|
|||
2. When receiving the `CreateCollection` request, `Proxy` would wrap this request into a `CreateCollectionTask`, and push this task into the `DdTaskQueue`. After that, `Proxy` would call the `WaitToFinish` method to wait until the task is finished.
|||
|
|||
```go |
|||
type task interface { |
|||
TraceCtx() context.Context |
|||
ID() UniqueID // return ReqID |
|||
SetID(uid UniqueID) // set ReqID |
|||
Name() string |
|||
Type() commonpb.MsgType |
|||
BeginTs() Timestamp |
|||
EndTs() Timestamp |
|||
SetTs(ts Timestamp) |
|||
OnEnqueue() error |
|||
PreExecute(ctx context.Context) error |
|||
Execute(ctx context.Context) error |
|||
PostExecute(ctx context.Context) error |
|||
WaitToFinish() error |
|||
Notify(err error) |
|||
} |
|||
|
|||
type createCollectionTask struct { |
|||
Condition |
|||
*milvuspb.CreateCollectionRequest |
|||
ctx context.Context |
|||
rootCoord types.RootCoord |
|||
result *commonpb.Status |
|||
schema *schemapb.CollectionSchema |
|||
} |
|||
``` |
|||
|
|||
3. There is a background service in `Proxy`, this service would get the `CreateCollectionTask` from `DdTaskQueue`, and execute it in three phases. |
|||
|
|||
- `PreExecute`, do some static checking at this phase, such as check if `Collection Name` and `Field Name` are legal, if there are duplicate columns, etc. |
|||
- `Execute`, at this phase, `Proxy` would send `CreateCollection` request to `RootCoord` via `Grpc`, and wait for response, the `proto` is defined as follows: |
|||
|
|||
```proto |
|||
service RootCoord { |
|||
... |
|||
|
|||
rpc CreateCollection(milvus.CreateCollectionRequest) returns (common.Status){} |
|||
|
|||
... |
|||
} |
|||
``` |
|||
|
|||
- `PostExecute`, `CreateCollectionTask` does nothing at this phase, and return directly. |
|||
|
|||
4. `RootCoord` would wrap the `CreateCollection` request into `CreateCollectionReqTask`, and then call function `executeTask`. `executeTask` would not return until the `context` is done or `CreateCollectionReqTask.Execute` returns.
|||
|
|||
```go |
|||
type reqTask interface { |
|||
Ctx() context.Context |
|||
Type() commonpb.MsgType |
|||
Execute(ctx context.Context) error |
|||
Core() *Core |
|||
} |
|||
|
|||
type CreateCollectionReqTask struct { |
|||
baseReqTask |
|||
Req *milvuspb.CreateCollectionRequest |
|||
} |
|||
``` |
|||
|
|||
5. `CreateCollectionReqTask.Execute` would allocate a `CollectionID` and a default `PartitionID`, set the `Virtual Channel`s and `Physical Channel`s used by `MsgStream`, and then write the `Collection`'s meta into `metaTable`

6. After the `Collection`'s meta has been written into `metaTable`, `Milvus` considers this collection to have been created successfully.
|||
|
|||
7. `RootCoord` would alloc a timestamp from `TSO` before writing `Collection`'s meta into `metaTable`, and this timestamp is considered as the point when the collection was created |
|||
|
|||
8. Finally, `RootCoord` will send a `CreateCollectionRequest` message into `MsgStream`, and other components that have subscribed to the `MsgStream` will be notified. The `Proto` of `CreateCollectionRequest` is defined as follows:
|||
|
|||
```proto |
|||
message CreateCollectionRequest { |
|||
common.MsgBase base = 1; |
|||
string db_name = 2; |
|||
string collectionName = 3; |
|||
string partitionName = 4; |
|||
int64 dbID = 5; |
|||
int64 collectionID = 6; |
|||
int64 partitionID = 7; |
|||
// `schema` is the serialized `schema.CollectionSchema` |
|||
bytes schema = 8; |
|||
repeated string virtualChannelNames = 9; |
|||
repeated string physicalChannelNames = 10; |
|||
} |
|||
|
|||
``` |
|||
|
|||
9. After the above operations, `RootCoord` would update the internal timestamp and return, so `Proxy` would get the response. |
|||
|
|||
_Notes:_ |
|||
|
|||
1. In `Proxy`, all `DDL` requests are wrapped into `task`s and pushed into `DdTaskQueue`.
   A background service reads a new `task` from `DdTaskQueue` only when the previous one is finished,
   so all `DDL` requests are executed serially on `Proxy`.
|||
|
|||
2. In `RootCoord`, all `DDL` requests will be wrapped into `reqTask`, but there is no task queue, so the `DDL` requests will be executed in parallel on `RootCoord`. |
@ -0,0 +1,105 @@ |
|||
# Support to retrieve the specified entity from a collection |
|||
|
|||
## Background |
|||
|
|||
In Milvus, a collection has multiple fields, which fall into two kinds: vector fields and scalar fields. We call a row an entity; one entity encapsulates multiple vector and scalar values.
|||
|
|||
When creating a collection, you can choose between an auto-generated primary key and a user-provided primary key.
If a user chooses the user-provided primary key, each inserted entity must contain the primary key field; otherwise, the insertion fails.
The primary keys are returned after the insertion request succeeds.
|||
|
|||
Milvus currently only supports primary keys of the int64 type. |
|||
|
|||
QueryNode subscribes to the insert channel and will determine whether to use the data extracted from the insert channel or data processed by DataNode to provide services according to the status of a segment. |
|||
|
|||
## Goals |
|||
|
|||
- Support to retrieve one or more entities from a collection through primary keys |
|||
- Support to retrieve only some fields of an entity |
|||
- Consider backward file format compatibility if a new file is defined |
|||
|
|||
## Non-Goals |
|||
|
|||
- How to deal with duplicate primary keys |
|||
- How to retrieve entity by non-primary key |
|||
|
|||
## Detailed design |
|||
|
|||
When DataNode processes an inserted entity, it updates the bloomfilter of the Segment to which the entity belongs. If that bloomfilter does not exist yet, DataNode creates one in memory and then updates it.
|||
|
|||
Once DataNode receives a Flush command from DataCoord, it sorts the data in the segment in ascending order of primary key, records the maximum and minimum values of the primary key, and writes the segment, the statistics, and the bloomfilter to the storage system.
|||
|
|||
- Key of binlog file: `${tenant}/insert_log/${collection_id}/${partition_id}/${segment_id}/${field_id}/_${log_idx}` |
|||
- Key of statistics file: `${tenant}/insert_log/${collection_id}/${partition_id}/${segment_id}/${field_id}/stats_${log_idx}` |
|||
- Key of bloom filter file: `${tenant}/insert_log/${collection_id}/${partition_id}/${segment_id}/${field_id}/bf_${log_idx}` |
|||
|
|||
QueryNode maintains a mapping from primary key to entities in each segment. This mapping updates every time an insert request is processed. |
|||
|
|||
After receiving the Get request from the client, the Proxy sends the request to the `search` channel and waits for the result returned from the `searchResult` channel. |
|||
|
|||
The processing flow after QueryNode reads the Get request from the `search` channel is as follows (a Go sketch follows the list):
|||
|
|||
1. Look up the requested primary key in the in-memory mappings of all `Growing` segments;
2. If the primary key is found in any `Growing` segment, return the result directly;
|||
3. Load statistics and bloomfilter of all `Sealed` segments; |
|||
4. Convert the statistics into an inverted index from Range to SegmentID for each `Sealed` segment; |
|||
5. Check whether the requested primary key exists in any inverted index of `Sealed` segment, return empty if not found; |
|||
6. [optional] Use the bloomfilter to filter out segments where the primary key does not exist; |
|||
7. Use binary search to find the specified entity in each segment where the primary key may exist; |
|||
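The following Go sketch mirrors the seven steps above. The `Entity`, `bloomFilter` and `sealedSegment` types and the `load` callback are hypothetical placeholders for illustration; the real QueryNode code is organized differently.

```go
package querynode

import "sort"

// Placeholder types used only for this sketch.
type Entity struct{ PK int64 }

type bloomFilter interface{ Test(pk int64) bool }

type sealedSegment struct {
	id     int64
	minPK  int64       // statistics file: min of the primary key
	maxPK  int64       // statistics file: max of the primary key
	bloom  bloomFilter // bloom filter file
	sorted []int64     // primary keys persisted in ascending order
}

// getByPK follows steps 1-7: growing segments first, then prune sealed segments
// by PK range and bloom filter, then binary search inside each remaining candidate.
func getByPK(pk int64, growing []map[int64]Entity, sealed []sealedSegment,
	load func(segmentID, pk int64) Entity) (Entity, bool) {
	for _, m := range growing { // steps 1-2: in-memory mappings of growing segments
		if e, ok := m[pk]; ok {
			return e, true
		}
	}
	for _, seg := range sealed { // steps 3-6: prune sealed segments
		if pk < seg.minPK || pk > seg.maxPK {
			continue // outside the [min, max] range recorded in the stats file
		}
		if seg.bloom != nil && !seg.bloom.Test(pk) {
			continue // bloom filter says the key is definitely absent
		}
		// step 7: the segment is sorted by primary key, so binary search works
		i := sort.Search(len(seg.sorted), func(j int) bool { return seg.sorted[j] >= pk })
		if i < len(seg.sorted) && seg.sorted[i] == pk {
			return load(seg.id, pk), true
		}
	}
	return Entity{}, false // empty result, not an error
}
```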
|
|||
### APIs |
|||
|
|||
```go |
|||
// pseudo-code |
|||
func get(collectionName string,
	ids []string,
	outputFields []string,
	partitionNames []string) ([]Entity, error)
// Example
// entities, err := get("collection1", []string{"103"}, []string{"_id", "age"}, nil)
|||
``` |
|||
|
|||
When the primary key does not exist in the specified collection (and partitions), Milvus returns an empty result, which is not considered an error.
|||
|
|||
### Storage |
|||
|
|||
Both bloomfilter files and statistical information files belong to Binlog files and follow the Binlog file format. |
|||
|
|||
https://github.com/milvus-io/milvus/blob/master/docs/developer_guides/chap08_binlog.md |
|||
|
|||
Two new types of Binlog are added: BFBinlog and StatsBinlog. |
|||
|
|||
BFBinlog Payload: Refer to https://github.com/milvus-io/milvus/blob/1.1/core/src/segment/SegmentWriter.h for storage methods |
|||
|
|||
StatsBinlog Payload: a JSON-format string that currently contains only the keys `max` and `min`.
|||
|
|||
## Impact |
|||
|
|||
### API |
|||
|
|||
- A new Get API |
|||
- DropCollection / ReleaseCollection / DropPartition / ReleasePartition requests need to clear the corresponding statistics files and bloomfilter files from memory
|||
|
|||
### Storage |
|||
|
|||
- The name of the binlog file has been changed from `${log_idx}` to `_${log_idx}` |
|||
- Each binlog adds a stats file |
|||
- Each binlog adds a bloomfilter file |
|||
|
|||
## Test Plan |
|||
|
|||
### Testcase 1 |
|||
|
|||
In a newly created collection, insert an entity with a primary key of 107, call the Get interface to query the entity with a primary key of 107, and verify that every field of the retrieved entity is exactly the same as the inserted entity.
|||
|
|||
### Testcase 2 |
|||
|
|||
In a newly created collection, insert a record with a primary key of 107, call the Get interface to query the record with a primary key of 106, and verify that the retrieved result is empty.
|||
|
|||
### Testcase 3 |
|||
|
|||
In a newly created collection, insert records with primary keys of 105, 106 and 107, call the Get interface to query the records with primary keys of 101, 102, 103, 104, 105, 106 and 107, and verify that the retrieved result contains only the records with primary keys of 105, 106 and 107.
|||
|
|||
### Testcase 4 |
|||
|
|||
In a newly created collection, insert a record with a primary key of 107, call the Flush interface, and check whether the stats and bloomfilter files exist on MinIO.
@ -0,0 +1,90 @@ |
|||
# What's Knowhere |
|||
|
|||
## Concepts |
|||
|
|||
Vector index is a time-efficient and space-efficient data structure built on vectors through a certain mathematical model. Through the vector index, we can efficiently query several vectors similar to the target vector. |
|||
Since exact retrieval is usually very time-consuming, most of Milvus' vector index types use ANNS (Approximate Nearest Neighbors Search). Compared with exact retrieval, the core idea of ANNS is no longer to return the single most accurate result, but to search only for neighbors of the target. ANNS improves retrieval efficiency by sacrificing accuracy within an acceptable range.
|||
|
|||
## What can Knowhere do |
|||
|
|||
Knowhere is the vector search execution engine of Milvus. It encapsulates many popular vector index algorithm libraries, such as faiss, hnswlib, NGT, annoy, and provides a set of unified interfaces. In addition, Knowhere also supports heterogeneous computing. |
|||
|
|||
## Framework |
|||
|
|||
 |
|||
|
|||
For more index types and heterogeneous support, please refer to the vector index document. |
|||
|
|||
## Major Interface |
|||
|
|||
```C++ |
|||
/* |
|||
* Serialize |
|||
* @return: serialization data |
|||
*/ |
|||
BinarySet |
|||
Serialize(); |
|||
|
|||
/* |
|||
* Load from serialization data |
|||
* @param [in] dataset_ptr: serialization data |
|||
*/ |
|||
void |
|||
Load(const BinarySet&); |
|||
|
|||
/* |
|||
* Create index |
|||
* @param [in] dataset_ptr: index data (key of the Dataset is "tensor", "rows" and "dim") |
|||
* @param [in] config: index param
|||
*/ |
|||
void |
|||
BuildAll(const DatasetPtr& dataset_ptr, const Config& config); |
|||
|
|||
/* |
|||
* KNN (K-Nearest Neighbors) Query |
|||
* @param [in] dataset_ptr: query data (key of the Dataset is "tensor" and "rows") |
|||
* @param [in] config: query param
* @param [out] blacklist: mark for deletion
|||
* @return: query result (key of the Dataset is "ids" and "distance") |
|||
*/ |
|||
DatasetPtr |
|||
Query(const DatasetPtr& dataset_ptr, const Config& config, BitsetView blacklist); |
|||
|
|||
/* |
|||
* Copy the index from GPU to CPU |
|||
* @return: CPU vector index |
|||
* @notes: Only valid for GPU indexes
|||
*/ |
|||
VecIndexPtr |
|||
CopyGpuToCpu(); |
|||
|
|||
/* |
|||
* If the user IDs have been set, they will be returned by the Query interface;
* else the range of the returned IDs is [0, row_num-1].
* @param [in] uids: user ids
|||
*/ |
|||
void |
|||
SetUids(std::shared_ptr<std::vector<IDType>> uids); |
|||
|
|||
/* |
|||
* Get the size of the index in memory. |
|||
* @return: index memory size |
|||
*/ |
|||
int64_t |
|||
Size(); |
|||
``` |
|||
|
|||
## Data Format |
|||
|
|||
The vector data used for indexing and queries is stored as a one-dimensional array.
The first `dim * sizeof(data_type)` bytes of the array hold the first vector; the remaining `row_num - 1` vectors follow back to back.
|||
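As a small illustration of this layout (in Go rather than Knowhere's C++), vector `i` of a flattened float32 blob can be addressed as follows:

```go
// One-dimensional layout: row_num vectors of dimension dim stored back to back,
// so len(data) == row_num*dim and vector i occupies data[i*dim : (i+1)*dim].
func vectorAt(data []float32, dim, i int) []float32 {
	return data[i*dim : (i+1)*dim]
}
```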
|
|||
## Sequence |
|||
|
|||
### Create index |
|||
|
|||
 |
|||
|
|||
### Query |
|||
|
|||
 |
@ -0,0 +1,89 @@ |
|||
# DropCollection release resources |
|||
|
|||
## Before this enhancement |
|||
|
|||
**When dropping a collection** |
|||
|
|||
1. DataNode releases the flowgraph of this collection and drops all the data in a buffer. |
|||
2. DataCoord has no idea whether a collection is dropped or not. |
|||
- DataCoord will make DataNode watch DmChannels of dropped collections. |
|||
- Blob files will never be removed even if the collection is dropped. |
|||
|
|||
**Unused binlogs on blob storage: why do such binlogs exist?**
- A failed flush.
- A failed compaction.
- Binlogs of dropped collections that are beyond the time-travel window.
|||
|
|||
This enhancement focuses on solving these two problems.
|||
|
|||
## Objective 1: DropCollection
|||
|
|||
DataNode initiates Flush&Drop:
receive the drop-collection msg ->
cancel compaction ->
flush all insert buffers and delete buffers ->
release the flowgraph
|||
|
|||
**Plan 1: Picked** |
|||
|
|||
Add a `dropped` flag in `SaveBinlogPathRequest` proto. |
|||
|
|||
DataNode |
|||
- Flush all segments in this vChannel; during Flush&Drop, set the `dropped` flag to true.
- If it fails, retry at most 10 times and then restart.
|||
|
|||
DataCoord |
|||
- DataCoord marks the segmentInfo as `dropped` instead of removing it from etcd.
- On recovery, check whether all segments in the vchannel are dropped:
  - if not, recover to the state before the drop.
  - if so, there is no need to recover the vchannel.
|||
|
|||
Pros:
1. The easiest approach in both DataNode and DataCoord.
2. DataNode can reuse the current flush manager procedure.
Cons:
1. The number of RPC calls equals the number of segments in a collection, which is expensive.
|||
|
|||
--- |
|||
|
|||
**Plan 2: Enhance later** |
|||
|
|||
Add a new RPC, `FlushAndDrop`; it is a vchannel-scoped RPC.
|||
|
|||
Pros:
1. Far fewer RPC calls, equal to the number of shards.
2. A clearer flush procedure in DataNode.
Cons:
1. More effort in DataNode and DataCoord.
|||
|
|||
```proto
|||
message FlushAndDropRequest { |
|||
common.MsgBase base = 1; |
|||
string channelID = 2; |
|||
int64 collectionID = 3; |
|||
repeated SegmentBinlogPaths segment_binlog_paths = 6; |
|||
} |
|||
|
|||
message SegmentBinlogPaths {
  int64 segmentID = 1;
  CheckPoint checkPoint = 2;
  repeated FieldBinlog field2BinlogPaths = 3;
  repeated FieldBinlog field2StatslogPaths = 4;
  repeated DeltaLogInfo deltalogs = 5;
}
|||
``` |
|||
|
|||
--- |
|||
|
|||
## Objective 2: DataCoord Garbage Collection (GC) for unused binlogs
|||
|
|||
### How to clear unknown binlogs? |
|||
DataCoord runs a background GC goroutine that triggers once a day (see the sketch below):
1. Get all MinIO/S3 paths (keys).
2. Select the keys that are not referenced by any segmentInfo.
3. According to the blob metadata from MinIO/S3, remove such binlogs that have existed for more than 1 day.
   - **Why 1 day:** there may be newly uploaded binlogs from an in-flight flush or compaction.
|||
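A minimal sketch of one GC pass, assuming hypothetical `objectStorage` and `segmentMeta` interfaces; the real DataCoord garbage collector differs in detail:

```go
package datacoord

import "time"

// Hypothetical interfaces, for illustration only.
type objectStorage interface {
	ListKeys() ([]string, error)
	ModTime(key string) (time.Time, error)
	Remove(key string) error
}

type segmentMeta interface {
	ContainsBinlog(key string) bool
}

// gcOnce is one pass of the daily GC described above.
func gcOnce(store objectStorage, meta segmentMeta, now time.Time) error {
	keys, err := store.ListKeys() // step 1: all MinIO/S3 keys
	if err != nil {
		return err
	}
	for _, key := range keys {
		if meta.ContainsBinlog(key) { // step 2: skip keys referenced by segmentInfo
			continue
		}
		mod, err := store.ModTime(key) // step 3: how old is this orphan blob?
		if err != nil {
			continue
		}
		// Keep orphans younger than one day: they may belong to an in-flight
		// flush or compaction whose meta has not been saved yet.
		if now.Sub(mod) > 24*time.Hour {
			_ = store.Remove(key)
		}
	}
	return nil
}
```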
|
|||
### How to clear dropped-collection's binlogs? |
|||
- DataCoord checks all dropped segments and removes their recorded binlogs once they have been dropped for more than 1 day.
- DataCoord keeps the segmentInfo meta in etcd.
@ -0,0 +1,266 @@ |
|||
# Create Index |
|||
|
|||
The `Index system` is a core part of `Milvus` that is used to speed up searches. This document introduces which components are involved in `Create Index` and what these components do.
|||
|
|||
The execution flow of `Create Index` is shown in the following figure: |
|||
|
|||
 |
|||
|
|||
1. Firstly, the `SDK` sends a `CreateIndex` request to `Proxy` via `Grpc`; the `proto` is defined as follows:
|||
|
|||
```proto |
|||
service MilvusService { |
|||
... |
|||
|
|||
rpc CreateIndex(CreateIndexRequest) returns (common.Status) {} |
|||
|
|||
... |
|||
} |
|||
|
|||
message CreateIndexRequest { |
|||
common.MsgBase base = 1; |
|||
string db_name = 2; |
|||
string collection_name = 3; |
|||
string field_name = 4; |
|||
int64 dbID = 5; |
|||
int64 collectionID = 6; |
|||
int64 fieldID = 7; |
|||
repeated common.KeyValuePair extra_params = 8; |
|||
} |
|||
``` |
|||
|
|||
2. When receiving the `CreateIndex` request, `Proxy` would wrap it into a `CreateIndexTask` and push the task into the `DdTaskQueue`. After that, `Proxy` would call the `WaitToFinish` method to wait until the task is finished.
|||
|
|||
```go |
|||
type task interface { |
|||
TraceCtx() context.Context |
|||
ID() UniqueID // return ReqID |
|||
SetID(uid UniqueID) // set ReqID |
|||
Name() string |
|||
Type() commonpb.MsgType |
|||
BeginTs() Timestamp |
|||
EndTs() Timestamp |
|||
SetTs(ts Timestamp) |
|||
OnEnqueue() error |
|||
PreExecute(ctx context.Context) error |
|||
Execute(ctx context.Context) error |
|||
PostExecute(ctx context.Context) error |
|||
WaitToFinish() error |
|||
Notify(err error) |
|||
} |
|||
|
|||
type createIndexTask struct { |
|||
Condition |
|||
*milvuspb.CreateIndexRequest |
|||
ctx context.Context |
|||
rootCoord types.RootCoord |
|||
result *commonpb.Status |
|||
} |
|||
``` |
|||
|
|||
3. There is a background service in `Proxy` that takes the `CreateIndexTask` from `DdTaskQueue` and executes it in three phases.
|||
|
|||
- `PreExecute`: do some static checking at this phase, such as checking whether the index parameters are legal, etc.
- `Execute`: at this phase, `Proxy` would send the `CreateIndex` request to `RootCoord` via `Grpc` and wait for the response; the `proto` is defined as follows:
|||
|
|||
```proto |
|||
service RootCoord { |
|||
... |
|||
|
|||
rpc CreateIndex(milvus.CreateIndexRequest) returns (common.Status) {} |
|||
|
|||
... |
|||
} |
|||
``` |
|||
|
|||
- `PostExecute`: `CreateIndexTask` does nothing at this phase and returns directly.
|||
|
|||
4. `RootCoord` would wrap the `CreateIndex` request into a `CreateIndexReqTask` and then call the function `executeTask`. `executeTask` would not return until the `context` is done or `CreateIndexReqTask.Execute` returns.
|||
|
|||
```go |
|||
type reqTask interface { |
|||
Ctx() context.Context |
|||
Type() commonpb.MsgType |
|||
Execute(ctx context.Context) error |
|||
Core() *Core |
|||
} |
|||
|
|||
type CreateIndexReqTask struct { |
|||
baseReqTask |
|||
Req *milvuspb.CreateIndexRequest |
|||
} |
|||
``` |
|||
|
|||
5. According to the index type and index parameters, `RootCoord` lists all the `Segments` that need to be indexed on this `Collection`. `RootCoord` would only check those `Segments` which have been flushed at this stage. We will describe how to deal with those newly added segments and growing segments later. |
|||
|
|||
6. For each `Segment`, `RootCoord` would send a `Grpc` request to `DataCoord` to get the `Binlog` paths of that `Segment`; the `proto` is defined as follows:
|||
|
|||
```proto |
|||
service DataCoord { |
|||
... |
|||
|
|||
rpc GetInsertBinlogPaths(GetInsertBinlogPathsRequest) returns (GetInsertBinlogPathsResponse) {} |
|||
|
|||
... |
|||
|
|||
} |
|||
|
|||
message GetInsertBinlogPathsRequest { |
|||
common.MsgBase base = 1; |
|||
int64 segmentID = 2; |
|||
} |
|||
|
|||
message GetInsertBinlogPathsResponse { |
|||
repeated int64 fieldIDs = 1; |
|||
repeated internal.StringList paths = 2; |
|||
common.Status status = 3; |
|||
} |
|||
|
|||
``` |
|||
|
|||
7. After getting the `Segment`'s `Binlog` paths, `RootCoord` would send a `Grpc` request to `IndexCoord`, asking `IndexCoord` to build an index on this `Segment`; the `proto` is defined as follows:
|||
|
|||
```proto |
|||
service IndexCoord { |
|||
... |
|||
|
|||
rpc BuildIndex(BuildIndexRequest) returns (BuildIndexResponse){} |
|||
|
|||
... |
|||
} |
|||
|
|||
message BuildIndexRequest { |
|||
int64 indexBuildID = 1; |
|||
string index_name = 2; |
|||
int64 indexID = 3; |
|||
repeated string data_paths = 5; |
|||
repeated common.KeyValuePair type_params = 6; |
|||
repeated common.KeyValuePair index_params = 7; |
|||
} |
|||
|
|||
message BuildIndexResponse { |
|||
common.Status status = 1; |
|||
int64 indexBuildID = 2; |
|||
} |
|||
|
|||
``` |
|||
|
|||
8. The execution flow of `BuildIndex` on `IndexCoord` is shown in the following figure:
|||
|
|||
 |
|||
|
|||
9. `IndexCoord` would wrap the `BuildIndex` request into an `IndexAddTask`, then allocate a globally unique ID as the `IndexBuildID`, and write this `Segment`'s `index meta` into `IndexCoord`'s `metaTable`. After finishing these operations, `IndexCoord` would send a response to `RootCoord` that includes the `IndexBuildID`.
|||
|
|||
10. When `RootCoord` receives the `BuildIndexResponse`, it would extract the `IndexBuildID` from the response, update `RootCoord`'s `metaTable`, and then send a response to `Proxy`.
|||
|
|||
11. There is a background service, `assignTaskLoop`, in `IndexCoord`. `assignTaskLoop` would call `GetUnassignedTask` periodically; the default interval is 3s. `GetUnassignedTask` would list the segments whose `index meta` has been updated but whose index has not been created yet.

12. For each of the segments listed in the previous step, `IndexCoord` would call `PeekClient` to get an available `IndexNode` and send a `CreateIndex` request to that `IndexNode` (a sketch of this loop follows the proto below). The `proto` is defined as follows.
|||
|
|||
```proto |
|||
service IndexNode { |
|||
... |
|||
|
|||
rpc CreateIndex(CreateIndexRequest) returns (common.Status){} |
|||
|
|||
... |
|||
} |
|||
|
|||
message CreateIndexRequest { |
|||
int64 indexBuildID = 1; |
|||
string index_name = 2; |
|||
int64 indexID = 3; |
|||
int64 version = 4; |
|||
string meta_path = 5; |
|||
repeated string data_paths = 6; |
|||
repeated common.KeyValuePair type_params = 7; |
|||
repeated common.KeyValuePair index_params = 8; |
|||
} |
|||
``` |
|||
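A minimal sketch of the dispatch loop described in steps 11 and 12. The `unassignedIndex`, `indexNodeClient` and `indexCoord` types are illustrative stand-ins for the real meta table, node manager and gRPC client:

```go
package indexcoord

import (
	"context"
	"time"
)

// Hypothetical views of the meta table and the IndexNode client.
type unassignedIndex struct {
	IndexBuildID int64
	IndexName    string
	IndexID      int64
	DataPaths    []string
}

type indexNodeClient interface {
	CreateIndex(ctx context.Context, m unassignedIndex) error
}

type indexCoord struct {
	getUnassigned func() []unassignedIndex
	peekClient    func() indexNodeClient
	markAssigned  func(indexBuildID int64)
}

// assignTaskLoop: every 3 seconds, list segments whose index meta exists but
// whose index has not been built, and dispatch them to an available IndexNode.
func (ic *indexCoord) assignTaskLoop(ctx context.Context) {
	ticker := time.NewTicker(3 * time.Second) // default interval mentioned in step 11
	defer ticker.Stop()
	for {
		select {
		case <-ctx.Done():
			return
		case <-ticker.C:
			for _, m := range ic.getUnassigned() {
				node := ic.peekClient() // an available IndexNode, or nil if none is free
				if node == nil {
					break
				}
				if err := node.CreateIndex(ctx, m); err != nil {
					continue // leave the task unassigned and retry on a later tick
				}
				ic.markAssigned(m.IndexBuildID)
			}
		}
	}
}
```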
|
|||
13. When receiving the `CreateIndex` request, `IndexNode` would wrap it into an `IndexBuildTask`, push the task into `IndexBuildQueue`, and then send a response to `IndexCoord`.
|||
|
|||
14. There is a background service, `indexBuildLoop`, in the `IndexNode`. `indexBuildLoop` would call `scheduleIndexBuildTask` to get an `IndexBuildTask` from `IndexBuildQueue`, and then start another `goroutine` to build index and update meta. |
|||
|
|||
_Note_: `IndexNode` will not notify `QueryCoord` to load the index files. If users want to speed up searches with these index files, they should call `ReleaseCollection` first and then call `LoadCollection` to load the index files.
|||
|
|||
15. As mentioned earlier, `RootCoord` only builds indexes on the segments that were already flushed when the `CreateIndex` request arrived; the following figure shows how newly added segments are handled.
|||
|
|||
 |
|||
|
|||
16. When a segment has been flushed, `DataCoord` would notify `RootCoord` via `SegmentFlushCompleted`; the `proto` is defined as follows:
|||
|
|||
```proto |
|||
service RootCoord { |
|||
... |
|||
|
|||
rpc SegmentFlushCompleted(data.SegmentFlushCompletedMsg) returns (common.Status) {} |
|||
|
|||
... |
|||
} |
|||
|
|||
message SegmentFlushCompletedMsg { |
|||
common.MsgBase base = 1; |
|||
SegmentInfo segment = 2; |
|||
} |
|||
|
|||
message SegmentInfo { |
|||
int64 ID = 1; |
|||
int64 collectionID = 2; |
|||
int64 partitionID = 3; |
|||
string insert_channel = 4; |
|||
int64 num_of_rows = 5; |
|||
common.SegmentState state = 6; |
|||
int64 max_row_num = 7; |
|||
uint64 last_expire_time = 8; |
|||
msgpb.MsgPosition start_position = 9; |
|||
msgpb.MsgPosition dml_position = 10; |
|||
repeated FieldBinlog binlogs = 11; |
|||
} |
|||
|
|||
``` |
|||
|
|||
17. If a user has called `CreateIndex` on this `Collection`, then when `RootCoord` receives the `SegmentFlushCompleted` request, it would extract the `SegmentID` from the request, send a `GetInsertBinlogPaths` request to `DataCoord` to get the `Binlog` paths, and finally send a `BuildIndex` request to `IndexCoord` to notify it to build an index on this segment.
|||
|
|||
18. The `Grpc` call of `SegmentFlushCompleted` might fail due to network problems or other issues, so how is the index created if the `Grpc` call fails? The following figure shows the solution.
|||
|
|||
 |
|||
|
|||
19. There is a background service, `checkFlushedSegmentLoop`, in `RootCoord`. `checkFlushedSegmentLoop` would periodically check whether there are segments that need an index but do not have one yet (the default interval is `10 minutes`), and call `DataCoord`'s and `IndexCoord`'s services to create indexes on these segments.
|||
|
|||
20. In `Milvus 2.0`, `Create Index` is an asynchronous operation, so the `SDK` needs to send `GetIndexStates` requests to `IndexCoord` periodically to check whether the index has been created; the `proto` is defined as follows, and a polling sketch follows the proto.
|||
|
|||
```proto |
|||
service IndexCoord { |
|||
... |
|||
|
|||
rpc GetIndexStates(GetIndexStatesRequest) returns (GetIndexStatesResponse) {} |
|||
|
|||
... |
|||
} |
|||
|
|||
message GetIndexStatesRequest { |
|||
repeated int64 indexBuildIDs = 1; |
|||
} |
|||
|
|||
message GetIndexStatesResponse { |
|||
common.Status status = 1; |
|||
repeated IndexInfo states = 2; |
|||
} |
|||
|
|||
message IndexInfo { |
|||
common.IndexState state = 1; |
|||
int64 indexBuildID = 2; |
|||
int64 indexID = 3; |
|||
string index_name = 4; |
|||
string reason = 5; |
|||
} |
|||
|
|||
enum IndexState { |
|||
IndexStateNone = 0; |
|||
Unissued = 1; |
|||
InProgress = 2; |
|||
Finished = 3; |
|||
Failed = 4; |
|||
} |
|||
``` |
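To illustrate the polling described in step 20, here is a hedged client-side sketch; `getIndexStates` is a stand-in for the generated `GetIndexStates` RPC stub, and the state constants mirror the `IndexState` enum above:

```go
package client

import (
	"context"
	"fmt"
	"time"
)

// indexState mirrors the IndexState enum in the proto above.
type indexState int

const (
	indexStateNone indexState = iota
	indexUnissued
	indexInProgress
	indexFinished
	indexFailed
)

// getIndexStates is a stand-in for the generated GetIndexStates RPC stub:
// it returns the state and failure reason for one indexBuildID.
type getIndexStates func(ctx context.Context, indexBuildID int64) (indexState, string, error)

// waitIndexBuilt polls IndexCoord until the index build finishes or fails.
func waitIndexBuilt(ctx context.Context, get getIndexStates, indexBuildID int64) error {
	ticker := time.NewTicker(time.Second)
	defer ticker.Stop()
	for {
		select {
		case <-ctx.Done():
			return ctx.Err()
		case <-ticker.C:
			state, reason, err := get(ctx, indexBuildID)
			if err != nil {
				return err
			}
			switch state {
			case indexFinished:
				return nil
			case indexFailed:
				return fmt.Errorf("index build failed: %s", reason)
			default:
				// None / Unissued / InProgress: keep polling.
			}
		}
	}
}
```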
@ -0,0 +1,507 @@ |
|||
## 6. Proxy |
|||
|
|||
As the user access layer of Milvus, Proxy mainly checks and preprocesses requests from
clients and then forwards these requests to other components, such as Root Coordinator, Data Coordinator, Query
Coordinator, and Index Coordinator. The figure below shows how Proxy interacts with other components.
|||
|
|||
<img src="./graphs/proxy.png" width=700> |
|||
|
|||
Proxy divides requests from clients into three classes: `DdRequest`, `DmRequest`, `DqRequest`. |
|||
|
|||
DdRequest is shorthand for `Data Definition Request`. It refers to operations on the meta information of
collections and covers two parts. One part is the writing operations on collections, such as defining a schema,
creating or dropping a partition, creating or dropping an index, etc. The other part is the reading operations on
collections, such as listing all collections or partitions and checking whether a collection or a partition exists.
|||
|
|||
DmRequest means `Data Manipulation Request`. These requests perform a writing operation on collections, including |
|||
inserting records into a collection, deleting some specific records of a collection. |
|||
|
|||
DqRequest means `Data Query Request`. These requests perform a reading operation on collections, such as searching on a |
|||
collection, querying for specific records of a collection, etc. |
|||
|
|||
For every request, Proxy first checks whether it is valid to be executed by Milvus; if the request is invalid,
Proxy returns the error to the client and does not forward the request to other components. The check
operation of Proxy includes two parts: a static check and a dynamic check. The static check includes
parameter checks, constraint checks, etc. The dynamic check verifies related dependencies of the request; taking
search requests as an example, Proxy should check whether the related collection exists in Milvus.
|||
|
|||
Also, Proxy does some preprocessing for every request. For some requests, Proxy does very little in the
preprocessing stage, while for others it does much more. Every object in Milvus is assigned an `ID`, such as
`CollectionID`, `PartitionID`, `IndexID`, `SegmentID`, etc. Components in Milvus communicate with each other by these
object IDs; however, users only know the object names. So, as the user access layer of Milvus, Proxy translates
the object name into the object ID. Taking a search request as an example again, Proxy translates the `CollectionName` into a
`CollectionID` so that the Query Node can recognize the request. Proxy holds a cache that maps object names to
object IDs and dynamically updates the cache.
|||
|
|||
#### 6.0 Service Discovery based on etcd |
|||
|
|||
As you know, Proxy depends on some other components. So how does Proxy learn the other nodes' information, such as host and port?
In other words, how does Milvus implement service discovery? As a cloud-native vector database, Milvus uses
etcd to provide service registration and service discovery. Every node in Milvus registers its node information (including host,
port, ID, etc.) into etcd after startup. Nodes should specify the etcd prefix and key when registering. Nodes of the
same type share the same prefix in etcd, while nodes of different types have different prefixes. Every key in etcd can be assigned a lease, and you can specify the `time-to-live (ttl)` of the
lease. Milvus uses this mechanism to check whether a node is online. When a node is healthy, it continuously renews its
lease in etcd. Otherwise, if an exception occurs in a node, or a node stops renewing its lease, etcd deletes
the related node information. With this mechanism, nodes in Milvus learn when other nodes go
online or offline and synchronize the latest node information.
|||
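A minimal sketch of this registration pattern with the etcd `clientv3` API; the key name, TTL and node information below are illustrative values, not Milvus' actual configuration:

```go
package main

import (
	"context"
	"encoding/json"
	"log"
	"time"

	clientv3 "go.etcd.io/etcd/client/v3"
)

func main() {
	cli, err := clientv3.New(clientv3.Config{
		Endpoints:   []string{"localhost:2379"},
		DialTimeout: 5 * time.Second,
	})
	if err != nil {
		log.Fatal(err)
	}
	defer cli.Close()

	// Grant a lease with a time-to-live; the key disappears if the lease expires.
	lease, err := cli.Grant(context.Background(), 10)
	if err != nil {
		log.Fatal(err)
	}

	// Register the node under a type-specific prefix, bound to the lease.
	info, _ := json.Marshal(map[string]interface{}{"host": "127.0.0.1", "port": 19530, "id": 1})
	if _, err := cli.Put(context.Background(), "milvus/session/proxy-1", string(info), clientv3.WithLease(lease.ID)); err != nil {
		log.Fatal(err)
	}

	// Keep renewing the lease while the node is healthy; if the node dies,
	// renewal stops and etcd removes the key, so other nodes see it go offline.
	ch, err := cli.KeepAlive(context.Background(), lease.ID)
	if err != nil {
		log.Fatal(err)
	}
	for range ch {
		// Each message is one successful lease renewal.
	}
}
```

The key lives only as long as the lease keeps being renewed, which is exactly the property Milvus relies on for liveness detection.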
|
|||
#### 6.1 Interaction with Root Coordinator |
|||
|
|||
Proxy will forward the DdRequest to Root Coordinator. These requests include: |
|||
|
|||
- CreateCollection |
|||
- DropCollection |
|||
- HasCollection |
|||
- DescribeCollection |
|||
- ShowCollections |
|||
- CreatePartition |
|||
- DropPartition |
|||
- HasPartition |
|||
- ShowPartitions |
|||
- CreateIndex |
|||
- DropIndex |
|||
- DescribeIndex |
|||
- GetIndexBuildProgress |
|||
- GetIndexState |
|||
|
|||
Proxy handles DdRequests sequentially. Only when all earlier requests are done is the next request
handled. Proxy forwards these requests to Root Coordinator, waits until it gets results from Root Coordinator, and then
returns the results or errors to the client.
|||
|
|||
Milvus does not support transactions, but it should guarantee the deterministic execution of every operation. A timestamp |
|||
is tagged on each request. When a request enters Milvus, Proxy tags a timestamp that was assigned by Root Coordinator. |
|||
The component that assigns timestamp in Root Coordinator is called `Timestamp Oracle (TSO)`. TSO ensures that each |
|||
timestamp is globally increasing. |
|||
|
|||
Milvus 2.0 implements the unified Lambda architecture, which integrates the processing of the incremental and historical |
|||
data. Compared with the Kappa architecture, Milvus 2.0 introduces log backfill, which stores log snapshots and indexes |
|||
in the object storage to improve failure recovery efficiency and query performance. To break unbounded (stream) data |
|||
down into bounded windows, Milvus embraces a new watermark mechanism, which slices the stream data into multiple message |
|||
packs according to write time or event time, and maintains a timeline for users to query by time. |
|||
|
|||
To support this watermark mechanism, Proxy should periodically report the timestamp statistics of each physical channel to Root
Coordinator. When Proxy knows that all operations on a specific physical channel were done before a `ts`, Proxy reports
that `ts` so that Root Coordinator can update the timestamp statistics.
|||
|
|||
Proxy holds a cache about meta information of collections. The meta information includes `CollectionID`, `Schema`, |
|||
`PartitionID`, etc. Components in Milvus communicate with each other using `CollectionID` and `PartitionID`, so the |
|||
object name in a request will be translated to object ID in Proxy. When the meta is not hit in the cache, Proxy will update |
|||
the cache from Root Coordinator. At the same time, in order to keep the consistency of cache, when there are any changes |
|||
of meta information in Root Coordinator, it will inform all Proxies to clear the related meta cache, and any newer |
|||
requests will get the latest meta information. |
|||
|
|||
For inserts into a collection whose schema enables auto_id, Proxy assigns a primary key to
every inserted row. For now, the only supported data type for the auto-generated primary field is `int64`. Proxy
applies for a batch of primary keys from Root Coordinator and caches them for local assignment. When the primary keys in the cache
are not enough, Proxy applies for another batch (a sketch of such an allocator follows this paragraph).
|||
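A minimal sketch of such a batched allocator; `allocBatch` is a hypothetical wrapper around the Root Coordinator RPC, and the real Proxy allocator is more elaborate:

```go
package proxy

import "sync"

// idAllocator caches a batch of auto-generated primary keys. allocBatch asks
// Root Coordinator for `count` IDs and returns the first ID of the granted range.
type idAllocator struct {
	mu         sync.Mutex
	next, end  int64 // cached half-open range [next, end)
	batchSize  uint32
	allocBatch func(count uint32) (start int64, err error)
}

// AllocOne hands out one primary key, refilling the local cache when it runs out.
func (a *idAllocator) AllocOne() (int64, error) {
	a.mu.Lock()
	defer a.mu.Unlock()
	if a.next >= a.end {
		start, err := a.allocBatch(a.batchSize)
		if err != nil {
			return 0, err
		}
		a.next, a.end = start, start+int64(a.batchSize)
	}
	id := a.next
	a.next++
	return id, nil
}
```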
|
|||
Proxy forwards ReleaseCollection and ReleasePartition to Query Coordinator, and Query Coordinator then informs Root
Coordinator of these events. After that, Root Coordinator informs all Proxies to close the search-related and
search-result-related message streams.
|||
|
|||
#### 6.2 Interaction with MsgStream |
|||
|
|||
In Milvus 2.0, the log broker serves as the system 'backbone': All data insert and update operations must go through the |
|||
log broker, and worker nodes execute CRUD operations by subscribing to and consuming logs. This design reduces system |
|||
complexity by moving core functions such as data persistence and flashback down to the storage layer, and log pub-sub |
|||
makes the system even more flexible and better positioned for future scaling.
|||
|
|||
So, we should configure a group of Virtual Channels for every collection. Currently every virtual channel has a unique related
physical channel. Taking Pulsar as an example, the physical channel mentioned here is a Pulsar topic.
|||
|
|||
For DmRequest, data will be written to DmChannels, while for DqRequest, requests will be written to DqRequestChannel, |
|||
and the corresponding results of query requests will be written to DqResultChannel. |
|||
|
|||
As the number of collections increases, the number of DmChannels increases on demand, and the number of physical channels
also increases on demand. In the future, the number of physical channels in the system could also be limited to a fixed
number, such as 1024. In this case, the same physical channel would be mapped to virtual channels of different
collections, as shown in the figure below.
|||
|
|||
 |
|||
|
|||
When a collection is created, Root Coordinator needs to decide the number of its DmChannels and the physical
channel mapped by each virtual channel, and persist this information as part of the collection's meta information.
In addition, when the system finds that a collection receives DmRequests frequently, it could allocate more virtual channels
to the collection to increase parallelism and thus increase system throughput. This function is a key point of
future work.
|||
|
|||
For DqRequests, both the request and the result data are written to streams. The request data is written to the DqRequestChannel,
and the result data is written to the DqResultChannel. Proxy writes the requests of a collection into the
DqRequestChannel, and the DqRequestChannel is jointly subscribed to by a group of query nodes. When the query nodes
receive a DqRequest, they write the query results into the DqResultChannel corresponding to the collection. As
the consumer of the DqResultChannel, Proxy is responsible for collecting the query results and aggregating them;
the result is then returned to the client.
|||
|
|||
The DqRequestChannel and DqResultChannel of a collection are allocated by the Query Coordinator. Proxy needs to ask the Query Coordinator for the names of the DqRequestChannel and DqResultChannel of a collection.
|||
DqRequestChannel and DqResultChannel do not need to be persisted and can be freely allocated by Query Coordinator. In |
|||
the actual implementation, the DqRequestChannel of each collection can be exclusive, and the DqResultChannel can be |
|||
exclusive or shared by all collections on Proxy. When Proxy applies for the DqRequestChannel and DqResultChannel |
|||
information of the collection from the Query Coordinator, it can attach Proxy's own ID: ProxyID. |
|||
|
|||
With DqRequestChannel of the collection, Proxy will create a msgstream object to generate data into |
|||
DqRequestChannel. With the DqResultChannel of the collection, Proxy will create a msgstream object, and Proxy will |
|||
consume the data in the DqResultChannel. When these msgstream objects are closed, messages cannot be written to or |
|||
consumed from them. |
|||
|
|||
 |
|||
|
|||
#### 6.3 Interaction with Data Coordinator |
|||
|
|||
In Milvus, the segment is the basic unit of data for reading, writing, and searching. After consuming the data in a DmChannel, the data
nodes store the data in the object storage in units of segments (in the actual implementation, a segment is
divided into multiple small files for writing). In Milvus, a segment is uniquely identified by its segment ID, and the
allocation of segment IDs is the responsibility of the Data Coordinator. The SegmentID to which each row of data is
written needs to be determined before writing to a DmChannel. For a write operation, Proxy hashes each row
by its primary key to determine which DmChannel the row enters (see the sketch after this paragraph). After
counting the number of rows to be written to each DmChannel, Proxy asks the Data Coordinator which SegmentIDs the
newly written data of these DmChannels belong to. In the specific implementation, Proxy preallocates some
quotas from the Data Coordinator to avoid frequent direct gRPC communication with the Data Coordinator.
|||
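A minimal sketch of this routing step; the hash function choice and helper names are illustrative, not the exact hashing used by Milvus:

```go
package proxy

import (
	"encoding/binary"
	"hash/fnv"
)

// channelOf routes one row to a virtual DmChannel index by hashing its primary key.
func channelOf(pk int64, numDmChannels int) int {
	var buf [8]byte
	binary.LittleEndian.PutUint64(buf[:], uint64(pk))
	h := fnv.New64a()
	h.Write(buf[:])
	return int(h.Sum64() % uint64(numDmChannels))
}

// countRowsPerChannel counts how many rows go to each channel; Proxy would then
// ask Data Coordinator for SegmentID assignments per channel based on these counts.
func countRowsPerChannel(pks []int64, numDmChannels int) map[int]int {
	counts := make(map[int]int, numDmChannels)
	for _, pk := range pks {
		counts[channelOf(pk, numDmChannels)]++
	}
	return counts
}
```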
|
|||
One consideration for having the Data Coordinator uniformly assign SegmentIDs is that the Data Coordinator is responsible for
keeping each segment from growing too large and for sealing segments near a high-water mark, so that the
size of each segment is limited to a certain range.
|||
|
|||
Other interactions between Proxy and Data Coordinator are mainly reflected in Proxy querying Data Coordinator for |
|||
the status and statistical information of the segment of the collection. LoadCollection is an example. The |
|||
synchronization semantics of the current LoadCollection needs to know the number of rows currently persisted, so the |
|||
Proxy needs to ask the Data Coordinator for the total number of rows currently persisted. |
|||
|
|||
#### 6.4 Interaction with Query Coordinator |
|||
|
|||
For LoadCollection, LoadPartition, ReleaseCollection, ReleasePartition requests, Proxy directly forwards these |
|||
requests to Query Coordinator for execution after checking and preprocessing these requests. When Proxy receives |
|||
feedback from Query Coordinator, it returns the feedback results to the clients. |
|||
|
|||
The semantics of the Load operation is to load a Collection or Partition from persistent storage into the memory of Query
Nodes, or to import streaming data into QueryNodes so that it can be queried. If the load operation is not performed, the
query operation on the Collection or Partition cannot be performed. For the Load operation, Query Coordinator is
responsible for assigning DmChannels to different QueryNodes, which subscribe to them and receive the
stream data. Query Coordinator also assigns the segments that have been persisted in storage to Query Nodes and has
them loaded.
|||
|
|||
The semantics of the Release operation is the reverse operation of the Load operation, and the function is to unload the |
|||
data of the Collection or Partition from the memory. For Release operations, Query Coordinator is responsible for |
|||
notifying query nodes to unload the corresponding Collection or Partition in memory, and then sending the |
|||
ReleaseDqlMessageStream command to Root Coordinator, and Root Coordinator is responsible for broadcasting the |
|||
ReleaseDqlMessageStream command to all Proxies, so that all related streams used to send search requests and receive |
|||
search result in Proxy will be closed. |
|||
|
|||
The other interaction between Proxy and Query Coordinator is that Proxy needs to query from Query Coordinator for statistics |
|||
about Collection, Partition, and Segment. Taking ShowCollections as an example, if the ShowCollections parameter |
|||
specifies that the query is for Collections that have been loaded into memory, the ShowCollection request will be |
|||
forwarded to QueryCoordinator, and QueryCoordinator will return a list of all the recorded Collections loaded into |
|||
memory. Taking LoadCollection as another example, its synchronization semantics is that the number of rows loaded in |
|||
the memory must be no less than the number of rows that have been persisted. This requires Proxy to ask the Query |
|||
Coordinator for the sum of the number of rows currently loaded into the query nodes in the Collection. |
|||
|
|||
#### 6.5 Decouple Functionality and Communication |
|||
|
|||
 |
|||
|
|||
As shown in the figure above, there are interactions between the various types of components in the Milvus 2.0 system. The
implementation of these interactions varies according to how Milvus is deployed. Milvus Standalone runs all the
components of Milvus as a single independent process on one node, while Milvus Cluster distributes
the components across multiple nodes. In Milvus Standalone, the interaction between components can be parameter passing
between functions or communication over Grpc, and the log system can be either Pulsar or RocksDB. In Milvus Cluster, the
communication between components is mostly carried by gRPC, and the message flow is mostly carried by Pulsar.
|||
|
|||
Therefore, in the original design, Milvus 2.0 decouples the core functions of a component from the communication between
components. Taking Proxy as an example, the core functions of the Proxy component are fixed and have nothing to do
with the deployment form. The project's internal/proxy directory contains the core functions of
Proxy, while internal/distributed/proxy contains the re-encapsulation and communication implementation of Proxy used in
the cluster (distributed) deployment. The following sections mainly
introduce the functions of the Proxy core layer.
|||
|
|||
#### 6.6 Core Components of Proxy |
|||
|
|||
Proxy is mainly composed of four modules: taskScheduler, channelsMgr, channelsTimeTicker, globalMetaCache. taskScheduler |
|||
is responsible for task scheduling; channelsMgr is responsible for the management of DmChannels, DqRequestChannel, |
|||
DqResultChannel and corresponding MsgStream objects of each Collection; channelsTimeTicker is responsible for collecting |
|||
the timestamp information of all physical Channels regularly; globalMetaCache is responsible for caching the metadata of |
|||
Collection and Partition. |
|||
|
|||
##### 6.6.1 taskScheduler |
|||
|
|||
There are three main functions in taskScheduler: |
|||
|
|||
- Schedule task |
|||
- Maintain the snapshot of timestamp statistics |
|||
- Receive the search results from all streams and then distribute them to the related tasks
|||
|
|||
taskScheduler maintains three queues: ddQueue, dmQueue and dqQueue, which correspond to DdRequest, DmRequest, and DqRequest
respectively. The interface of taskQueue is defined as follows:
|||
|
|||
```go |
|||
type taskQueue interface { |
|||
utChan() <-chan int |
|||
utEmpty() bool |
|||
utFull() bool |
|||
addUnissuedTask(t task) error |
|||
FrontUnissuedTask() task |
|||
PopUnissuedTask() task |
|||
AddActiveTask(t task) |
|||
PopActiveTask(tID UniqueID) task |
|||
getTaskByReqID(reqID UniqueID) task |
|||
TaskDoneTest(ts Timestamp) bool |
|||
Enqueue(t task) error |
|||
setMaxTaskNum(num int64) |
|||
getMaxTaskNum() int64 |
|||
} |
|||
``` |
|||
|
|||
Proxy encapsulates each request with a corresponding task object. Each task object implements the task interface. The |
|||
definition of the task interface is as follows: |
|||
|
|||
```go |
|||
type task interface { |
|||
TraceCtx() context.Context |
|||
ID() UniqueID // return ReqID |
|||
SetID(uid UniqueID) // set ReqID |
|||
Name() string |
|||
Type() commonpb.MsgType |
|||
BeginTs() Timestamp |
|||
EndTs() Timestamp |
|||
SetTs(ts Timestamp) |
|||
OnEnqueue() error |
|||
PreExecute(ctx context.Context) error |
|||
Execute(ctx context.Context) error |
|||
PostExecute(ctx context.Context) error |
|||
WaitToFinish() error |
|||
Notify(err error) |
|||
} |
|||
``` |
|||
|
|||
Each specific task object must implement the interface defined by the task. |
|||
|
|||
The key members of taskQueue are unissuedTasks, of type List, and activateTasks, of type map. unissuedTasks
contains all tasks that have not been scheduled, and activateTasks contains all tasks that are being scheduled.
|||
|
|||
When the external caller of taskScheduler puts a task into the corresponding taskQueue, the OnEnqueue
interface of the task is called. When OnEnqueue is called, SetID is called to assign a taskID to the task.
The taskID is globally unique and is used to identify the task. OnEnqueue also calls task.SetTs to set the timestamp
of the task. It follows that the timestamp of a task entering the queue must be greater than the timestamps already
in the queue, and it will also be greater than the timestamps of the tasks in activateTasks. At the end
of the task's OnEnqueue, the taskQueue's addUnissuedTask is called to add the task to unissuedTasks. After OnEnqueue
returns, the external caller of taskScheduler calls WaitToFinish of the task to synchronously block and wait for the
execution of the task to be done.
|||
|
|||
When taskScheduler's background scheduling coroutine decides to schedule a task, it calls the taskQueue's
PopUnissuedTask to remove a task from unissuedTasks, and then calls the taskQueue's AddActiveTask to put the task into
the activateTasks map. Then it executes the task. During the execution of the task, its PreExecute,
Execute, and PostExecute interfaces are called in sequence. If an exception occurs at any step, the error is
returned early and the subsequent steps are skipped. Whether the execution succeeds or fails, the
Notify method is called; this method wakes up the coroutine that is blocked waiting for the task. After the
task is executed, the taskQueue's PopActiveTask is called to take the task out of activateTasks.
|||
|
|||
The following figure is a schematic diagram of taskScheduler's scheduling of DdQueue. |
|||
|
|||
Tasks in DdQueue must be scheduled serially: the task that enters the queue first is
executed first, and at most one task is executing at any time. Only after the task at the head of the queue is executed can the other tasks in
the queue be scheduled.
|||
|
|||
 |
|||
|
|||
The following figure is a schematic diagram of taskScheduler's scheduling of DmQueue. |
|||
|
|||
The tasks in DmQueue can be scheduled in parallel. In one scheduling pass, taskScheduler executes several tasks
from the queue concurrently.
|||
|
|||
 |
|||
|
|||
The following figure is a schematic diagram of taskScheduler's scheduling of DqQueue. |
|||
|
|||
 |
|||
|
|||
The tasks in DqQueue can be scheduled in parallel. In a scheduling process, taskScheduler will execute several tasks |
|||
concurrently. |
|||
|
|||
In order to facilitate the channelsTimeTicker component to obtain the synchronization point information corresponding to all |
|||
DmChannels, the taskScheduler needs to maintain a copy of the time statistics of the physical channels of all currently |
|||
unexecuted and executing tasks in the DmQueue. The member pChanSatisticsInfos is a map containing the mapping from pChan |
|||
to pChanStatInfo pointers. Among them, pChan is an alias of string, and pChanStatInfo is a custom structure, defined as |
|||
follows: |
|||
|
|||
```go |
|||
type Timestamp = uint64 |
|||
type pChan = string |
|||
|
|||
type pChanStatInfo struct { |
|||
maxTs Timestamp |
|||
minTs Timestamp |
|||
tsSet map[Timestamp] struct{} |
|||
} |
|||
``` |
|||
|
|||
pChan is the name of the physical channel corresponding to a DmChannel. The pChanStatInfo structure contains three
members: minTs, maxTs and tsSet. minTs and maxTs represent the minimum and maximum timestamps of all tasks
related to the pChan in the current DmQueue, and tsSet is the set of timestamps of all such tasks in DmQueue. When a
task enters the DmQueue, the queue's addPChanStats method is called: it adds the task's own timestamp to
pChanStatInfo's tsSet and recalculates pChanStatInfo's maxTs. The
rule is simple: if the task's timestamp is greater than the maxTs in pChanStatInfo, then maxTs in
pChanStatInfo is updated. Since a newly added timestamp is always greater than the existing timestamps, there is no need to
update minTs. When the PopActiveTask interface of the queue is called to take a task out of the DmTaskQueue, the
popPChanStats interface of the queue is called to delete the task's timestamp from pChanStatInfo's timestamp set tsSet
and recalculate the minimum timestamp minTs (a sketch of this bookkeeping follows this paragraph).
|||
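A minimal sketch of this bookkeeping. `pChan`, `Timestamp` and `pChanStatInfo` follow the definitions shown above; `dmQueueStats` and the way the task's channels and timestamp are passed in are illustrative simplifications:

```go
// dmQueueStats holds the per-pChan timestamp statistics of unfinished DmQueue tasks.
type dmQueueStats struct {
	infos map[pChan]*pChanStatInfo
}

// addPChanStats is called when a task enters DmQueue.
func (s *dmQueueStats) addPChanStats(channels []pChan, ts Timestamp) {
	for _, ch := range channels {
		info, ok := s.infos[ch]
		if !ok {
			info = &pChanStatInfo{minTs: ts, maxTs: ts, tsSet: map[Timestamp]struct{}{}}
			s.infos[ch] = info
		}
		info.tsSet[ts] = struct{}{}
		if ts > info.maxTs {
			info.maxTs = ts // new timestamps only grow, so minTs stays untouched
		}
	}
}

// popPChanStats is called when a task is taken out of DmTaskQueue.
func (s *dmQueueStats) popPChanStats(channels []pChan, ts Timestamp) {
	for _, ch := range channels {
		info, ok := s.infos[ch]
		if !ok {
			continue
		}
		delete(info.tsSet, ts)
		// Recompute the minimum over the remaining timestamps of this pChan.
		info.minTs = info.maxTs
		for remaining := range info.tsSet {
			if remaining < info.minTs {
				info.minTs = remaining
			}
		}
	}
}
```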
|
|||
DmQueue maintains the timestamp statistics of pChans and provides the method getPChanStatsInfo to the caller. |
|||
pChanStatistics is defined as follows: |
|||
|
|||
```go |
|||
type pChanStatistics struct { |
|||
minTs Timestamp |
|||
maxTs Timestamp |
|||
} |
|||
|
|||
func (queue *DmTaskQueue) getPChanStatsInfo() (map[pChan]*pChanStatistics, error) |
|||
``` |
|||
|
|||
The return value of this method is a mapping from pChan to pChanStatistics pointers. pChanStatistics includes two
members, minTs and maxTs, which respectively represent the minimum and maximum timestamps of all tasks that have not been
completed on the pChan. channelsTimeTicker mainly depends on this method to collect timestamp information. The awakened
coroutine then reduces the collected timestamps and sends the final timestamp result back to RootCoord.
|||
|
|||
taskScheduler is also responsible for collecting the results of search requests. For each search request, when Proxy
writes the request into the DqRequestChannel, it attaches a ReqID, and the query nodes carry the same ReqID
when writing the search results back into the DqResultChannel. taskScheduler starts a background coroutine that consumes
search results from the DqResultChannel and then distributes the messages according to the ReqID in them. When enough results for
the same ReqID have been collected and the termination condition is reached, these results are passed to the blocked task
coroutine that is waiting for them. The awakened task reduces the search results and then sends the final search result to the
client.
|||
|
|||
##### 6.6.2 channelsMgr |
|||
|
|||
channelsMgr is responsible for the management of DmChannels, DqRequestChannel, DqResultChannel and corresponding |
|||
MsgStream objects of each Collection. The interface is defined as follows: |
|||
|
|||
```go |
|||
type channelsMgr interface { |
|||
getChannels(collectionID UniqueID) ([]pChan, error) |
|||
getVChannels(collectionID UniqueID) ([]vChan, error) |
|||
createDQLStream(collectionID UniqueID) error |
|||
getDQLStream(collectionID UniqueID) (msgstream.MsgStream, error) |
|||
removeDQLStream(collectionID UniqueID) error |
|||
removeAllDQLStream() error |
|||
createDMLMsgStream(collectionID UniqueID) error |
|||
getDMLStream(collectionID UniqueID) (msgstream.MsgStream, error) |
|||
removeDMLStream(collectionID UniqueID) error |
|||
removeAllDMLStream() error |
|||
} |
|||
``` |
|||
|
|||
- getChannels and getVChannels |
|||
|
|||
getVChannels returns a list of all virtual DmChannels of a collection, and getChannels returns a list of all
physical DmChannels of the collection. The two lists correspond one-to-one by position.
|||
|
|||
- createDMLStream and getDMLStream |
|||
|
|||
createDMLStream creates the dml message stream of a collection; |
|||
|
|||
getDMLStream returns the dml message stream of a collection; |
|||
|
|||
Proxy uses these dml message streams to write dml data, such as insert requests. |
|||
|
|||
- createDQLStream and getDQLStream |
|||
|
|||
createDQLStream creates the dql message stream of a collection; |
|||
|
|||
getDQLStream returns the dql message stream of a collection; |
|||
|
|||
Proxy uses these dql message streams to send search requests. |
|||
|
|||
The remove-related operations delete the corresponding message stream object, but the stream is not closed
immediately, because there may still be data that needs to be written into it.
|||
|
|||
##### 6.6.3 channelsTimeTicker |
|||
|
|||
channelsTimeTicker is responsible for regularly collecting the synchronization timestamp information of all physical channels. |
|||
|
|||
```go |
|||
type channelsTimeTicker interface { |
|||
start() error |
|||
close() error |
|||
addPChan(pchan pChan) error |
|||
removePChan(pchan pChan) error |
|||
getLastTick(pchan pChan) (Timestamp, error) |
|||
getMinTsStatistics() (map[pChan]Timestamp, error) |
|||
} |
|||
``` |
|||
|
|||
- addPChan and removePChan |
|||
|
|||
addPChan adds a physical channel to channelsTimeTicker; channelsTimeTicker only sends the information of existing
pChans to Root Coordinator;
|||
|
|||
- getMinTsStatistics |
|||
|
|||
getMinTsStatistics returns the timestamp statistics of the physical channels; there is a background coroutine in Proxy
that calls getMinTsStatistics periodically and then sends the timestamp statistics to Root Coordinator;
|||
|
|||
- getLastTick |
|||
|
|||
getLastTick returns the minimum timestamp which has already been synchronized of a physical channel; |
|||
|
|||
channelsTimeTicker maintains two maps: minTsStatistics, which holds the timestamps that can already be
synchronized, and currents, which holds the timestamps that will be synchronized next. Both are mappings from pChan to Timestamp. channelsTimeTicker itself has a background coroutine,
which periodically calls getPChanStatsInfo of DmQueue to obtain the minimum and maximum timestamp information pChanStats
of all unfinished tasks. channelsTimeTicker also requests a timestamp from Root Coordinator as now.
channelsTimeTicker then updates minTsStatistics and currents according to the relationship between now, minTsStatistics,
currents, and pChanStats. The specific algorithm is as follows:
|||
|
|||
```go |
|||
// Sketch in Go form; min returns the smaller of two timestamps.
now := getTimestampFromRootCoord()
for pChan, current := range currents {
	stat, ok := pChanStats[pChan]
	if !ok {
		minTsStatistics[pChan] = current
		currents[pChan] = now
		continue
	}
	minTs, maxTs := stat.minTs, stat.maxTs
	if minTs > current {
		minTsStatistics[pChan] = min(minTs, now)
		next := min(now+sendTimeTickMsgInterval, maxTs)
		currents[pChan] = next
	}
}
|||
``` |
|||
|
|||
##### 6.6.4 globalMetaCache |
|||
|
|||
globalMetaCache is responsible for caching the meta-information of Collection and Partition. These meta-information |
|||
include CollectionID, PartitionID, CollectionSchema, etc. |
|||
|
|||
The interface of Cache is defined as follows: |
|||
|
|||
```go |
|||
type Cache interface { |
|||
GetCollectionID(ctx context.Context, collectionName string) (typeutil.UniqueID, error) |
|||
GetPartitionID(ctx context.Context, collectionName string, partitionName string) (typeutil.UniqueID, error) |
|||
GetPartitions(ctx context.Context, collectionName string) (map[string]typeutil.UniqueID, error) |
|||
GetCollectionSchema(ctx context.Context, collectionName string) (*schemapb.CollectionSchema, error) |
|||
RemoveCollection(ctx context.Context, collectionName string) |
|||
RemovePartition(ctx context.Context, collectionName string, partitionName string) |
|||
} |
|||
``` |
|||
|
|||
- GetCollectionID

  GetCollectionID returns the collection ID of a collection;
|||
|
|||
- GetPartitions |
|||
|
|||
GetPartitions returns a mapping which maps all partition names to partition IDs of collection; |
|||
|
|||
- GetPartitionID

  GetPartitionID returns the partition ID of a specific partition in a collection;
|||
|
|||
- GetCollectionSchema |
|||
|
|||
GetCollectionSchema returns the schema of collection; |
|||
|
|||
- RemoveCollection |
|||
|
|||
RemoveCollection removes the meta information of collection; |
|||
|
|||
- RemovePartition |
|||
|
|||
RemovePartition removes the meta information of partition; |
@ -0,0 +1,94 @@ |
|||
```haskell |
|||
Expr := |
|||
LogicalExpr | NIL |
|||
|
|||
LogicalExpr := |
|||
LogicalExpr BinaryLogicalOp LogicalExpr |
|||
| UnaryLogicalOp LogicalExpr |
|||
| "(" LogicalExpr ")" |
|||
| SingleExpr |
|||
|
|||
BinaryLogicalOp := |
|||
"&&" | "and" |
|||
| "||" | "or" |
|||
|
|||
UnaryLogicalOp := |
|||
"not" |
|||
|
|||
SingleExpr := |
|||
TermExpr |
|||
| CompareExpr |
|||
|
|||
TermExpr := |
|||
IDENTIFIER "in" ConstantArray |
|||
|
|||
ConstantArray := |
|||
"[" ConstantExpr { "," ConstantExpr } "]" |
|||
|
|||
ConstantExpr := |
|||
Constant |
|||
| ConstantExpr BinaryArithOp ConstantExpr |
|||
| UnaryArithOp ConstantExpr |
|||
|
|||
Constant := |
|||
INTEGER |
|||
| FLOAT_NUMBER |
|||
|
|||
UnaryArithOp := |
|||
"+" |
|||
| "-" |
|||
|
|||
BinaryArithOp := |
|||
"+" |
|||
| "-" |
|||
| "*" |
|||
| "/" |
|||
| "%" |
|||
| "**" |
|||
|
|||
CompareExpr := |
|||
IDENTIFIER CmpOp IDENTIFIER |
|||
| IDENTIFIER CmpOp ConstantExpr |
|||
| ConstantExpr CmpOp IDENTIFIER |
|||
| ConstantExpr CmpOpRestricted IDENTIFIER CmpOpRestricted ConstantExpr |
|||
|
|||
CmpOpRestricted := |
|||
"<" |
|||
| "<=" |
|||
|
|||
CmpOp := |
|||
">" |
|||
| ">=" |
|||
| "<" |
|||
| "<=" |
|||
| "==" |
|||
| "!=" |
|||
|
|||
INTEGER := integer literal
FLOAT_NUMBER := floating-point literal
IDENTIFIER := column name
|||
``` |
|||
|
|||
Tips: |
|||
|
|||
1. NIL represents an empty string, which means there is no Predicate for Expr. |
|||
2. The grammar is described in EBNF; expressions that may be omitted or repeated are enclosed in curly braces `{...}`.
|||
|
|||
After syntax analysis, the following rules will be applied: |
|||
|
|||
1. Non-vector columns must exist in the schema.
2. CompareExpr/TermExpr requires operand types to match.
3. CompareExpr between non-vector columns of different types is supported.
4. The modulo operation requires all operands to be integers.
5. Integer columns can only match integer operands, while float columns can match both integer and float operands.
6. Among the binary logical operators, `and`/`&&` has higher precedence than `or`/`||`.
|||
|
|||
Example: |
|||
|
|||
```python |
|||
A > 3 && A < 4 && (C > 5 || D < 6) |
|||
1 < A <= 2.0 + 3 - 4 * 5 / 6 % 7 ** 8 |
|||
A == B |
|||
FloatCol in [1.0, 2, 3.0] |
|||
Int64Col in [1, 2, 3] or C != 6 |
|||
``` |
@ -0,0 +1,98 @@ |
|||
## Root Coordinator recovery on power failure |
|||
|
|||
## 1. Basic idea |
|||
|
|||
1. `RootCoord` (Root Coordinator) reads meta from etcd when it starts. |
|||
2. `RootCoord` needs to store the `position` of the msgstream into etcd every time it consumes the msgstream. |
|||
3. `RootCoord` reads the `position` of msgstream from etcd when it starts up, then it seeks to the specified `position` and re-consumes the msgstream. |
|||
4. Ensure that all messages from the msgstream are processed in an idempotent fashion, so that repeated consumption of the same message does not cause system inconsistencies. |
|||
5. `RootCoord` registers itself in etcd and finds out if the dependent `DataCoord(Data Coordinator)` and `IndexCoord(Index Coordinator)` are online via etcd. |
|||
|
|||
## 2. Specific tasks |
|||
|
|||
### 2.1 Read meta from etcd |
|||
|
|||
1. `RootCoord` needs to load meta from etcd when it starts, this part is already done. |
|||
|
|||
### 2.2 `dd requests` from grpc |
|||
|
|||
1. The `dd requests`, such as create_collection, create_partition, etc., from grpc are marked as done only if the related meta has been written into etcd. |
|||
2. The `dd requests` should be sent to `dd msgstream` when the operation is done. |
|||
3. There may be a fault here, that is, the `dd request` has been written to etcd, but it has not been sent to `dd msgstream` yet, then the `RootCoord` has crashed. |
|||
4. For the scenarios mentioned in item 3, `RootCoord` needs to check if all `dd requests` are sent to `dd msgstream` when it starts up. |
|||
5. `RootCoord`'s built-in scheduler ensures that all grpc requests are executed serially, so it only needs to check whether the most recent `dd requests` are sent to the `dd msgstream`, and resend them if not. |
|||
6. Take `create_collection` as an example to illustrate the process |
|||
- When `create collection` is written to etcd, 2 additional keys are updated, `dd_msg` and `dd_type`. |
|||
- `dd_msg` is the serialized form of the dd request.
|||
- `dd_type` is the message type of `dd_msg`, such as `create_collection`, `create_partition`, `drop_collection`, etc. It's used to deserialize `dd_msg`.
|||
- Update the meta of `create_collection`, `dd_msg` and `dd_type` at the same time in a transactional manner. |
|||
- When `dd_msg` has been sent to `dd msgstream`, delete `dd_msg` and `dd_type` from etcd. |
|||
- When the `RootCoord` starts, first check whether there are `dd_msg` and `dd_type` in etcd. If yes, then deserialize `dd_msg` according to `dd_type`, and then send it to the `dd msgstream`. Otherwise, no processing will be done. |
|||
- There may be a failure here, that is, `dd_msg` has been sent to the `dd msgstream` but has not been deleted from etcd yet, and then the `RootCoord` crashed. In this case, the `dd_msg` would be sent to `dd msgstream` repeatedly, so the receiver needs to take this case into account; a recovery sketch is given below.
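
The following is a minimal sketch of the startup check described in item 6 above, not the actual Milvus implementation; the `MetaKV` and `MsgStream` interfaces and the error handling are assumptions for illustration only.

```go
package rootcoord

import "context"

// Minimal stand-ins for the metastore and msgstream abstractions (assumed names).
type MetaKV interface {
	Load(key string) (string, error)
	MultiRemove(keys []string) error
}

type MsgStream interface {
	Produce(ctx context.Context, msg []byte) error
}

// resendPendingDDMsg re-sends a dd message that was written to etcd but may
// never have reached the dd msgstream before a crash (item 6 above).
func resendPendingDDMsg(ctx context.Context, kv MetaKV, stream MsgStream) error {
	ddType, err := kv.Load("dd_type")
	if err != nil || ddType == "" {
		// No pending marker: the last dd request completed normally.
		// (A "key not found" error is treated the same way in this sketch.)
		return nil
	}
	ddMsg, err := kv.Load("dd_msg")
	if err != nil {
		return err
	}
	// dd_type tells the consumer how to deserialize dd_msg; consumers must be
	// idempotent, since the message may already have been delivered once.
	if err := stream.Produce(ctx, []byte(ddMsg)); err != nil {
		return err
	}
	// Clear the pending markers only after the message is on the stream.
	return kv.MultiRemove([]string{"dd_msg", "dd_type"})
}
```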
|||
|
|||
### 2.3 `create index` requests from grpc |
|||
|
|||
1. In the processing of `create index`, `RootCoord` calls `metaTable`'s `GetNotIndexedSegments` to get all segment ids that are not indexed. |
|||
2. After getting the segment ids, `RootCoord` calls `IndexCoord` to create index on these segment ids. |
|||
3. In the current implementation, the `create index` requests will return after the segment ids are put into a go channel. |
|||
4. The `RootCoord` starts a background task that keeps reading the segment ids from the go channel, and then calls the `IndexCoord` to create the index. |
|||
5. There is a fault here: the segment ids have been put into the go channel in the grpc request's processing function and the grpc call has returned, but the `RootCoord`'s background task has not yet read them from the go channel when `RootCoord` crashes. At this point, the client thinks the index has been created, but `RootCoord` never called `IndexCoord` to create it.
|||
6. The solution for the fault mentioned in item 5: |
|||
- Remove the go channel and `RootCoord`'s background task. |
|||
- In the request processing function of `create index`, the call will return only when all segment ids have been sent to `IndexCoord`.
|||
- Some segment ids may be sent to `IndexCoord` repeatedly, and `IndexCoord` needs to handle such requests. |
|||
|
|||
### 2.4 New segment from `DataCoord` |
|||
|
|||
1. Each time a new segment is created, the `DataCoord` sends the segment id to the `RootCoord` via msgstream. |
|||
2. `RootCoord` needs to update the segment id to the collection meta and record the position of the msgstream in etcd. |
|||
3. Step 2 is transactional and the operation will be successful only if the collection meta in etcd is updated. |
|||
4. So the `RootCoord` only needs to restore the msgstream to the position when recovering from a power failure. |
|||
|
|||
### 2.5 Flushed segment from `data node` |
|||
|
|||
1. Each time the `DataNode` finishes flushing a segment, it sends the segment id to the `RootCoord` via msgstream. |
|||
2. `RootCoord` needs to fetch binlog from `DataCoord` by id and send a request to `IndexCoord` to create an index on this segment. |
|||
3. When the `IndexCoord` is called successfully, it will return a build id, and then `RootCoord` will update the build id to the `collection meta` and record the position of the msgstream in etcd. |
|||
4. Step 3 is transactional and the operation will be successful only if the `collection meta` in etcd is updated. |
|||
5. So the `RootCoord` only needs to restore the msgstream to the position when recovering from a power failure. |
|||
|
|||
### 2.6 Failed to call external grpc service |
|||
|
|||
1. `RootCoord` depends on `DataCoord` and `IndexCoord`, if the grpc call failed, it needs to reconnect. |
|||
2. `RootCoord` does not listen to the status of the `DataCoord` and `IndexCoord` in real time. |
|||
|
|||
### 2.7 Add virtual channel assignment when creating a collection |
|||
|
|||
1. Add a new field, "number of shards", to the `create collection` request. The "num of shards" tells `RootCoord` how many virtual channels to create for this collection.
|||
2. In the current implementation, virtual channels and physical channels have a one-to-one relationship, and the total number of physical channels increases as the number of virtual channels increases; later, the total number of physical channels needs to be fixed, and multiple virtual channels share one physical channel. |
|||
3. The name of the virtual channel is globally unique, and the `collection meta` records the correspondence between the virtual channel and the physical channel. |
|||
|
|||
### 2.8 Add processing of time synchronization signals from Proxy node
|||
|
|||
1. A virtual channel can be inserted by multiple proxies, so the timestamp in the virtual channel does not increase monotonically. |
|||
2. All proxies report the timestamp of all the virtual channels to the `RootCoord` periodically. |
|||
3. The `RootCoord` collects the timestamps reported by the proxies for each virtual channel, takes the minimum one as the timestamp of that virtual channel, and then inserts it into the virtual channel (see the sketch after this list).
|||
4. Proxy reports the timestamp to the `RootCoord` via grpc. |
|||
5. Proxy needs to register itself in etcd when it starts, `RootCoord` will listen to the corresponding key to determine how many active proxies there are, and thus determine if all of them have sent timestamps to `RootCoord`. |
|||
6. If a proxy is not registered in etcd but sends a timestamp or any other grpc request to `RootCoord`, `RootCoord` will ignore the grpc request. |
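
A minimal sketch of the aggregation rule in item 3, assuming each active proxy's latest report is kept in a map; all names below are illustrative, not the actual implementation.

```go
package rootcoord

// computeChannelTimeTicks takes the latest timestamp reported by every active
// proxy for each virtual channel and returns, per channel, the minimum value,
// which is the only timestamp that is safe to insert into that channel.
// Per item 5, it should only be applied once all active proxies have reported.
func computeChannelTimeTicks(reports map[string]map[string]uint64) map[string]uint64 {
	// reports: proxyID -> (virtual channel name -> last reported timestamp)
	minTicks := make(map[string]uint64)
	for _, channels := range reports {
		for vchan, ts := range channels {
			if cur, ok := minTicks[vchan]; !ok || ts < cur {
				minTicks[vchan] = ts
			}
		}
	}
	return minTicks
}
```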
|||
|
|||
### 2.9 Register service in etcd |
|||
|
|||
1. `RootCoord` needs to register itself with etcd when it starts. |
|||
2. The registration should include IP address, port, its own id and global incremental timestamp. |
|||
|
|||
### 2.10 Remove the code related to Proxy service |
|||
|
|||
1. `Proxy service` related code will be removed. |
|||
2. The job of time synchronization which is done by `Proxy service` is partially simplified and handed over to the `RootCoord` (subsection 2.8). |
|||
|
|||
### 2.11 Query collection meta based on timeline |
|||
|
|||
1. Add a new field of `timestamp` to the grpc request of `describe collection`. |
|||
2. `RootCoord` should provide a snapshot of the `collection meta`.
|||
3. Return the `collection meta` at the point of timestamp mentioned in the request. |
|||
|
|||
### 2.12 Timestamp of `dd operations` |
|||
|
|||
1. `RootCoord` is responsible for setting the timestamp of `dd operations` (create collection, create partition, drop collection, drop partition) and sending this timestamp into the `dml msgstream`.
@ -0,0 +1,54 @@ |
|||
# MEP: Dynamic Configuration |
|||
|
|||
Current state: "Accepted" |
|||
|
|||
ISSUE: https://github.com/milvus-io/milvus/issues/18300 |
|||
|
|||
Keywords: config etcd |
|||
|
|||
Released: 2.3.0 |
|||
|
|||
## Summary(required) |
|||
|
|||
At present, there are numerous configurations in Milvus that require a restart of Milvus to take effect. This can interrupt service in production environments and is not friendly to operations and maintenance. In this MEP, a solution for dynamically updating configurations will be provided so that users can make configuration changes without restarting the cluster. |
|||
|
|||
## Motivation(required) |
|||
|
|||
Ability to dynamically modify configurations and expose current configuration information through an API, reducing operational complexity.
|||
|
|||
## Public Interfaces(optional) |
|||
|
|||
No public interfaces are changed.
|||
|
|||
## Design Details(required) |
|||
|
|||
### Goal |
|||
|
|||
1. Support multiple config sources, including Etcd, environment variables, and configuration files. On this basis, add watch events for changes in the Etcd config path and file changes. When a change event occurs, broadcast it to subscribers through an event handler. Subscribers can decide on subsequent logic based on this to achieve the requirement of dynamically modifying configurations. |
|||
2. Configuration priority: Etcd > Environment > milvus.yaml; higher-priority configurations override lower-priority ones. Even if a higher-priority configuration is deleted, lower-priority configurations can still be used. |
|||
3. To ensure compatibility, ignore case and separator characters such as `/`, `.`, and `_` when matching configuration item keys (see the lookup sketch below).
|||
|
|||
 |
|||
ref: https://github.com/go-chassis/go-archaius/ |
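
The Go sketch below illustrates the priority-and-merge behavior described in goals 1–3; the source names and the `Get` semantics are assumptions for illustration, not the actual implementation (which follows go-archaius, see the reference above).

```go
package config

import "unicode"

// source is one configuration source with a priority; higher priority wins.
type source struct {
	priority int               // e.g. etcd = 3, environment = 2, milvus.yaml = 1
	values   map[string]string // normalized key -> value
}

// Get returns the value of key from the highest-priority source that still
// holds it, so deleting a key from etcd falls back to env / yaml instead of
// dropping the setting entirely.
func Get(sources []source, key string) (string, bool) {
	best, val, found := -1, "", false
	for _, s := range sources {
		if v, ok := s.values[normalize(key)]; ok && s.priority > best {
			best, val, found = s.priority, v, true
		}
	}
	return val, found
}

// normalize ignores case and separator characters such as '/', '.' and '_',
// matching the compatibility rule in goal 3.
func normalize(key string) string {
	out := make([]rune, 0, len(key))
	for _, r := range key {
		if r == '/' || r == '.' || r == '_' {
			continue
		}
		out = append(out, unicode.ToLower(r))
	}
	return string(out)
}
```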
|||
|
|||
Non-Goals (not in this release plan):
|||
|
|||
1. Configuration grading
2. Node override config
3. Collection override config
4. Support for more configuration stores such as Consul, ZooKeeper, etc.
|||
|
|||
## Compatibility, Deprecation, and Migration Plan(optional) |
|||
|
|||
Compatible with old versions. |
|||
|
|||
## Test Plan(required) |
|||
|
|||
- Verify the ability to dynamically modify configurations in etcd. |
|||
- Verify the ability of helm and operator to deploy milvus.yaml and modify configurations. |
|||
- Verify the effectiveness of business operations after partially modifying dynamic configurations. |
|||
- Verify version compatibility under dynamic configuration. |
|||
|
|||
## Rejected Alternatives(optional) |
|||
|
|||
## References(optional) |
@ -0,0 +1,77 @@ |
|||
# MEP: Search By Primary Keys |
|||
|
|||
Current state: Under Discussion |
|||
|
|||
ISSUE: [[Feature]: Support to search by primary keys #23184](https://github.com/milvus-io/milvus/issues/23184) |
|||
|
|||
Keywords: Search, ANN |
|||
|
|||
Released: v2.3.0 |
|||
|
|||
## Summary |
|||
|
|||
Support to search (ANNS) by the query vectors corresponding to the given primary keys. |
|||
|
|||
## Motivation |
|||
|
|||
For now, Milvus requires the query vectors to be passed in to do ANN search; if the vectors are already in the collection, we have to fetch them first, which is complex and slow.
|||
|
|||
We need a way to do ANN search directly on the vectors corresponding to the given primary keys, which should be more efficient.
|||
|
|||
## Public Interfaces |
|||
|
|||
Add a new field `primary_keys` to `SearchRequest`; all SDKs add a new method:
|||
```golang |
|||
func SearchByPK( |
|||
ctx context.Context, |
|||
collName string, |
|||
partitions []string, |
|||
expr string, |
|||
outputFields []string, |
|||
primaryKeys []entity.PrimaryKey, |
|||
vectorField string, |
|||
metricType entity.MetricType, |
|||
topK int, |
|||
sp entity.SearchParam, |
|||
opts ...SearchQueryOptionFunc, |
|||
) ([]SearchResult, error) |
|||
``` |
|||
|
|||
## Design Details |
|||
|
|||
Proxy fetches the vectors by primary keys from QueryNodes first, and then searches with these vectors.
|||
|
|||
For better performance, we will add a new RPC interface for QueryNode: |
|||
```proto |
|||
rpc Fetch(FetchRequest) returns (FetchResponse) {} |
|||
``` |
|||
which only supports fetching by primary keys; this will be more efficient than the present `Query` interface.
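
A hedged sketch of the proxy-side flow described above: fetch the vectors for the given primary keys (via the proposed `Fetch` RPC), then run an ordinary vector search. All type and method names below are placeholders, not the final API.

```golang
package proxy

import "context"

// Placeholder types standing in for the real Milvus request/response types.
type FetchResult struct {
	Vectors [][]float32
}

type SearchResult struct {
	IDs    []int64
	Scores []float32
}

type queryNodeClient interface {
	// Fetch retrieves the stored vectors for the given primary keys.
	Fetch(ctx context.Context, collection string, pks []int64, vectorField string) (*FetchResult, error)
}

type searcher interface {
	Search(ctx context.Context, collection string, vectors [][]float32, topK int) ([]SearchResult, error)
}

// SearchByPK resolves the query vectors from primary keys first, then reuses
// the ordinary search path, mirroring the two-step design above.
func SearchByPK(ctx context.Context, qn queryNodeClient, s searcher,
	collection, vectorField string, pks []int64, topK int) ([]SearchResult, error) {
	fetched, err := qn.Fetch(ctx, collection, pks, vectorField)
	if err != nil {
		return nil, err // e.g. some primary keys do not exist -> report an error
	}
	return s.Search(ctx, collection, fetched.Vectors, topK)
}
```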
|||
|
|||
|
|||
## Compatibility, Deprecation, and Migration Plan |
|||
|
|||
None |
|||
|
|||
## Test Plan |
|||
|
|||
### Unit Tests |
|||
|
|||
- Test for fetching in segcore |
|||
- Test for fetching in QueryNode |
|||
- Test for searching by primary keys in Proxy |
|||
|
|||
|
|||
### E2E Tests |
|||
| Test Cases | Expected Behavior |
| :------------------------------------------: | :--------------------------------: |
| search by non-existent primary keys | report an error |
| search by existing primary keys | return topk results for each query |
| all existing test cases of search by vectors | the same behavior |
|||
|
|||
## Rejected Alternatives |
|||
|
|||
Implement this without adding the `Fetch` interface, using `Query` to retrieve the vectors and then search by vectors. |
|||
|
|||
## References |
|||
|
|||
None |
@ -0,0 +1,105 @@ |
|||
# MEP: Default Value |
|||
|
|||
Current state: Under Discussion |
|||
|
|||
ISSUE: [[Feature]: Support Default Value #23337](https://github.com/milvus-io/milvus/issues/23337) |
|||
|
|||
Keywords: Default, Insert, Upsert |
|||
|
|||
Released: v2.3.1 |
|||
|
|||
## Summary |
|||
|
|||
Support default values when inserting data.
|||
|
|||
## Motivation |
|||
|
|||
For now, Milvus does not support default values. If a user needs to pass the same value for a certain field, the data can only be passed in repeatedly, which is neither flexible nor user-friendly.
|||
|
|||
We need a way to support default values, which is more efficient and user-friendly.
|||
|
|||
## Public Interfaces |
|||
|
|||
Add new field `default_value` in `FieldSchema` |
|||
```proto |
|||
message FieldSchema { |
|||
... |
|||
ScalarField default_value = 11; // default_value only support scalars for now |
|||
} |
|||
``` |
|||
|
|||
## Design Details |
|||
|
|||
1. Add the default_value in the field schema as an optional field. |
|||
|
|||
```proto |
|||
|
|||
message FieldSchema { |
|||
... |
|||
ScalarField default_value = 11; // default_value only support scalars for now |
|||
} |
|||
``` |
|||
|
|||
2. The default_value will be used if no data is passed (the field is nil on insert and upsert).
|||
|
|||
```proto |
|||
|
|||
message FieldData { |
|||
... |
|||
oneof field { |
|||
ScalarField scalars = 3; |
|||
VectorField vectors = 4; |
|||
} |
|||
} |
|||
``` |
|||
|
|||
```python |
|||
# create collection |
|||
nb = 3000 |
|||
fields = [ |
|||
FieldSchema(name="int64", dtype=DataType.INT64, is_primary=True), |
|||
# restrict at most one value to be passed in as the default value |
|||
FieldSchema(name="float", dtype=DataType.FLOAT, default_value=1.0) |
|||
] |
|||
schema = CollectionSchema( |
|||
fields=fields, description="collection") |
|||
|
|||
collection = Collection(name="hello_milvus", schema=schema)
|||
|
|||
# insert data |
|||
collection.insert( |
|||
[ |
|||
[i for i in range(nb)], |
|||
# will use the default_value |
|||
[], |
|||
] |
|||
) |
|||
``` |
|||
## Compatibility, Deprecation, and Migration Plan |
|||
|
|||
| Test Cases | Expected Behavior |
| :-----------------------------------------: | :---------------------------------------: |
| schema built in 2.2.x | can be used normally in the new version |
|||
|
|||
## Test Plan |
|||
|
|||
### Unit Tests |
|||
|
|||
- Test for using default value in proxy |
|||
|
|||
|
|||
### E2E Tests |
|||
| Test Cases | Expected Behavior | |
|||
| :------------------------------------------: | :--------------------------------------: | |
|||
| set illegal default value | report error | |
|||
| set legal default value | use default value as fields data | |
|||
| schema built in 2.2.x | can be used normally in the new version | |
|||
| don't set default value | the same | |
|||
|
|||
## Rejected Alternatives |
|||
|
|||
The default value is set per column; the per-row writing form `[1, 2, 3, {default}, {default}, 4, 5]` is not supported.
|||
|
|||
## References |
|||
|
|||
None |
@ -0,0 +1,271 @@ |
|||
# MEP: Refactor QueryNode v2 |
|||
|
|||
Current state: Merged |
|||
|
|||
ISSUE: [[Enhancement]: Refactor QueryNode #21624](https://github.com/milvus-io/milvus/issues/21624) |
|||
|
|||
Keywords: Search, ANN |
|||
|
|||
Released: v2.3.0 |
|||
|
|||
## Summary |
|||
|
|||
By refactoring querynode, we plan to achieve: |
|||
|
|||
- Separate "Delegator" and "Worker" |
|||
- Remove delta channel for deletion forwarding |
|||
- Maintain growing segments in distribution |
|||
- Improve the readability of the code |
|||
|
|||
## Delegator and Worker |
|||
|
|||
`Delegator`, aka `ShardLeader` in querynode v1, handles the segment distribution and consumes data from the dml channel. All the distribution changes(load&release) shall be forwarded by delegators so that they shall always have the latest workable segment distribution information for the shard. |
|||
|
|||
On the other hand, `Worker` serves as pure computing labor and provides search/query services on the segments on it. |
|||
|
|||
One querynode could be `Delegator` and `Worker` at the same time for now. After separating them into two sub packages, we could easily rearrange them into different components in the future if needed. |
|||
|
|||
### Interface Definition |
|||
|
|||
```Go |
|||
// ShardDelegator is the interface definition. |
|||
type ShardDelegator interface { |
|||
// Search & Query APIs |
|||
Search(ctx context.Context, req *querypb.SearchRequest) ([]*internalpb.SearchResults, error) |
|||
Query(ctx context.Context, req *querypb.QueryRequest) ([]*internalpb.RetrieveResults, error) |
|||
GetStatistics(ctx context.Context, req *querypb.GetStatisticsRequest) ([]*internalpb.GetStatisticsResponse, error) |
|||
|
|||
|
|||
// Distribution & dml related APIs |
|||
ProcessInsert(insertRecords map[int64]*InsertData) |
|||
ProcessDelete(deleteData []*DeleteData, ts uint64) |
|||
LoadGrowing(ctx context.Context, infos []*querypb.SegmentLoadInfo, version int64) error |
|||
LoadSegments(ctx context.Context, req *querypb.LoadSegmentsRequest) error |
|||
ReleaseSegments(ctx context.Context, req *querypb.ReleaseSegmentsRequest, force bool) error |
|||
SyncDistribution(ctx context.Context, entries ...SegmentEntry) |
|||
} |
|||
``` |
|||
```Go |
|||
// Worker is the interface definition for querynode worker role. |
|||
type Worker interface { |
|||
LoadSegments(context.Context, *querypb.LoadSegmentsRequest) error |
|||
ReleaseSegments(context.Context, *querypb.ReleaseSegmentsRequest) error |
|||
Delete(ctx context.Context, req *querypb.DeleteRequest) error |
|||
Search(ctx context.Context, req *querypb.SearchRequest) (*internalpb.SearchResults, error) |
|||
Query(ctx context.Context, req *querypb.QueryRequest) (*internalpb.RetrieveResults, error) |
|||
GetStatistics(ctx context.Context, req *querypb.GetStatisticsRequest) (*internalpb.GetStatisticsResponse, error) |
|||
|
|||
|
|||
IsHealthy() bool |
|||
Stop() |
|||
} |
|||
``` |
|||
|
|||
## Remove delta channel |
|||
|
|||
After supporting `Delete` operation in Milvus 2.0.x, delta channels are needed for forwarding delete operation to the querynodes on which there are no related DML channels. |
|||
|
|||
This mechanism makes the system require double the number of message queue topics compared to earlier Milvus versions. Also, it couples the querynode search & query functionality with the forwarder of the delete records. Unfortunately, datanodes took this role, which may lead to search/query unavailability when some datanodes go down for a period of time.
|||
|
|||
Naturally, the Delegator shall become the forwarder since it can consume all the dml data (including deletes) from the message queue. There are some critical points that need to be designed carefully:
|||
|
|||
- How to determine which segment/querynode shall be the target when forwarding the delete operations |
|||
- How to guarantee that all the segments have the whole picture of the deletion data |
|||
|
|||
### Primary Key Oracle(PKOracle) |
|||
|
|||
We need a component which can determine or estimate which segments might contain the data with the provided pk value. Named the PKOracle, it could be implemented in the following ways:
|||
|
|||
- Delegator has all the PK column data |
|||
- Delegator has all the statslog(Bloom filter) files |
|||
- A third party component stores the PK value-segment ID mapping |
|||
|
|||
Since deletion was previously implemented with Bloom filters, option 2 is the natural first choice (a forwarding sketch is given after the Delete definitions below).
|||
|
|||
<img alt="PK Oracle" src="./graphs/pk_oracle.png" width="600" /> |
|||
|
|||
Delete grpc def: |
|||
|
|||
```Go |
|||
Delete(context.Context, *querypb.DeleteRequest) (*commonpb.Status, error) |
|||
``` |
|||
``` Protobuf |
|||
message DeleteRequest { |
|||
common.MsgBase base = 1; |
|||
int64 collection_id = 2; |
|||
int64 partition_id = 3; |
|||
string vchannel_name = 4; |
|||
int64 segment_id = 5; |
|||
schema.IDs primary_keys = 6; |
|||
repeated uint64 timestamps = 7; |
|||
} |
|||
``` |
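
As an illustration only, a delegator could use per-segment Bloom filters (option 2 above) to pick the forwarding targets for a batch of deleted primary keys before issuing `DeleteRequest`s to workers; the `BloomFilter` interface below is a simplified assumption.

```Go
package delegator

// BloomFilter is a stand-in for the per-segment statslog bloom filter.
type BloomFilter interface {
	MayContain(pk int64) bool
}

// pkOracle maps segmentID -> bloom filter built from the segment's PK statslog.
type pkOracle map[int64]BloomFilter

// candidates returns, for each deleted primary key, the segments that may
// contain it. False positives are fine: the segment-side delete is a no-op
// for keys it does not actually hold; false negatives must never happen.
func (o pkOracle) candidates(pks []int64) map[int64][]int64 {
	targets := make(map[int64][]int64) // segmentID -> pks to forward
	for _, pk := range pks {
		for segID, bf := range o {
			if bf.MayContain(pk) {
				targets[segID] = append(targets[segID], pk)
			}
		}
	}
	return targets
}
```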
|||
|
|||
### Delete Forwarding Policy |
|||
|
|||
Delegators need to forward delete operation via grpc before any search/query operation can be executed. There are still several ways to forward the deletion data. |
|||
|
|||
- Forward the delete ASAP and blocks the consuming workflow if forwarding fails |
|||
- Forward the delete lazily, which means the delete data could be forwarded within search or query requests when needed, with extra periodic "flush" jobs
|||
- Forward the processed bitset only |
|||
|
|||
Policy 3 cannot be done without delegators having all the primary key data. So with the pre-determined BF PKOracle implementation, we need to choose between policies 1 & 2.
|||
|
|||
After some investigation, it turned out that all deletion records need to be applied strictly in order of their timestamps. Otherwise, the internal binary search may return a wrong bitset for deletion. So policy 1 became the only choice before changing the segment's inner implementation.
|||
|
|||
### Data Integrity Guarantee |
|||
|
|||
Since Milvus 2.x can be deployed as a distributed system, there are several cases that may damage data integrity:
|||
|
|||
- Asynchronous load
|||
|
|||
In the current design, there is no guarantee that all segments will be ready when delegators forward the deletion records while the collection is being loaded.
|||
|
|||
- Load a new Segment |
|||
|
|||
A new segment might be loaded after the collection has been loaded, because compaction may happen. If the consumed position is after the safe point (all earlier delete operations are synced to the delta log), some delete entries might be missing during this procedure.
|||
|
|||
- Balance, Node down or Rolling upgrade |
|||
|
|||
Similar to the previous case, when balancing segments, some deletion records might be missing as well. The same logic could apply to node down recovery and rolling upgrade. |
|||
|
|||
- Solution: Delete buffer with failure re-consume |
|||
|
|||
To solve the cases in which delete data might be lost, delegators will keep a delete buffer storing "recent" delete data, so that any time a segment is loaded, the delegator can try to patch all the "needed" delete data from this buffer.
By "recent", we mean a size-limited double buffer with a configurable capacity.
"Needed" delete data means the delete records after the segment checkpoint.
If the segment checkpoint is beyond the delete buffer, the delegator will re-consume the delete data from the checkpoint as a last resort.
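
A sketch of the patching rule above, assuming the delegator keeps an in-memory list of timestamped delete records; the structure ignores the double-buffer sizing details.

```Go
package delegator

// deleteRecord is one forwarded delete entry kept by the delegator.
type deleteRecord struct {
	ts  uint64
	pks []int64
}

// deleteBuffer keeps "recent" delete records; startTs is the timestamp of the
// oldest record still held (older records have been evicted).
type deleteBuffer struct {
	startTs uint64
	records []deleteRecord // ordered by ts ascending
}

// patch returns the delete records a newly loaded segment still needs, i.e.
// everything after the segment checkpoint. The second return value is false
// when the checkpoint is older than the buffer, in which case the delegator
// must re-consume delete data from the stream as a last resort.
func (b *deleteBuffer) patch(segmentCheckpoint uint64) ([]deleteRecord, bool) {
	if segmentCheckpoint < b.startTs {
		return nil, false
	}
	var needed []deleteRecord
	for _, r := range b.records {
		if r.ts > segmentCheckpoint {
			needed = append(needed, r)
		}
	}
	return needed, true
}
```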
|||
|
|||
## Other changes |
|||
|
|||
### Use pipeline instead of flowgraph |
|||
|
|||
Pipeline is a simplified flowgraph in which every node has at most one in-degree and one out-degree.
|||
|
|||
Like an assembly line, the pipeline splits work that is repeated over time into many different parts, and every node is a single goroutine working on one of these parts, improving throughput through parallelism.
|||
|
|||
In the querynode, the pipeline is used to handle messages from the MsgStream: FilterNode filters out the invalid parts of a message, InsertNode inserts rows from the message into segments, and DeleteNode applies delete rows from the message to segments and updates TSafe.
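
For illustration, a linear pipeline where each node is a goroutine connected to the next by a channel might look like the sketch below; this is an assumed structure, not the actual querynode code.

```Go
package pipeline

// node is one stage of a linear pipeline: at most one input and one output,
// each stage running in its own goroutine, as described above.
type node[T any] struct {
	work func(T) T
}

// run wires the stages together with channels. Each stage processes messages
// independently, so filtering, inserting and delete-applying can overlap.
func run[T any](in <-chan T, stages ...node[T]) <-chan T {
	cur := in
	for _, st := range stages {
		next := make(chan T)
		go func(st node[T], in <-chan T, out chan<- T) {
			defer close(out)
			for msg := range in {
				out <- st.work(msg)
			}
		}(st, cur, next)
		cur = next
	}
	return cur
}
```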
|||
|
|||
### Search/Query tsafe |
|||
|
|||
Since the only consumer is the delegator, the tsafe waiting logic is moved to the delegator for now.
|||
|
|||
## Interfaces |
|||
|
|||
### Manager |
|||
```Go |
|||
type CollectionManager interface { |
|||
// Get returns collection within a LRU cache, |
|||
// it will pull the collection from QueryCoord if it's not in the cache, |
|||
// returns error if failed to pull |
|||
Get(collectionID int64) (*Collection, error) |
|||
} |
|||
|
|||
type SegmentManager interface { |
|||
// Put puts the given segments in, |
|||
// and increases the ref count of the corresponding collection, |
|||
// dup segments will not increase the ref count |
|||
Put(segmentType SegmentType, segments ...*Segment) |
|||
Get(segmentID UniqueID) *Segment |
|||
GetSealed(segmentID UniqueID) *Segment |
|||
GetGrowing(segmentID UniqueID) *Segment |
|||
// Remove removes the given segment, |
|||
// and decreases the ref count of the corresponding collection, |
|||
// will not decrease the ref count if the given segment not exists |
|||
Remove(segmentID UniqueID, scope querypb.DataScope) |
|||
} |
|||
``` |
|||
|
|||
### Loader |
|||
```Go |
|||
type Loader interface { |
|||
// Load loads binlogs, and spawn segments, |
|||
// NOTE: make sure the ref count of the corresponding collection will never go down to 0 during this |
|||
Load(ctx context.Context, collectionID int64, segmentType SegmentType, version int64, infos ...*querypb.SegmentLoadInfo) ([]Segment, error) |
|||
} |
|||
``` |
|||
|
|||
### Segment |
|||
```Go |
|||
type Segment interface { |
|||
// Properties |
|||
ID() int64 |
|||
Collection() int64 |
|||
Partition() int64 |
|||
Channel() string |
|||
Version() int64 |
|||
StartPosition() *internalpb.MsgPosition |
|||
Type() SegmentType |
|||
|
|||
// Index related |
|||
AddIndex(fieldID int64, index *IndexedFieldInfo) |
|||
GetIndex(fieldID int64) *IndexedFieldInfo |
|||
HaveIndex(fieldID int64) bool |
|||
|
|||
// Insert related |
|||
Insert(entityIDs []int64, timestamps []Timestamp, record *segcorepb.InsertRecord) error |
|||
Delete(entityIDs []storage.PrimaryKey, timestamps []typeutil.Timestamp) error |
|||
|
|||
// Query related |
|||
Search(searchReq *searchRequest) (*SearchResult, error) |
|||
Retrieve(plan *RetrievePlan) (*segcorepb.RetrieveResults, error) |
|||
} |
|||
|
|||
func NewSegment(collection *Collection, |
|||
segmentID int64, |
|||
partitionID int64, |
|||
collectionID int64, |
|||
channel string, |
|||
segmentType SegmentType, |
|||
version int64, |
|||
startPosition *internalpb.MsgPosition) (*Segment, error) |
|||
func DeleteSegment(segment *Segment) |
|||
``` |
|||
|
|||
## Collection |
|||
```Go |
|||
type Collection struct { |
|||
} |
|||
|
|||
func (c *Collection) ID() UniqueID |
|||
func (c *Collection) Schema() *schemapb.CollectionSchema |
|||
func (c *Collection) GetPartitions() []int64 |
|||
func (c *Collection) HasPartition(partitionID int64) bool |
|||
func (c *Collection) AddPartition(partitionIDs ...int64) |
|||
func (c *Collection) RemovePartition(partitionID int64) |
|||
func (c *Collection) GetLoadType() querypb.LoadType |
|||
func NewCollection(collectionID int64, schema *schemapb.CollectionSchema, loadType querypb.LoadType) *Collection |
|||
func DeleteCollection(collection *Collection) |
|||
``` |
|||
|
|||
## PipelineManager |
|||
```Go |
|||
type PipelineManager struct { |
|||
} |
|||
|
|||
func (m *PipelineManager) Num() int |
|||
func (m *PipelineManager) Add(collectionID UniqueID, dmlChannels []string) error |
|||
func (m *PipelineManager) Get(collectionID UniqueID, channel Channel) (*Pipeline, error) |
|||
func (m *PipelineManager) Remove(channels []Channel) |
|||
func (m *PipelineManager) Close() |
|||
``` |
|||
|
|||
## Test Plan |
|||
|
|||
### Unit tests |
|||
|
|||
All packages in querynode v2 should reach about 80% coverage.
|||
|
|||
### E2E Tests |
|||
|
|||
All existing load/release/search/query test cases pass.
|||
|
|||
### Integration Tests |
|||
|
|||
- Worker delete failed test cases |
|||
- Worker offline test cases |
|||
|
|||
## References |
|||
|
|||
None |
|||
|
@ -0,0 +1,52 @@ |
|||
# MEP: Add collection level auto compaction config |
|||
|
|||
Current state: In Progress |
|||
|
|||
ISSUE: [[Enhancement]: Support collection level config to disable auto-compaction #23993](https://github.com/milvus-io/milvus/issues/23993) |
|||
|
|||
Keywords: Collection, Compaction, Config |
|||
|
|||
Released: N/A |
|||
|
|||
## Summary |
|||
|
|||
Compaction has a config item that controls whether auto-compaction is enabled. This configuration is global and impacts all collections in the system.
|||
|
|||
In some scenarios, we may want a finer-grained auto-compaction switch so that we can:
|||
|
|||
- Disable auto-compaction while importing data to avoid rebuilding indexes
|||
- Disable auto-compaction during some test cases to make system behavior stable |
|||
|
|||
## Design |
|||
|
|||
Add a collection-level attribute with the key "collection.autocompaction.enabled" (see also pkg/common/common.go).
|||
|
|||
While handling a compaction signal, check the collection-level configuration (see the sketch after this list):
|||
|
|||
- If not set, use global auto-compaction setting |
|||
- If config is valid, use collection level setting |
|||
- If config value is invalid, fallback to global setting |
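
The check might look like the following sketch; the attribute key matches `collection.autocompaction.enabled`, while the function and parameter names are assumptions for illustration.

```go
package datacoord

import "strconv"

const autoCompactionKey = "collection.autocompaction.enabled"

// autoCompactionEnabled decides whether auto-compaction runs for a collection:
// the collection-level attribute wins when present and valid, otherwise the
// global setting applies (including when the value cannot be parsed as a bool).
func autoCompactionEnabled(collectionProps map[string]string, globalEnabled bool) bool {
	v, ok := collectionProps[autoCompactionKey]
	if !ok {
		return globalEnabled
	}
	enabled, err := strconv.ParseBool(v)
	if err != nil {
		return globalEnabled // invalid value falls back to the global switch
	}
	return enabled
}
```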
|||
|
|||
|
|||
## How to change this setting |
|||
|
|||
All collection-level attributes can be changed via the `AlterCollection` API.
|||
|
|||
## Test Plan |
|||
|
|||
### Unit tests |
|||
|
|||
Add unit tests for collection level auto compaction switch. |
|||
|
|||
### E2E Tests |
|||
|
|||
Change some cases to disable collection auto-compaction in order to stabilize test case behavior.
|||
|
|||
### Integration Tests |
|||
|
|||
- Add test to check auto compaction disabled |
|||
|
|||
## References |
|||
|
|||
None |
|||
|
@ -0,0 +1,121 @@ |
|||
# MEP: Datanode remove dependency of `Datacoord` |
|||
|
|||
Current state: "Accepted" |
|||
|
|||
ISSUE: https://github.com/milvus-io/milvus/issues/26758 |
|||
|
|||
Keywords: datacoord, datanode, flush, dependency, roll-upgrade |
|||
|
|||
## Summary |
|||
|
|||
Remove the dependency of `Datacoord` for `Datanodes`. |
|||
|
|||
## Motivation |
|||
|
|||
1. Datanodes shall always be running even when the data coordinator is not alive
|||
|
|||
If a datanode performs `sync` during a rolling upgrade, it needs datacoord to change the related meta in the metastore. If datacoord happens to be offline, or is itself in the middle of the rolling upgrade, the datanode has to panic to ensure no data is lost.
|||
|
|||
2. Flush operation is complex and error-prone since the whole procedure involves datacoord, datanodes and grpc
|||
|
|||
This proposal means to remove the dependency of datacoord ensuring: |
|||
|
|||
- the data is intact and no duplicate data is kept in the records
|||
- no compatibility issue during or after rolling upgrade |
|||
- `Datacoord` shall be able to detect the segment meta updates and provides recent targets for `QueryCoord` |
|||
|
|||
## Design Details |
|||
|
|||
In brief, this proposal is to:
|||
|
|||
- Make `Datanode` operate on the segment meta directly
|||
- Make `Datacoord` refresh the latest segment changes periodically
|||
|
|||
|
|||
### Preventing multiple writers |
|||
|
|||
There is a major concern: if multiple `Datanodes` are handling the same dml channel, only one `DataNode` shall be able to update the segment meta successfully.
|||
|
|||
This guarantee was previously provided by the singleton writer in `Datacoord`: it checks for a valid watcher id before updating the segment meta when receiving the `SaveBinlogPaths` grpc call.
|||
|
|||
In this proposal, `DataNodes` update segment meta on its own, so we need to introduce a new mechanism to prevent this error from happening: |
|||
|
|||
{% note %} |
|||
|
|||
**Note:** Like the "etcd lease for key", the ownership of each dml channel is bound to a lease id. This lease id shall be recorded in metastore (etcd/tikv or any other implementation). |
|||
When a `DataNode` starts to watch a dml channel, it shall read this lease id (via etcd or a grpc call). ANY operation on this dml channel shall be performed in a transaction that checks the lease id is still equal to the previously read value (see the sketch after the checklist below).
If a `datanode` finds the lease id has been revoked or updated, it shall close the flowgraph/pipeline and cancel all pending operations instead of panicking.
|||
|
|||
{% endnote %} |
|||
|
|||
- [ ] Add a lease id field to the etcd channel watch info / grpc watch request
- [ ] Add `TransactionIf`-like APIs to the `TxnKV` interface
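
A sketch of the lease-guarded update under the assumptions above: `TxnKV` is extended with a hypothetical `TransactionIf` that applies writes only when the stored lease id still matches the one the DataNode read when it started watching the channel.

```go
package datanode

import "errors"

// ErrLeaseMismatch signals that channel ownership moved to another DataNode;
// the caller should stop the flowgraph instead of panicking.
var ErrLeaseMismatch = errors.New("channel lease changed, stop writing")

// TxnKV is a stand-in for the metastore interface; TransactionIf is the
// proposed compare-and-write primitive (assumed signature).
type TxnKV interface {
	TransactionIf(compareKey, expectedValue string, saves map[string]string) (bool, error)
}

// saveSegmentMeta writes segment meta directly from the DataNode, guarded by
// the dml channel's lease id read at watch time.
func saveSegmentMeta(kv TxnKV, channelLeaseKey, leaseAtWatch string, metaKVs map[string]string) error {
	ok, err := kv.TransactionIf(channelLeaseKey, leaseAtWatch, metaKVs)
	if err != nil {
		return err
	}
	if !ok {
		return ErrLeaseMismatch
	}
	return nil
}
```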
|||
|
|||
### Updating channel checkpoint |
|||
|
|||
Likewise, all channel checkpoint updates are currently performed by `Datacoord`, invoked via grpc calls from `DataNodes`, so they have the same problem in the previously stated scenarios.
|||
|
|||
So, "updating channel checkpoint" shall also be processed in `DataNodes` while removing the dependency of `DataCoord`. |
|||
|
|||
The rule the system shall follow is:
|||
|
|||
{% note %} |
|||
|
|||
**Note:** Segment meta shall be updated *BEFORE* changing the channel checkpoint, in case the datanode crashes during the procedure. Under this premise, re-consuming from the old checkpoint recovers all the data, and duplicated entries will be discarded by the segment checkpoints.
|||
|
|||
{% endnote %} |
|||
|
|||
### Updating segment status in `DataCoord` |
|||
|
|||
As previously described, `DataCoord` shall refresh the segment meta and channel checkpoints periodically to provide a recent target for `QueryCoord`.
|||
|
|||
The `watching via Etcd` strategy is ruled out first since the `Watch` operation shall be avoided in future designs: currently the Milvus system tends not to use the `Watch` operation and tries to remove it from the metastore.
Also, `Watch` is heavy and has caused lots of issues before.
|||
|
|||
The winning option is to: |
|||
|
|||
{% note %} |
|||
|
|||
**Note:** `Datacoord` reloads from the metastore periodically.
Optimization 1: reload the channel checkpoints first, then reload the segment meta only if the newly read revision is greater than the in-memory one.
Optimization 2: after `L0 segment` is implemented, datacoord shall refresh growing segments only.
|||
|
|||
{% endnote %} |
|||
|
|||
|
|||
## Compatibility, Deprecation, and Migration Plan |
|||
|
|||
This change shall guarantee that: |
|||
|
|||
- When new `Datacoord` starts, it shall be able to upgrade the old watch info and add lease id into it |
|||
- For watch info, release then watch |
|||
- For grpc, `release then watch` is the fallback; first try calling watch with the lease id
- Older `DataNodes` can still invoke `SaveBinlogPaths` and other legacy grpc calls without panicking
- New `DataNodes` receiving an old watch request (without a lease id) shall fall back to the older strategy, which is to update meta via grpc
- `SaveBinlogPaths` and `UpdateChannelCheckpoints` APIs shall be kept until the next breaking change
|||
|
|||
## Test Plan |
|||
|
|||
### Unit test |
|||
Coverage over 90% |
|||
|
|||
### Integration Test |
|||
|
|||
#### Datacoord offline |
|||
|
|||
1. Insert data without datanodes online |
|||
2. Start datanodes |
|||
3. Make datacoord go offline after channel assignment |
|||
4. Assert no datanode panicking and all data shall be intact |
|||
5. Bring back datacoord and test `GetRecoveryInfo`, which shall return the latest target
|||
|
|||
|
|||
#### Compatibility |
|||
|
|||
1. Start mock datacoord |
|||
2. Construct a watch info (without a lease id)
3. The datanode starts to watch the dml channel, and all meta updates shall be performed via grpc
|||
|
|||
## Rejected Alternatives |
|||
|
|||
DataCoord refresh meta via Etcd watch |
@ -0,0 +1,297 @@ |
|||
# Milvus Row Level Security (RLS) Design Document |
|||
|
|||
## ✨ Overview |
|||
|
|||
Row Level Security (RLS) provides fine-grained access control at the row level for collections in Milvus. By enabling RLS and defining policies based on user identity, roles, or dynamic tags, administrators can enforce data access restrictions without modifying application logic or data structures. |
|||
|
|||
--- |
|||
|
|||
## ⚙️ Core Capabilities |
|||
|
|||
| Feature | Description | |
|||
|-----------------------|--------------------------------------------------------------| |
|||
| Enable/Disable RLS | Toggle RLS at the collection level with runtime control | |
|||
| Enforce RLS | Enforce RLS even for superusers and administrators | |
|||
| Policy Definition | Define policies based on user ID, roles, field values, or tags | |
|||
| Multi-policy Support | Support for multiple policies per action/role combination | |
|||
| User Tag Mechanism | Use dynamic user metadata for flexible access filtering | |
|||
| Expression Language | Rich expression syntax for complex access control rules | |
|||
|
|||
--- |
|||
|
|||
## 🔖 User Tag Mechanism |
|||
|
|||
RLS leverages runtime user context including `$current_user_name` and `$current_user_tags` to evaluate access policies dynamically. |
|||
|
|||
### ✅ Setting User Tags |
|||
|
|||
```python |
|||
client.set_user_tags( |
|||
user="user_abc", |
|||
tags={ |
|||
"department": "engineering", |
|||
"region": "us-west-1", |
|||
"tenant": "customer_a", |
|||
"security_level": "confidential" |
|||
} |
|||
) |
|||
``` |
|||
|
|||
### ✅ Tag Management APIs |
|||
|
|||
| API | Description | |
|||
| ---------------------------- | ---------------------------------------------- | |
|||
| `set_user_tags(user, tags)` | Set or update user tags (overwrites existing) | |
|||
| `delete_user_tag(user, key)` | Delete a specific tag key for a user | |
|||
| `get_user_tags(user)` | Fetch all user tag information | |
|||
| `list_users_with_tag(key, value)` | Find users with specific tag values | |
|||
|
|||
Tags can be referenced in policy expressions using the following syntax: |
|||
|
|||
```python |
|||
using_expr="region == $current_user_tags['region']" |
|||
check_expr="security_level >= $current_user_tags['clearance']" |
|||
``` |
|||
|
|||
--- |
|||
|
|||
## 🛠️ API Design |
|||
|
|||
### 1. Enable or Disable RLS |
|||
|
|||
```python |
|||
# Enable RLS for a collection |
|||
client.alter_collection_properties( |
|||
collection="my_collection", |
|||
properties={"rls.enabled": True} |
|||
) |
|||
|
|||
# Disable RLS for a collection |
|||
client.alter_collection_properties( |
|||
collection="my_collection", |
|||
properties={"rls.enabled": False} |
|||
) |
|||
``` |
|||
|
|||
### 2. Enforce RLS (even for superusers) |
|||
|
|||
```python |
|||
client.alter_collection_properties( |
|||
collection="my_collection", |
|||
properties={ |
|||
"rls.enabled": True, |
|||
"rls.force": True # Applies to all users including superusers |
|||
} |
|||
) |
|||
``` |
|||
|
|||
### 3. Create an RLS Policy |
|||
|
|||
```python |
|||
client.create_row_policy( |
|||
collection="user_documents", |
|||
policy_name="limit_to_user", |
|||
actions=["query", "insert", "delete", "update"], |
|||
roles=["$current_user", "user_role"], |
|||
using_expr="user_id == $current_user_name", |
|||
check_expr="user_id == $current_user_name", |
|||
description="Restrict users to their own documents" |
|||
) |
|||
``` |
|||
|
|||
**Policy Parameters:** |
|||
- `collection`: Target collection name |
|||
- `policy_name`: Unique identifier for the policy |
|||
- `actions`: List of operations this policy applies to (`query`, `insert`, `delete`, `update`) |
|||
- `roles`: List of roles this policy applies to (`$current_user`, `admin`, custom roles) |
|||
- `using_expr`: Expression for filtering data during queries |
|||
- `check_expr`: Expression for validating data during mutations |
|||
- `description`: Optional human-readable description |
|||
|
|||
### 4. Delete an RLS Policy |
|||
|
|||
```python |
|||
client.drop_row_policy( |
|||
collection="user_documents", |
|||
policy_name="limit_to_user" |
|||
) |
|||
``` |
|||
|
|||
### 5. List All RLS Policies |
|||
|
|||
```python |
|||
policies = client.list_row_policies(collection="user_documents") |
|||
# Example response: |
|||
# [ |
|||
# { |
|||
# "policy_name": "limit_to_user", |
|||
# "using_expr": "user_id == $current_user_name", |
|||
# "check_expr": "user_id == $current_user_name", |
|||
# "roles": ["$current_user"], |
|||
# "actions": ["query", "insert", "delete"], |
|||
# "description": "Restrict users to their own documents", |
|||
# "created_at": "2024-01-15T10:30:00Z" |
|||
# } |
|||
# ] |
|||
``` |
|||
|
|||
### 6. Get Collection RLS Status |
|||
|
|||
```python |
|||
status = client.get_collection_properties( |
|||
collection="user_documents", |
|||
properties=["rls.enabled", "rls.force"] |
|||
) |
|||
# Returns: {"rls.enabled": True, "rls.force": False} |
|||
``` |
|||
|
|||
--- |
|||
|
|||
## ✅ Usage Examples |
|||
|
|||
### Example 1: Users Can Only Access Their Own Data |
|||
|
|||
**Scenario:** A document management system where users should only see and modify their own documents. |
|||
|
|||
**Collection Schema:** |
|||
```python |
|||
# Collection includes a user_id field |
|||
{ |
|||
"user_id": "string", |
|||
"document_name": "string", |
|||
"content": "string", |
|||
"created_at": "timestamp" |
|||
} |
|||
``` |
|||
|
|||
**RLS Policy:** |
|||
```python |
|||
client.create_row_policy( |
|||
collection="user_documents", |
|||
policy_name="user_own_data", |
|||
actions=["query", "insert", "delete", "update"], |
|||
roles=["$current_user"], |
|||
using_expr="user_id == $current_user_name", |
|||
check_expr="user_id == $current_user_name", |
|||
description="Users can only access their own documents" |
|||
) |
|||
``` |
|||
|
|||
--- |
|||
|
|||
### Example 2: Role-Based Access Control |
|||
|
|||
**Scenario:** Admins have full access, managers see department data, users see only their own data. |
|||
|
|||
**User Policy (restricted):** |
|||
```python |
|||
client.create_row_policy( |
|||
collection="employee_records", |
|||
policy_name="user_scope", |
|||
actions=["query", "insert", "delete", "update"], |
|||
roles=["$current_user"], |
|||
using_expr="employee_id == $current_user_name", |
|||
check_expr="employee_id == $current_user_name" |
|||
) |
|||
``` |
|||
|
|||
**Manager Policy (department scope):** |
|||
```python |
|||
client.create_row_policy( |
|||
collection="employee_records", |
|||
policy_name="manager_scope", |
|||
actions=["query", "insert", "update"], |
|||
roles=["manager"], |
|||
using_expr="department == $current_user_tags['department']", |
|||
check_expr="department == $current_user_tags['department']" |
|||
) |
|||
``` |
|||
|
|||
**Admin Policy (full access):** |
|||
```python |
|||
client.create_row_policy( |
|||
collection="employee_records", |
|||
policy_name="admin_full_access", |
|||
actions=["query", "insert", "delete", "update"], |
|||
roles=["admin"], |
|||
using_expr="true", |
|||
check_expr="true" |
|||
) |
|||
``` |
|||
|
|||
--- |
|||
|
|||
### Example 3: Multi-Tenant Data Isolation |
|||
|
|||
**Scenario:** SaaS application with tenant-based data isolation using user tags. |
|||
|
|||
**Policy:** |
|||
```python |
|||
client.create_row_policy( |
|||
collection="customer_data", |
|||
policy_name="tenant_isolation", |
|||
actions=["query", "insert", "delete", "update"], |
|||
roles=["$current_user"], |
|||
using_expr="tenant_id == $current_user_tags['tenant']", |
|||
check_expr="tenant_id == $current_user_tags['tenant']" |
|||
) |
|||
``` |
|||
|
|||
**User Tag Setup:** |
|||
```python |
|||
client.set_user_tags( |
|||
user="user_123", |
|||
tags={"tenant": "acme_corp", "role": "analyst"} |
|||
) |
|||
``` |
|||
|
|||
--- |
|||
|
|||
### Example 4: Time-Based Access Control |
|||
|
|||
**Scenario:** Documents are only accessible during business hours for non-admin users. |
|||
|
|||
**Policy:** |
|||
```python |
|||
client.create_row_policy( |
|||
collection="sensitive_documents", |
|||
policy_name="business_hours_access", |
|||
actions=["query"], |
|||
roles=["$current_user"], |
|||
using_expr="(hour(now()) >= 9 AND hour(now()) <= 17) OR $current_user_tags['role'] == 'admin'", |
|||
check_expr="true" |
|||
) |
|||
``` |
|||
|
|||
--- |
|||
|
|||
## 🔒 Security Model Notes |
|||
|
|||
### Policy Evaluation |
|||
- **OR Logic**: All policies for a user are OR-combined; if any policy grants access, the operation is allowed (see the sketch after this list)
|||
- **Action-Specific**: Policies are evaluated based on the specific action being performed |
|||
- **Role Matching**: Users must have at least one role that matches the policy's role list |
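
As an illustration of the OR-combination rule only (not an actual Milvus component), a policy engine might assemble the read-side row filter for a query like this; expression strings follow the document's examples, and all names are placeholders.

```go
package rls

import "strings"

// Policy is a simplified view of an RLS policy for evaluation purposes.
type Policy struct {
	Actions   []string
	Roles     []string
	UsingExpr string
}

// buildRowFilter returns the filter expression to AND onto a user's query:
// the using_expr of every policy that matches the action and one of the
// user's roles, OR-combined. An empty result means no policy grants access.
func buildRowFilter(policies []Policy, action string, userRoles map[string]bool) string {
	var parts []string
	for _, p := range policies {
		if !contains(p.Actions, action) {
			continue
		}
		for _, r := range p.Roles {
			if r == "$current_user" || userRoles[r] {
				parts = append(parts, "("+p.UsingExpr+")")
				break
			}
		}
	}
	return strings.Join(parts, " || ")
}

func contains(list []string, v string) bool {
	for _, s := range list {
		if s == v {
			return true
		}
	}
	return false
}
```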
|||
|
|||
### Access Control Levels |
|||
- **Default Behavior**: RLS applies only to non-superusers |
|||
- **Force Mode**: With `rls.force=True`, RLS applies to everyone including superusers and administrators |
|||
- **Bypass Options**: Superusers can temporarily bypass RLS for maintenance operations |
|||
|
|||
### Expression Language |
|||
- **Field References**: Use field names directly in expressions |
|||
- **Variables**: `$current_user_name`, `$current_user_tags`, `$current_roles` |
|||
- **Functions**: Support for common functions like `now()`, `hour()`, `date()` |
|||
- **Operators**: Standard comparison and logical operators |
|||
|
|||
### Performance Considerations |
|||
- **Index Usage**: RLS expressions should leverage indexed fields for optimal performance |
|||
- **Expression Complexity**: Complex expressions may impact query performance |
|||
- **Policy Count**: Large numbers of policies per collection may affect evaluation speed |
|||
|
|||
### Best Practices |
|||
- **Principle of Least Privilege**: Start with restrictive policies and gradually expand access |
|||
- **Regular Auditing**: Periodically review and test RLS policies |
|||
- **Documentation**: Maintain clear documentation of policy purposes and effects |
|||
- **Testing**: Test policies with various user roles and scenarios before production deployment |
|||
|
|||
|
@ -0,0 +1,116 @@ |
|||
# JSON Storage Design Document |
|||
|
|||
## 1. Data Model Design |
|||
|
|||
### 1.1 Data Layering |
|||
|
|||
#### Dense Part |
|||
A set of "core fields" (such as primary keys and commonly used metadata) that are present in most records. |
|||
|
|||
#### Sparse Part |
|||
Additional attributes that appear only in some records, potentially involving unstructured or dynamically extended information. |
|||
|
|||
### 1.2 JSON Splitting and Mapping |
|||
|
|||
#### Dense Field Extraction |
|||
When parsing JSON, predefined dense fields are extracted and mapped to independent columns in Parquet. A method similar to Parquet Variant Shredding is used to flatten nested data. |
|||
|
|||
#### Sparse Data Preservation |
|||
Fields not included in the dense part are stored in a sparse data field. They are serialized using BSON (Binary JSON) format, leveraging its efficient binary representation and rich data type support, with the result stored in a Parquet BINARY type field. |
|||
|
|||
## 2. Storage Strategy |
|||
|
|||
### 2.1 Columnar Storage for Dense Data |
|||
- **Schema Definition**: Create independent columns in Parquet for each dense field, explicitly specifying data types (such as numeric, string, list, etc.). |
|||
- **Query Performance**: Columnar format is suitable for large data scanning and aggregation operations, improving query efficiency, especially for vectors, indexes, and frequently queried fields. |
|||
|
|||
### 2.2 Row Storage for Sparse Data |
|||
- **BSON Storage**: |
|||
- Serialize sparse data as BSON binary format and store it in a single binary column of the Parquet file. |
|||
- BSON format not only compresses more efficiently but also preserves complete data type information of the original data, avoiding numerous null values and file fragmentation issues. |
|||
|
|||
## 3. Parquet Schema Construction |
|||
- **Columnar Part**: Build a fixed schema based on dense fields, with each field having a clear data type definition. |
|||
- **Row Part**: Define a dedicated field (e.g., `sparse_data`) for storing sparse data, with type set to BINARY, directly storing BSON data. |
|||
- **Hybrid Mode**: When writing, dense data is filled into respective columns, and remaining sparse data is serialized as BSON and written to the `sparse_data` field, achieving a balance between query efficiency and storage flexibility. |
|||
|
|||
## 4. Integration and Implementation Considerations |
|||
|
|||
### 4.1 Data Classification Strategy |
|||
- **Density Classification**: |
|||
- Classify fields as dense or sparse based on their frequency of occurrence in records (e.g., greater than 30% for dense), while considering data type consistency. If a field has multiple data types, the types that appear in more than 30% of records are treated as dense fields, and the remaining types are stored as sparse fields (see the sketch after this list).
|||
- **Dynamic Extension**: |
|||
- For dynamically extended fields, regardless of frequency, store them in the BSON-formatted sparse part to simplify schema evolution. |
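
A minimal sketch of the 30% density rule, ignoring the per-type refinement; field statistics are simply counted over a sample of records, and all names are illustrative.

```go
package jsonstorage

// classifyFields applies the density rule described above: a field seen in
// more than the threshold fraction of records is treated as a dense column;
// everything else stays in the BSON sparse part.
func classifyFields(records []map[string]interface{}, threshold float64) map[string]bool {
	counts := make(map[string]int)
	for _, rec := range records {
		for field := range rec {
			counts[field]++
		}
	}
	dense := make(map[string]bool)
	for field, n := range counts {
		if float64(n) > threshold*float64(len(records)) {
			dense[field] = true
		}
	}
	return dense
}
```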
|||
|
|||
### 4.2 Indexing for Sparse Data Access |
|||
|
|||
#### Sparse Column Key Indexing |
|||
To accelerate BSON parsing, an inverted index stores BSON keys along with their offsets and sizes or values if they are of numeric type. |
|||
|
|||
##### Value Data Structure Diagram |
|||
| Valid | Type | Row ID | Value or Offset/Size |
|:-----:|:-----:|:------:|:--------------------:|
| 1 bit | 4 bits | 27 bits | 32 bits (value, or 16-bit offset + 16-bit size) |
|||
|
|||
- **64-bit Structure Breakdown** (a packing sketch follows this list):
  - **Bit 1 (Valid)**: 1 bit indicating data validity (1 = valid, 0 = invalid).
  - **Bits 2-5 (Type)**: 4 bits representing the data type.
  - **Bits 6-32 (Row ID)**: 27 bits for the row ID, uniquely identifying the data row.
  - **Bits 33-64 (last 32 bits)**:
    - If **Valid = 1**: the last 32 bits store the actual data value.
    - If **Valid = 0**: the last 32 bits are split into:
      - **First 16 bits (Offset)**: indicates the data offset position.
      - **Last 16 bits (Size)**: indicates the data size.
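
The sketch below packs and unpacks one 64-bit entry using the bit positions in the breakdown above; it is illustrative only, and the caller is assumed to compose the offset/size payload when the valid flag is 0.

```go
package sparseindex

// Layout (most significant to least significant bits):
//   1 bit  valid flag
//   4 bits type tag
//  27 bits row id
//  32 bits payload: the value itself when valid == true, otherwise
//          a 16-bit offset in the high half and a 16-bit size in the low half.
func pack(valid bool, typ uint8, rowID uint32, payload uint32) uint64 {
	var v uint64
	if valid {
		v = 1
	}
	return v<<63 |
		uint64(typ&0x0F)<<59 |
		uint64(rowID&0x07FFFFFF)<<32 |
		uint64(payload)
}

func unpack(e uint64) (valid bool, typ uint8, rowID uint32, payload uint32) {
	valid = e>>63 == 1
	typ = uint8(e >> 59 & 0x0F)
	rowID = uint32(e >> 32 & 0x07FFFFFF)
	payload = uint32(e)
	return
}
```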
|||
|
|||
The column key index is optional, and can be configured at table creation time or modified later through field properties. |
|||
|
|||
## 5. Example Data |
|||
|
|||
### 5.1 Example JSON Records |
|||
|
|||
```json |
|||
[ |
|||
{"id": 1, "attr1": "value1", "attr2": 100}, |
|||
{"id": 2, "attr1": "value2", "attr3": true}, |
|||
{"id": 3, "attr1": "value3", "attr4": "extra", "attr5": 3.14} |
|||
] |
|||
``` |
|||
|
|||
- **Dense Data:** |
|||
- The field `id` is considered dense. |
|||
- **Sparse Data:** |
|||
- Record 1: `attr1`, `attr2` |
|||
- Record 2: `attr1`, `attr3` |
|||
- Record 3: `attr1`, `attr4`, `attr5` |
|||
|
|||
### 5.2 Parquet File Storage |
|||
|
|||
#### Schema Representation |
|||
|
|||
| Column Name | Data Type | Description | |
|||
|--------------|-----------|-------------| |
|||
| **id** | int64 | Dense column storing the integer identifier. | |
|||
| **sparse_data** | binary | Sparse column storing BSON-serialized data of all remaining fields. | |
|||
| **sparse_index** | binary | Index column storing key offsets for efficient parsing. | |
|||
|
|||
#### Stored Data Breakdown |
|||
|
|||
- **Dense Column (`id`)**: |
|||
- Row 1: `1` |
|||
- Row 2: `2` |
|||
- Row 3: `3` |
|||
|
|||
- **Sparse Column (`sparse_data`)**: |
|||
- **Row 1:** BSON representation of `{"attr1": "value1", "attr2": 100}` |
|||
- **Row 2:** BSON representation of `{"attr1": "value2", "attr3": true}` |
|||
- **Row 3:** BSON representation of `{"attr1": "value3", "attr4": "extra", "attr5": 3.14}` |
|||
|
|||
- **Sparse Index (`sparse_index`)**: |
|||
- **Row 1:** Index entries mapping `attr1` and `attr2` to their respective positions in `sparse_data`. |
|||
- **Row 2:** Index entries mapping `attr1` and `attr3`. |
|||
- **Row 3:** Index entries mapping `attr1`, `attr4`, and `attr5`. |
|||
|
|||
In an actual system, the sparse data would be serialized using a BSON library (e.g., bsoncxx) for a compact binary format. The example above demonstrates the logical mapping of JSON data to the Parquet storage format. |
|||
|
|||
--- |
|||
|
@ -0,0 +1,142 @@ |
|||
# Primary Key Index Design Document |
|||
|
|||
## 1. Introduction |
|||
|
|||
This document outlines the design of Milvus' primary key indexing system, which enables fast lookups of string or integer primary keys across multiple segments. The index will be loaded in the Delegator and persisted in S3 storage. |
|||
|
|||
## 2. Objectives and Benefits |
|||
|
|||
1. **Deduplication**: Identify duplicate data during write operations, automatically converting them to Insert + Delete operations |
|||
2. **Accelerate Partial Updates**: Improve performance of partial upsert and point query operations |
|||
3. **Optimize Delete Forwarding**: Reduce Bloom Filter check overhead in the Delegator during Delete operations |
|||
|
|||
## 3. Design Overview |
|||
|
|||
### 3.1 Core Components |
|||
|
|||
1. **BBhash**: A space-efficient hash structure that maps keys to a continuous range of integers without collisions. The master branch works with Plain Old Data types (POD), while the "alltypes" branch supports other types including strings. |
|||
2. **Value Array**: A memory-mapped array storing segment position information for each primary key. |
|||
|
|||
### 3.2 Architecture Details |
|||
|
|||
1. **BBhash**: |
|||
BBhash is a minimal perfect hash library for static key collections, capable of mapping each key to a unique, compact integer index. For example: |
|||
|
|||
- "user123" → 0 |
|||
- "user456" → 1 |
|||
- "user789" → 2 |
|||
|
|||
For string primary keys, BBhash processes the raw byte sequence directly without type conversion and supports variable-length strings. Key features include: |
|||
- No need to store original strings |
|||
- Full content hashing reduces collision probability |
|||
- Extremely low memory usage |
|||
|
|||
2. **Value Array**: |
|||
This array stores segment metadata for each primary key. It can be accessed directly using the BBhash mapping result, providing **O(1)** query efficiency. |
|||
|
|||
3. **Example Code**: |
|||
```cpp |
|||
// Example code for building and using the primary key index |
|||
|
|||
// Building the index |
|||
void buildPrimaryKeyIndex(const std::vector<std::string>& keys, const std::vector<SegmentInfo>& segmentInfos) { |
|||
// Initialize BBhash with the keys |
|||
bbhash::PerfectHasher<std::string> hasher(keys); |
|||
|
|||
// Initialize value array with appropriate size |
|||
std::vector<SegmentInfo> valueArray(keys.size()); |
|||
|
|||
// Populate value array with segment information |
|||
for (size_t i = 0; i < keys.size(); i++) { |
|||
size_t index = hasher.lookup(keys[i]); |
|||
valueArray[index] = segmentInfos[i]; |
|||
} |
|||
|
|||
// Persist the index to storage |
|||
hasher.save("bbhash.idx"); |
|||
saveValueArray(valueArray, "value_array.bin"); |
|||
} |
|||
|
|||
// Reading from the index |
|||
SegmentInfo lookupPrimaryKey(const std::string& key) { |
|||
// Load BBhash from storage (or use cached instance) |
|||
bbhash::PerfectHasher<std::string> hasher; |
|||
hasher.load("bbhash.idx"); |
|||
|
|||
// Load value array (or use memory-mapped instance) |
|||
std::vector<SegmentInfo> valueArray = loadValueArray("value_array.bin"); |
|||
|
|||
// Lookup the key |
|||
size_t index = hasher.lookup(key); |
|||
if (index != bbhash::NOT_FOUND) { |
|||
return valueArray[index]; |
|||
} |
|||
|
|||
return SegmentInfo(); // Return empty segment info if not found |
|||
} |
|||
``` |
|||
|
|||
## 4. Index Structure Illustration |
|||
|
|||
### 4.1 BBhash Workflow |
|||
|
|||
BBhash (Bin Bloom Hash) maps keys to unique indices through multi-level hash functions: |
|||
|
|||
1. The first level hash attempts to map all keys to non-conflicting positions |
|||
2. For keys with conflicts, a next-level hash function is used for remapping |
|||
3. This process iterates until all keys are mapped without conflicts |
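The sketch below is a rough, illustrative model of this cascade (it is not the actual BBhash code): at every level, keys that land alone in a slot are fixed, and colliding keys fall through to the next level. Real BBhash additionally compacts the assigned positions into a dense 0..n-1 range using rank operations over per-level bitmaps.

```go
package main

import (
	"fmt"
	"hash/fnv"
)

// hashAt hashes a key with a per-level seed (FNV-1a here, purely illustrative).
func hashAt(key string, level int, slots int) int {
	h := fnv.New64a()
	fmt.Fprintf(h, "%d:%s", level, key)
	return int(h.Sum64() % uint64(slots))
}

// buildLevels mimics the BBhash cascade: at each level, keys that hash to a
// slot hit by exactly one key are assigned; the rest are remapped at the
// next level. It returns a per-key position (level offsets flattened).
func buildLevels(keys []string) map[string]int {
	assigned := map[string]int{}
	pending := keys
	base := 0
	for level := 0; len(pending) > 0; level++ {
		slots := len(pending) * 2 // load factor, illustrative only
		counts := make([]int, slots)
		for _, k := range pending {
			counts[hashAt(k, level, slots)]++
		}
		var next []string
		for _, k := range pending {
			if slot := hashAt(k, level, slots); counts[slot] == 1 {
				assigned[k] = base + slot // unique slot: key is fixed at this level
			} else {
				next = append(next, k) // collision: retry at the next level
			}
		}
		base += slots
		pending = next
	}
	return assigned
}

func main() {
	fmt.Println(buildLevels([]string{"user123", "user456", "user789"}))
}
```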
|||
|
|||
### 4.2 Value Array Storage Structure |
|||
|
|||
Each entry in the value array contains: |
|||
- Segment ID (pointing to the segment containing the primary key) |
|||
|
|||
 |
|||
|
|||
For L1 Segments, we don't need primary key indexing and can use Bloom Filters for approximate filtering with false positives. For L2 Segments, we build PK → Segment mappings for data under each bucket. Note that false positives still exist here due to: 1. Data that has been deleted, and 2. BBhash's small probability of false positives (approximately 1/2³² ≈ 2.3×10⁻¹⁰). |
|||
|
|||
- **Memory Efficiency**: |
|||
- BBhash: 2–4 bits/key (1B keys ≈ 250–500MB) |
|||
- Value Array: ~4 bytes/key (Segment ID) |
|||
- Total: ~4.5 bytes/key → 1B keys ≈ 4.5GB |
|||
- mmap implementation allows the operating system to load and reclaim memory as needed, supporting billion-scale datasets |
|||
|
|||
### 4.3 Performance Analysis |
|||
|
|||
#### 4.3.1 Index Building Performance |
|||
|
|||
- **Single-thread Performance**: BBhash constructs a minimal perfect hash function (MPHF) for 100 million keys in about 10 seconds on a single thread, processing approximately 10 million keys/second |
|||
- **Multi-thread Scalability**: Using 8 threads, building an MPHF for 1 billion keys takes about 35 seconds, averaging approximately 28.57 million keys/second |
|||
- **Billion-scale Construction Feasibility**: |
|||
- On a 32-core server, theoretical time to build a 1 billion key index is about 10-15 seconds |
|||
- In actual testing, end-to-end time including data reading and index construction reaches 1 minute |
|||
- Peak memory usage does not exceed 16GB |
|||
|
|||
#### 4.3.2 Query Performance Comparison |
|||
|
|||
- **Single Primary Key Index vs. Multiple Bloom Filters**: |
|||
- **Query Latency**: |
|||
- Primary Key Index: ~200 nanoseconds per query |
|||
- 10,000 Bloom Filters: Sequential querying required, average latency ~10,000 × 10 nanoseconds = 0.1 milliseconds |
|||
- **Performance Gap**: Primary key index query speed is approximately 500 times faster than the Bloom filter approach |
|||
|
|||
- **Throughput**: |
|||
- Primary Key Index: ~10-20 million queries per second per node |
|||
- Bloom Filter Approach: ~1,000 queries per second per node |
|||
- **Advantage**: Primary key index supports higher query loads in high-concurrency scenarios |
|||
|
|||
#### 4.3.3 Precision Comparison |
|||
|
|||
- **BBhash Precision**: |
|||
- Actual implementation may have an extremely small probability of hash collisions, but far lower than Bloom filters |
|||
|
|||
- **Bloom Filter Precision**: |
|||
- Single Bloom filter false positive rate is typically set to 0.1% |
|||
- Cumulative false positive rate when querying 10,000 Bloom filters approaches 100% |
|||
|
|||
## 5. Additional Considerations |
|||
|
|||
1. Performance validation, including index construction and querying, comparing BBhash and other libraries such as CMPH |
|||
2. Whether BBHash can also replace Bloom filters for individual Segments |
|||
3. How to handle false positives: ignore them, or verify in each segment? |
|||
4. Adding redundant fields to the value index for point query optimization, such as recording additional offset information or even field values |
@ -0,0 +1,36 @@ |
|||
# Segcore Search Design |
|||
|
|||
init: 7.23.2021, by [FluorineDog](https://github.com/FluorineDog) |
|||
|
|||
update: 2.10.2022, by [zhuwenxing](https://github.com/zhuwenxing) |
|||
|
|||
## Search |
|||
|
|||
Search now supports two modes: the JSON DSL mode and the Boolean Expr mode. We discuss the latter in detail because the former has been deprecated and is only used in tests. |
|||
|
|||
The execution mode of Boolean Expr works as follows: |
|||
|
|||
1. The client packs the search expression, topk, and query vector into a proto message and sends it to the Proxy node. |
|||
2. The Proxy node unmarshals the proto, parses it into a logical plan, performs a static check, and generates the protobuf IR. |
|||
3. The Query node unmarshals the plan, generates an executable plan AST, and runs the query in segcore. |
|||
|
|||
See details of expression usage at [Boolean Expression Rules](https://milvus.io/docs/v2.0.0/boolean.md) |
|||
|
|||
## Segcore Search Process |
|||
|
|||
After obtaining the AST, the execution engine uses the visitor pattern to interpret and execute the whole AST: |
|||
|
|||
1. Each node includes two steps, a mandatory vector search and an optional predicate. |
|||
|
|||
1. If Predicate exists, execute predicate expression stage to generate bitset as the vector search bitmask. |
|||
2. If Predicate does not exist, the vector search bitmask will be empty. |
|||
3. Bitmask will be used to mark filtered out / deleted entities in the vector execution engine. |
|||
|
|||
2. Currently, Milvus supports the following nodes in the AST; the visitor pattern interprets and executes them from top to bottom to generate the final bitmask. |
|||
|
|||
1. LogicalUnaryExpr: not expression |
|||
2. LogicalBinaryExpr: and or expression |
|||
3. TermExpr: in expression `A in [1, 2, 3]` |
|||
4. CompareExpr: compare expression `A > 1` `B <= 1` |
|||
|
|||
3. TermExpr and CompareExpr are leaf nodes of execution. |
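The simplified Go sketch below shows how such an expression tree can be folded into a bitmask before the vector search. Segcore itself does this in C++ through visitor classes over the plan nodes, so the types and the method dispatch used here are illustrative only.

```go
package main

import "fmt"

// Expr is a node of a simplified predicate AST; Eval returns a per-row
// "filtered out" mask over the rows, mirroring how the predicate stage
// produces a bitset before the vector search.
type Expr interface{ Eval(rows []map[string]int64) []bool }

// TermExpr models `field in [values...]`.
type TermExpr struct {
	Field  string
	Values map[int64]bool
}

func (e TermExpr) Eval(rows []map[string]int64) []bool {
	out := make([]bool, len(rows))
	for i, r := range rows {
		out[i] = !e.Values[r[e.Field]] // true means the row is filtered out
	}
	return out
}

// CompareExpr models `field > value` (only ">" shown for brevity).
type CompareExpr struct {
	Field string
	Value int64
}

func (e CompareExpr) Eval(rows []map[string]int64) []bool {
	out := make([]bool, len(rows))
	for i, r := range rows {
		out[i] = !(r[e.Field] > e.Value)
	}
	return out
}

// AndExpr combines two children; a row survives only if both children keep it.
type AndExpr struct{ Left, Right Expr }

func (e AndExpr) Eval(rows []map[string]int64) []bool {
	l, r := e.Left.Eval(rows), e.Right.Eval(rows)
	out := make([]bool, len(rows))
	for i := range out {
		out[i] = l[i] || r[i] // filtered out if either side filters it out
	}
	return out
}

func main() {
	rows := []map[string]int64{{"A": 1, "B": 5}, {"A": 2, "B": 0}, {"A": 3, "B": 9}}
	expr := AndExpr{
		Left:  TermExpr{Field: "A", Values: map[int64]bool{1: true, 2: true, 3: true}},
		Right: CompareExpr{Field: "B", Value: 3},
	}
	fmt.Println(expr.Eval(rows)) // bitmask handed to the vector search stage
}
```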
@ -0,0 +1,16 @@ |
|||
# Glossary |
|||
|
|||
- `Collection`: Data table containing multiple Segments. |
|||
- `Segment`: The in-memory structure that stores a piece of data; it supports concurrent insertion, deletion, query, index loading, monitoring, and statistics. |
|||
- `Schema`: Definition of collection data format, including |
|||
  - `vector<FieldMeta>`: Ordered list of FieldMeta. |
|||
- `isAutoId`: If set to True , the default primary field is `RowId` and it is auto-generated. |
|||
- `primaryKey`: (When `isAutoId = False`) Specify primary key field. |
|||
- `FieldMeta`: Field properties, including |
|||
  - `DataType`: Data type, including Int8...Int64, Float, Double, FloatVector and BinaryVector, with String support planned. |
|||
  - `Dim`: (When dataType is a vector type) The vector dimension. |
|||
  - `metric_type`: (When dataType is a vector type, optional) The metric type of this vector field; it is related to the small-batch index and can be empty. |
|||
- `FieldName`: Column name. |
|||
- `FieldId`: Unique number of the column. |
|||
  - (hidden) `FieldOffset`: The subscript of the field in the schema's `vector<FieldMeta>`. Internal segcore computation is largely based on field offsets. |
|||
- `Span`: Similar to `std::span`. It supports vector data types and can be implicitly converted to `SpanBase` for interface overriding. |
@ -0,0 +1,19 @@ |
|||
# Scripts and Tools |
|||
|
|||
The following scripts and commands may be used during segcore development. |
|||
|
|||
## code format |
|||
|
|||
- under milvus/internal/core directory |
|||
- run `./run_clang_format.sh .` to format cpp code |
|||
- to call clang-format-12, need to install `apt install clang-format-12` in advance |
|||
- call `build-support/add_${lang}_license.sh` to add license info for cmake and cpp files |
|||
- under milvus/ directory |
|||
- use `make cppcheck` to check format, including |
|||
- if clang-format is executed |
|||
- if license info is added |
|||
    - whether the `cpplint.py` standard is met; violations might need to be fixed by hand |
|||
- `make verifier` also includes functions in `make cppcheck` |
|||
|
|||
- under milvus/ directory |
|||
- use `make static-check` to check golang code format |
@ -0,0 +1,63 @@ |
|||
# SegmentGrowing |
|||
|
|||
Growing segment has the following additional interfaces: |
|||
|
|||
1. `PreInsert(size) -> reservedOffset`: serial interface, which reserves space for future insertion and returns the `reservedOffset`. |
|||
|
|||
2. `Insert(reservedOffset, size, ...Data...)`: write `...Data...` into range `[reservedOffset, reservedOffset + size)`. This interface is allowed to be called concurrently. |
|||
|
|||
   1. `...Data...` contains two system attributes, row_ids and timestamps, plus the other data columns |
|||
2. data columns can be stored either row-based or column-based. |
|||
3. `PreDelete & Delete(reservedOffset, row_ids, timestamps)` is a delete interface similar to insert interface. |
|||
|
|||
Growing segments store data in the form of chunks. The number of rows in each chunk is restricted by configuration. |
|||
|
|||
Rows per chunk are controlled by the `size_per_chunk` config parameter. |
|||
|
|||
When inserting, first allocate enough space to ensure `total_size <= num_chunk * size_per_chunk`, and then convert data from row format to column format. |
|||
|
|||
During a search, each 'chunk' will be searched, and the search results will be saved as 'subquery result', then reduced into TopK. |
|||
|
|||
Growing Segment also implements small batch index for vectors. The parameters of small batch index are preset in `segcore config` |
|||
|
|||
When `metric type` is specified in the schema, the default parameters will build an index for each chunk to accelerate query |
|||
|
|||
## SegmentGrowingImpl internal |
|||
|
|||
1. SegcoreConfig: contains parameters for segcore; it has to be specified before creating a segment |
|||
2. InsertRecord: where inserted data is put |
|||
3. DeleteRecord: reserved for the delete implementation |
|||
4. IndexingRecord: contains data with small-batch indexes |
|||
5. SealedIndexingRecord: not used anymore |
|||
|
|||
### SegcoreConfig |
|||
|
|||
1. Manages chunk_size and small-index parameters |
|||
2. `parse_from` can parse from YAML files (this function is not enabled by default) |
|||
- refer to `${milvus}/internal/core/unittest/test_utils/test_segcore.yaml` |
|||
3. `default_config` offers default parameters |
|||
|
|||
### InsertRecord |
|||
|
|||
Used to manage concurrent inserted data, including: |
|||
|
|||
1. `atomic<int64_t> reserved` reserved space calculation |
|||
2. `AckResponder` tracks which inserted ranges have been acknowledged and returns the current visible segment offset |
|||
3. `ConcurrentVector` stores data columns, each column has one concurrent vector |
|||
|
|||
The following steps are executed on insert (see the sketch after this list): |
|||
 |
|||
1. Serially execute `PreInsert(size) -> reserved_offset` to allocate memory space; the range `[reserved_offset, reserved_offset + size)` is reserved. |
|||
2. Execute the `Insert(reserved_offset, size, ...Data...)` interface in parallel, copying data into the reserved range: |
|||
 |
|||
   - First, for each column's `ConcurrentVector`, call `grow_to_at_least` to reserve space. |
|||
   - For each column's data, call the `set_data_raw` interface to put the data into the corresponding locations. |
|||
   - After execution finishes, call `AddSegment` of `AckResponder` to mark the range `[reserved_offset, reserved_offset + size)` as already inserted. |
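The following Go sketch is a toy model of this reserve-then-copy flow (the real implementation is C++; `growingSegment`, its fields, and the simplified ack counter are illustrative assumptions): `PreInsert` reserves a range with an atomic add, and concurrent `Insert` calls copy into disjoint ranges.

```go
package main

import (
	"fmt"
	"sync"
	"sync/atomic"
)

// growingSegment is a toy model of the reserve-then-insert flow described
// above: PreInsert reserves a contiguous range serially (an atomic add), and
// Insert copies data into that range from multiple goroutines.
type growingSegment struct {
	reserved int64   // next free offset (the `reserved` counter)
	column   []int64 // one pre-grown column, stands in for ConcurrentVector
	mu       sync.Mutex
	acked    int64 // stands in for AckResponder's visible offset
}

func (s *growingSegment) PreInsert(size int64) int64 {
	return atomic.AddInt64(&s.reserved, size) - size // start of the reserved range
}

func (s *growingSegment) Insert(offset int64, data []int64) {
	copy(s.column[offset:], data) // set_data_raw equivalent
	s.mu.Lock()
	s.acked += int64(len(data)) // AddSegment equivalent (simplified: no gap tracking)
	s.mu.Unlock()
}

func main() {
	seg := &growingSegment{column: make([]int64, 1024)}
	var wg sync.WaitGroup
	for batch := 0; batch < 4; batch++ {
		data := []int64{int64(batch), int64(batch), int64(batch)}
		off := seg.PreInsert(int64(len(data))) // serial reservation
		wg.Add(1)
		go func(off int64, data []int64) { // parallel copy
			defer wg.Done()
			seg.Insert(off, data)
		}(off, data)
	}
	wg.Wait()
	fmt.Println("rows visible:", seg.acked, "column prefix:", seg.column[:12])
}
```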
|||
|
|||
### ConcurrentVector |
|||
|
|||
This is a columnar data store that supports concurrent insertion. It is composed of multiple data chunks. |
|||
 |
|||
1. After `grow_to_at_least(size)` is called, space for no less than `size` elements is reserved |
|||
2. `set_data_raw(element_offset, source, element_count)`: `source` points to a continuous piece of data to be copied in |
|||
3. `get_span(chunk_id)` gets the span of the corresponding chunk |
@ -0,0 +1,48 @@ |
|||
# Segment Interface |
|||
|
|||
## External Interface |
|||
|
|||
1. `get_row_count`: Get the number of entities in the segment |
|||
2. `get_schema`: Get the corresponding collection schema in the segment |
|||
3. `GetMemoryUsageInBytes`: Get memory usage of a segment |
|||
4. `Search(plan, placeholderGroup, timestamp) -> QueryResult`: Perform search operations according to the plan containing search parameters and predicate conditions, and return search results. Ensure that the time of all search results is before the specified timestamp(MVCC) |
|||
5. `FillTargetEntry(plan, &queryResult)`: Fill the missing column data for search results based on target columns in the plan |
|||
|
|||
See design details `${milvus_root}/internal/core/src/segcore/SegmentInterface.h` |
|||
|
|||
## Basic Concepts: |
|||
|
|||
1. Segment: Data is sharded into segments based on written timestamp, and the sharding logic is controlled by data coordinator. |
|||
2. Chunk: Further division of segment data, chunk is continuous data for each column |
|||
- There will be only one chunk in each sealed segment. |
|||
- In growing segment, chunks are currently divided by a fixed number of rows. With data ingestion, the number of chunks will increase |
|||
3. Span: Similar to std::span, point to continuous data in memory |
|||
4. SystemField: Extra field stores system info, currently including RowID and Timestamp field. |
|||
5. SegOffset: The entity identifier in the segment |
|||
|
|||
## SegmentInternalInterface internal functions |
|||
|
|||
1. `num_chunk()`: total chunk number |
|||
2. `size_per_chunk()`: length of each chunk |
|||
3. `get_active_count(Timestamp)`: entity count after filter by Timestamp |
|||
4. `chunk_data(FieldOffset, chunk_id) -> Span<T>`: return continuous data for specified column and chunk |
|||
5. `chunk_scalar_index(FieldOffset, chunk_id) -> const StructuredIndex<T>&`: return the inverted index of specified column and chunk |
|||
6. `num_chunk_index`: the number of indexes (including scalars and vector indexes) that have been created: |
|||
1. In growing segment, this value is the number of chunks for which the inverted index has been created. In these chunks, the index can be used to speed up the calculation. |
|||
2. SealedSegment must be 1 |
|||
7. `debug()`: debug is used to print extra information while debugging |
|||
8. `vector_search (vec_count, query..., timestamp, bitset, output)`: Search the vector column |
|||
    1. `vec_count`: specifies how many entities participate in the vector search calculation; the rest of the entities are filtered out because their timestamps are larger than the specified timestamp. This is mainly used in growing segments for multi-version concurrency control (MVCC) |
|||
2. `query...`: multiple variables jointly specify the parameters and search vector |
|||
    3. `timestamp`: used for time travel, to filter out data by timestamp; mainly for sealed segments |
|||
4. `bitset`: calculated bit mask value as an output |
|||
5. `output`: output QueryResult |
|||
9. `bulk_subscript(FieldOffset|SystemField, seg_offsets..., output)`: |
|||
- given seg_offsets, calculate `results[i] = FieldData[seg_offsets[i]]`, for GetEntityByIds |
|||
- FieldData is defined by FieldOffset or SystemField |
|||
10. `search_ids(IdArray, timestamp) -> pair<IdArray, SegOffsets>`: |
|||
1. Find the corresponding segment offsets according to the primary key in an id array |
|||
2. The returned order is not guaranteed, but the two returned fields must correspond to each other one by one. |
|||
3. Entities without PKs will not be returned |
|||
11. `check_search(Plan)`: check if the Plan is valid |
|||
1. It mainly checks whether the columns used in the plan have been loaded |
@ -0,0 +1,25 @@ |
|||
# Segment Overview |
|||
|
|||
There are currently two types of Segments |
|||
1. Growing segment: dynamic insertion is allowed, but it cannot load an index for fast retrieval |
|||
2. Sealed segment: dynamic insertion is disabled, and loading a vector index is supported |
|||
|
|||
Both segment types share the same interface, based on `SegmentInterface`. External callers only need to care about the behavior of the following interfaces (as function declarations) and the corresponding constructors: |
|||
|
|||
1. `SegmentInterface` |
|||
2. `SegmentGrowing` & `CreateGrowingSegment` |
|||
3. `SegmentSealed` & `CreateSealedSegment` |
|||
|
|||
Other internal functions are hidden as implementation details in the following classes: |
|||
|
|||
1. `SegmentInternalInterface` |
|||
2. `SegmentGrowingImpl` |
|||
3. `SegmentSealedImpl` |
|||
|
|||
In principle, code logic reusable by both growing and sealed segments is written into `SegmentInternalInterface` as far as possible; the two implementation classes contain only the parts that differ. |
|||
|
|||
See more details about segments at: |
|||
|
|||
1. [segment_interface.md](segment_interface.md) |
|||
2. [segment_growing.md](segment_growing.md) |
|||
3. [segment_sealed.md](segment_sealed.md) |
@ -0,0 +1,40 @@ |
|||
# SegmentSealed |
|||
SegmentSealed has extra interfaces beyond SegmentInterface: |
|||
|
|||
1. `LoadIndex(loadIndexInfo)`: load the index. indexInfo contains: |
|||
1. `FieldId` |
|||
   2. `IndexParams`: index parameters in a KV structure |
|||
3. `VecIndex`: vector index |
|||
2. `LoadFieldData(loadFieldDataInfo)`: load column data, could be either scalar column or vector column |
|||
1. Note: indexes and vector data for the same column may coexist. Indexes are prioritized in the search |
|||
3. `DropIndex(fieldId)`: drop and release an existing index of a specified field |
|||
|
|||
Search is executable as long as all the columns involved in the search are loaded. |
|||
|
|||
# SegmentSealedImpl internal data definition |
|||
1. `row_count_opt_`: |
|||
1. Fill row count when loading the first entity |
|||
2. All the other columns loaded must match the same row count |
|||
3. `xxx_ready_bitset_` & `system_ready_count_` |
|||
1. Used to record whether the corresponding column is loaded. Bitset corresponds to FieldOffset |
|||
2. Query is executable if and only if all the following conditions are met: |
|||
1. system_ready_count_ == 2, which means all the system columns' RowId/Timestamp are loaded |
|||
2. The scalar columns involved in the query is loaded |
|||
3. For the vector columns involved in the query, either the original data or the index is loaded |
|||
4. `scalar_indexings_`: store scalar index |
|||
|
|||
1. Use StructuredSortedIndex in Knowhere |
|||
5. `primary_key_index_`: store index for pk column |
|||
1. Use brand new ScalarIndexBase format |
|||
2. **Note: The functions here may overlap with scalar indexes. It is recommended to replace scalar index with ScalarIndexBase** |
|||
6. `field_datas_`: store original data |
|||
1. `aligned_vector<char>` format guarantees `int/float` data are aligned |
|||
7. `SealedIndexingRecord vecindexs_`: store vector index |
|||
8. `row_ids_/timestamps_`: RowId/Timestamp data |
|||
9. `TimestampIndex`: Index for Timestamp column |
|||
10. `schema`: schema |
|||
|
|||
# SegmentSealedImpl internal function definition |
|||
1. Most functions are the implementation of the corresponding functions of the segment interface, which will not be repeated here. |
|||
2. `update_row_count`: Used to update the row_count field. |
|||
3. `mask_with_timestamps`: Use Timestamp column to update search bitmask, used to support Time Travel function. |
@ -0,0 +1,23 @@ |
|||
# Time Travel Implementation (Segment Level) |
|||
Currently, there are two paths to implement time travel: |
|||
|
|||
1. Restrict with vec_count, used in growing segment. |
|||
2. Generate bitmask and combine it with DSL calculation results. It is mainly used in the sealed segment. |
|||
|
|||
## Growing Segment Time Travel |
|||
|
|||
1. When inserting, ensure that the inserted data is in ascending time order. |
|||
2. Find the location of timestamp with binary search and record it as vec_count. |
|||
3. Call the vector_search interface; the rest is handled inside the segment, so there is no need to handle a bitmask generated by DSL (a sketch of the binary search follows). |
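A minimal Go sketch of step 2, assuming timestamps are appended in ascending order as required above:

```go
package main

import (
	"fmt"
	"sort"
)

// visibleCount returns vec_count for a growing segment: timestamps are
// appended in ascending order, so a binary search finds how many entities
// have a timestamp <= the travel timestamp.
func visibleCount(timestamps []uint64, travelTs uint64) int {
	return sort.Search(len(timestamps), func(i int) bool {
		return timestamps[i] > travelTs
	})
}

func main() {
	ts := []uint64{10, 20, 30, 40, 50}
	fmt.Println(visibleCount(ts, 35)) // 3 entities participate in the search
}
```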
|||
|
|||
## SealedSegment Time Travel |
|||
|
|||
1. During load, data is placed in a continuous memory area named chunk, with the following properties: |
|||
1. Data is divided into multiple segments. |
|||
2. Data in one segment is ordered by primary key. |
|||
3. Data between Segments is in timestamp order. That is, the timestamp of every entity in the previous segment must be less than the timestamp of the first entity in the next segment. |
|||
|
|||
2. The Algorithm for time travel is: |
|||
1. Use get_active_count interface, find the last segment containing a legal ts, and return the last element position of this segment as vec_count. |
|||
2. Calculate the bitset mask with a timestamp. Due to the above properties, all the entities of the previous segment meet the conditions, and all the subsequent segments do not meet the conditions. Only the "last segment" needs to be calculated. |
|||
   3. The calculated bitset and the DSL result are combined and sent to the vector search interface (see the sketch below). |
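The following Go sketch illustrates the idea under the stated ordering properties (batch boundaries and names are assumptions of the example): earlier batches are fully visible, later batches are fully invisible, and only the straddling batch is checked row by row.

```go
package main

import (
	"fmt"
	"sort"
)

// timeTravelMask sketches the sealed-segment algorithm above. batchBounds
// marks where each ordered batch starts; batch boundaries are in ascending
// timestamp order, so only the batch that straddles travelTs needs per-row
// checks, earlier batches are fully visible and later ones fully invisible.
func timeTravelMask(timestamps []uint64, batchBounds []int, travelTs uint64) []bool {
	mask := make([]bool, len(timestamps)) // true = filtered out by time travel

	// First batch whose first timestamp is already beyond travelTs.
	firstInvisible := sort.Search(len(batchBounds), func(i int) bool {
		return timestamps[batchBounds[i]] > travelTs
	})
	// Everything from that batch onward is invisible.
	if firstInvisible < len(batchBounds) {
		for i := batchBounds[firstInvisible]; i < len(timestamps); i++ {
			mask[i] = true
		}
	}
	// Only the straddling batch (the one just before it) needs per-row checks.
	if firstInvisible > 0 {
		start := batchBounds[firstInvisible-1]
		end := len(timestamps)
		if firstInvisible < len(batchBounds) {
			end = batchBounds[firstInvisible]
		}
		for i := start; i < end; i++ {
			mask[i] = timestamps[i] > travelTs
		}
	}
	return mask
}

func main() {
	ts := []uint64{5, 8, 7, 20, 22, 21, 40, 41} // three batches: [0,3), [3,6), [6,8)
	fmt.Println(timeTravelMask(ts, []int{0, 3, 6}, 21))
}
```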
@ -0,0 +1,15 @@ |
|||
# Visitor Pattern |
|||
The visitor pattern is used in segcore to parse and execute the execution plan. |
|||
|
|||
1. `${internal/core}/src/query/PlanNode.h` contains the physical plan nodes for vector search: |
|||
1. `FloatVectorANNS` FloatVector search execution node |
|||
2. `BinaryVectorANNS` BinaryVector search execution node |
|||
2. `${internal/core}/src/query/Expr.h` contains physical plan for scalar expression: |
|||
1. `TermExpr` support operation like `col in [1, 2, 3]` |
|||
2. `RangeExpr` support constant compare with data column like `a >= 5` `1 < b < 2` |
|||
3. `CompareExpr` support compare with different columns, like `a < b` |
|||
4. `LogicalBinaryExpr` support and/or |
|||
5. `LogicalUnaryExpr` support not |
|||
|
|||
Currently, under `${internal/core/src/query}` directory, there are the following visitors: |
|||
1. `ExecPlanNodeVistor` physical plan executor only supports ANNS node for now |
@ -0,0 +1,564 @@ |
|||
## Appendix A. Basic Components |
|||
|
|||
#### A.1 System Component |
|||
|
|||
Milvus has 9 different components, all of which can be abstracted into the basic Component interface. |
|||
|
|||
```go |
|||
type Component interface { |
|||
Init() error |
|||
Start() error |
|||
Stop() error |
|||
GetComponentStates(ctx context.Context) (*milvuspb.ComponentStates, error) |
|||
GetStatisticsChannel(ctx context.Context) (*milvuspb.StringResponse, error) |
|||
Register() error |
|||
} |
|||
``` |
|||
|
|||
- _GetComponentStates_ |
|||
|
|||
```go |
|||
|
|||
type StateCode = int |
|||
|
|||
const ( |
|||
INITIALIZING StateCode = 0 |
|||
HEALTHY StateCode = 1 |
|||
ABNORMAL StateCode = 2 |
|||
) |
|||
|
|||
type ComponentInfo struct { |
|||
NodeID UniqueID |
|||
Role string |
|||
StateCode StateCode |
|||
ExtraInfo []*commonpb.KeyValuePair |
|||
} |
|||
|
|||
type ComponentStates struct { |
|||
State *ComponentInfo |
|||
SubcomponentStates []*ComponentInfo |
|||
Status *commonpb.Status |
|||
} |
|||
|
|||
``` |
|||
|
|||
If a component needs to process timetick message to align timetick, it needs to implement the TimeTickProvider interface. |
|||
|
|||
```go |
|||
type TimeTickProvider interface { |
|||
GetTimeTickChannel(ctx context.Context) (*milvuspb.StringResponse, error) |
|||
} |
|||
``` |
|||
|
|||
#### A.2 Session |
|||
|
|||
###### ServerID |
|||
|
|||
The ID is stored in a key-value pair on etcd. The key is metaRootPath + "/session/id". The initial value is 0. When a service is registered, it is incremented by 1 and returned to the next registered service. |
|||
|
|||
###### Registration |
|||
|
|||
- Registration is achieved through etcd's lease mechanism. |
|||
|
|||
- The service creates a lease with etcd and stores a key-value pair in etcd. If the lease expires or the service goes offline, etcd will delete the key-value pair. You can judge whether this service is available through the key. |
|||
|
|||
- key: metaRoot + "/session" + "/ServerName(-ServerID)(optional)" |
|||
|
|||
- value: json format |
|||
|
|||
```json |
|||
{ |
|||
"ServerID": "ServerID", |
|||
"ServerName": "ServerName", |
|||
"Address": "ip:port", |
|||
"Exclusive": "Exclusive" |
|||
} |
|||
``` |
|||
|
|||
- By obtaining the address, you can establish a connection with other services |
|||
|
|||
- If a service is exclusive, the key will not have **ServerID**. But **ServerID** still will be stored in value. |
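A minimal Go sketch of this lease-based registration, using the etcd `clientv3` API; the key layout, TTL, and session fields shown here are illustrative assumptions based on the description above.

```go
package main

import (
	"context"
	"encoding/json"
	"time"

	clientv3 "go.etcd.io/etcd/client/v3"
)

// register creates a lease, writes the session JSON under the session key,
// and keeps the lease alive so the key disappears if the process dies.
func register(ctx context.Context, cli *clientv3.Client, key string, session map[string]interface{}) error {
	lease, err := cli.Grant(ctx, 60) // 60s TTL, mirroring DefaultTTL
	if err != nil {
		return err
	}
	value, err := json.Marshal(session)
	if err != nil {
		return err
	}
	if _, err = cli.Put(ctx, key, string(value), clientv3.WithLease(lease.ID)); err != nil {
		return err
	}
	ch, err := cli.KeepAlive(ctx, lease.ID) // renew the lease in the background
	if err != nil {
		return err
	}
	go func() {
		for range ch { // drain keep-alive responses
		}
	}()
	return nil
}

func main() {
	cli, err := clientv3.New(clientv3.Config{Endpoints: []string{"localhost:2379"}, DialTimeout: 5 * time.Second})
	if err != nil {
		panic(err)
	}
	defer cli.Close()
	session := map[string]interface{}{"ServerID": 1, "ServerName": "querynode", "Address": "127.0.0.1:21123", "Exclusive": false}
	if err := register(context.Background(), cli, "by-dev/meta/session/querynode-1", session); err != nil {
		panic(err)
	}
}
```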
|||
|
|||
###### Discovery |
|||
|
|||
- All currently available services can be obtained by obtaining all the key-value pairs deposited during registration. If you want to get all the available nodes for a certain type of service, you can pass in the prefix of the corresponding key |
|||
|
|||
- Registration order can be compared via ServerID, since ServerID increases over time. |
|||
|
|||
###### Interface |
|||
|
|||
```go |
|||
const ( |
|||
DefaultServiceRoot = "session/" |
|||
DefaultIDKey = "id" |
|||
DefaultRetryTimes = 30 |
|||
DefaultTTL = 60 |
|||
) |
|||
|
|||
// Session is a struct to store service's session, including ServerID, ServerName, |
|||
// Address. |
|||
// Exclusive indicates that this server can only start one. |
|||
type Session struct { |
|||
ctx context.Context |
|||
ServerID int64 `json:"ServerID,omitempty"` |
|||
ServerName string `json:"ServerName,omitempty"` |
|||
Address string `json:"Address,omitempty"` |
|||
Exclusive bool `json:"Exclusive,omitempty"` |
|||
} |
|||
|
|||
// NewSession is a helper to build Session object. |
|||
// ServerID, ServerName, Address, Exclusive will be assigned after registration. |
|||
// metaRoot is a path in etcd to save session information. |
|||
// etcdEndpoints is to init etcdCli when NewSession |
|||
func NewSession(ctx context.Context, metaRoot string, etcdEndpoints []string) *Session {} |
|||
|
|||
// Init will initialize base struct of the Session, including ServerName, ServerID, |
|||
// Address, Exclusive. ServerID is obtained in getServerID. |
|||
// Finally it will process keepAliveResponse to keep alive with etcd. |
|||
func (s *Session) Init(serverName, address string, exclusive bool) <-chan bool {} |
|||
|
|||
// GetSessions will get all sessions registered in etcd. |
|||
// Revision is returned for WatchServices to prevent key events from being missed. |
|||
func (s *Session) GetSessions(prefix string) (map[string]*Session, int64, error) {} |
|||
|
|||
// WatchServices watch the service's up and down in etcd, and send event to |
|||
// eventChannel. |
|||
// prefix is a parameter to know which service to watch and can be obtained in |
|||
// typeutil.type.go. |
|||
// revision is an etcd reversion to prevent missing key events and can be obtained |
|||
// in GetSessions. |
|||
// If a server up, an event will be added to channel with eventType SessionAddType. |
|||
// If a server down, an event will be added to channel with eventType SessionDelType. |
|||
func (s *Session) WatchServices(prefix string, revision int64) (eventChannel <-chan *SessionEvent) {} |
|||
``` |
|||
|
|||
#### A.3 Global Parameter Table |
|||
|
|||
```go |
|||
type BaseTable struct { |
|||
params *memkv.MemoryKV |
|||
} |
|||
|
|||
func (gp *BaseTable) Init() |
|||
func (gp *BaseTable) LoadFromKVPair(kvPairs []*commonpb.KeyValuePair) error |
|||
func (gp *BaseTable) Load(key string) (string, error) |
|||
func (gp *BaseTable) LoadRange(key, endKey string, limit int) ([]string, []string, error) |
|||
func (gp *BaseTable) LoadYaml(fileName string) error |
|||
func (gp *BaseTable) ParseFloat(key string) float64 |
|||
func (gp *BaseTable) ParseInt64(key string) int64 |
|||
func (gp *BaseTable) ParseInt32(key string) int32 |
|||
func (gp *BaseTable) ParseInt(key string) int |
|||
func (gp *BaseTable) WriteNodeIDList() []UniqueID |
|||
func (gp *BaseTable) DataNodeIDList() []UniqueID |
|||
func (gp *BaseTable) ProxyIDList() []UniqueID |
|||
func (gp *BaseTable) QueryNodeIDList() []UniqueID |
|||
``` |
|||
|
|||
- _LoadYaml(filePath string)_ turns a YAML file into multiple key-value pairs. For example, given the following YAML |
|||
|
|||
```yaml |
|||
etcd: |
|||
address: localhost |
|||
port: 2379 |
|||
rootpath: milvus/etcd |
|||
``` |
|||
|
|||
_BaseTable.LoadYaml_ will insert three key-value pairs into _params_ |
|||
|
|||
```go |
|||
"etcd.address" -> "localhost" |
|||
"etcd.port" -> "2379" |
|||
"etcd.rootpath" -> "milvus/etcd" |
|||
``` |
|||
|
|||
#### A.4 Time Ticked Flow Graph |
|||
|
|||
//TODO remove? |
|||
|
|||
###### A.4.1 Flow Graph States |
|||
|
|||
```go |
|||
type flowGraphStates struct { |
|||
startTick Timestamp |
|||
numActiveTasks map[string]int32 |
|||
numCompletedTasks map[string]int64 |
|||
} |
|||
``` |
|||
|
|||
###### A.4.2 Message |
|||
|
|||
```go |
|||
type Msg interface { |
|||
TimeTick() Timestamp |
|||
} |
|||
``` |
|||
|
|||
###### A.4.3 Node |
|||
|
|||
```go |
|||
type Node interface { |
|||
Name() string |
|||
MaxQueueLength() int32 |
|||
MaxParallelism() int32 |
|||
Operate(ctx context.Context, in []Msg) ([]Msg, context.Context) |
|||
IsInputNode() bool |
|||
Close() |
|||
} |
|||
``` |
|||
|
|||
```go |
|||
type BaseNode struct { |
|||
maxQueueLength int32 |
|||
maxParallelism int32 |
|||
} |
|||
``` |
|||
|
|||
###### A.4.4 Flow Graph |
|||
|
|||
```go |
|||
type nodeCtx struct { |
|||
node Node |
|||
inputChannels []chan Msg |
|||
inputMessages []Msg |
|||
downstream []*nodeCtx |
|||
downstreamInputChanIdx map[string]int |
|||
|
|||
NumActiveTasks int64 |
|||
NumCompletedTasks int64 |
|||
} |
|||
|
|||
func (nodeCtx *nodeCtx) Start(ctx context.Context) error |
|||
``` |
|||
|
|||
_Start()_ will enter a loop. In each iteration, it tries to collect input messages from _inputChan_, then prepares the node's input. When the input is ready, it will trigger _node.Operate_. When _node.Operate_ returns, it sends the returned _Msg_ to _outputChans_, which connects to the downstreams' _inputChans_. |
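A toy version of this loop, reusing the `Node` and `Msg` definitions above (channel wiring and error handling are simplified assumptions of the sketch):

```go
// nodeLoop is a toy version of the loop described above, reusing the Node and
// Msg definitions from A.4.3; it assumes the "context" package is imported.
func nodeLoop(ctx context.Context, node Node, inputs []chan Msg, outputs []chan Msg) {
	for {
		in := make([]Msg, 0, len(inputs))
		for _, ch := range inputs { // collect one message per input channel
			select {
			case msg := <-ch:
				in = append(in, msg)
			case <-ctx.Done():
				return
			}
		}
		out, _ := node.Operate(ctx, in) // Operate returns ([]Msg, context.Context)
		for _, msg := range out { // fan results out to the downstream inputChans
			for _, ch := range outputs {
				ch <- msg
			}
		}
	}
}
```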
|||
|
|||
```go |
|||
type TimeTickedFlowGraph struct { |
|||
ctx context.Context |
|||
nodeCtx map[NodeName]*nodeCtx |
|||
} |
|||
|
|||
func (pipeline *TimeTickedFlowGraph) AddNode(node Node) |
|||
func (pipeline *TimeTickedFlowGraph) SetEdges(nodeName string, in []string, out []string) |
|||
func (pipeline *TimeTickedFlowGraph) Start() error |
|||
func (pipeline *TimeTickedFlowGraph) Close() error |
|||
|
|||
func NewTimeTickedFlowGraph(ctx context.Context) *TimeTickedFlowGraph |
|||
``` |
|||
|
|||
#### A.5 Allocator |
|||
|
|||
```go |
|||
type Allocator struct { |
|||
Ctx context.Context |
|||
CancelFunc context.CancelFunc |
|||
|
|||
wg sync.WaitGroup |
|||
|
|||
Reqs chan Request |
|||
ToDoReqs []Request |
|||
CanDoReqs []Request |
|||
SyncReqs []Request |
|||
|
|||
TChan TickerChan |
|||
ForceSyncChan chan Request |
|||
|
|||
SyncFunc func() bool |
|||
ProcessFunc func(req Request) error |
|||
|
|||
CheckSyncFunc func(timeout bool) bool |
|||
PickCanDoFunc func() |
|||
SyncErr error |
|||
Role string |
|||
} |
|||
func (ta *Allocator) Start() error |
|||
func (ta *Allocator) Init() error |
|||
func (ta *Allocator) Close() error |
|||
func (ta *Allocator) CleanCache() error |
|||
|
|||
``` |
|||
|
|||
#### A.6 ID Allocator |
|||
|
|||
```go |
|||
type IDAllocator struct { |
|||
Allocator |
|||
|
|||
rootCoordAddress string |
|||
rootCoord types.RootCoord |
|||
|
|||
countPerRPC uint32 |
|||
|
|||
idStart UniqueID |
|||
idEnd UniqueID |
|||
|
|||
PeerID UniqueID |
|||
} |
|||
|
|||
func (ia *IDAllocator) Start() error |
|||
func (ia *IDAllocator) AllocOne() (UniqueID, error) |
|||
func (ia *IDAllocator) Alloc(count uint32) (UniqueID, UniqueID, error) |
|||
|
|||
func NewIDAllocator(ctx context.Context, masterAddr string) (*IDAllocator, error) |
|||
``` |
|||
|
|||
#### A.6 Timestamp Allocator |
|||
|
|||
###### A.6.1 Timestamp |
|||
|
|||
Let's take a brief review of the Hybrid Logical Clock (HLC). HLC uses a 64-bit timestamp which is composed of a 46-bit physical component (thought of as, and always close to, local wall time) and an 18-bit logical component (used to distinguish between events with the same physical component). |
|||
|
|||
<img src="./figs/hlc.png" width=400> |
|||
|
|||
HLC's logical part is advanced on each request. The physical part can be increased in two cases: |
|||
|
|||
A. when the local wall time is greater than HLC's physical part, |
|||
|
|||
B. or the logical part overflows. |
|||
|
|||
In either case, the physical part will be updated, and the logical part will be set to 0. |
|||
|
|||
Keeping the physical part close to local wall time may face non-monotonic problems such as updates to POSIX time that could turn time backward. HLC avoids such problems, since if 'local wall time < HLC's physical part' holds, only case B is satisfied, thus monotonicity is guaranteed. |
|||
|
|||
Milvus does not support transactions, but it should guarantee the deterministic execution of the multi-way WAL. The timestamp attached to each request should |
|||
|
|||
- have its physical part close to wall time (has an acceptable bounded error, a.k.a. uncertainty interval in transaction scenarios), |
|||
- and be globally unique. |
|||
|
|||
HLC leverages physical clocks at nodes that are synchronized using the NTP. NTP usually maintains time to within tens of milliseconds over local networks in the datacenter. Asymmetric routes and network congestion occasionally cause errors of hundreds of milliseconds. Both the normal time error and the spike are acceptable for Milvus use cases. |
|||
|
|||
The interface of Timestamp is as follows. |
|||
|
|||
```go |
|||
type timestamp struct { |
|||
physical uint64 // 18-63 bits |
|||
logical uint64 // 0-17 bits |
|||
} |
|||
|
|||
type Timestamp uint64 |
|||
``` |
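As a sketch of this packing (not the actual Milvus utility function), assuming the physical part is a millisecond wall-clock value:

```go
package main

import "fmt"

const logicalBits = 18 // logical part occupies bits 0-17, physical part bits 18-63

// composeTS packs a physical timestamp (assumed to be milliseconds) and a
// logical counter into a single 64-bit hybrid timestamp, matching the layout above.
func composeTS(physicalMs uint64, logical uint64) uint64 {
	return physicalMs<<logicalBits | (logical & (1<<logicalBits - 1))
}

// parseTS splits a hybrid timestamp back into its physical and logical parts.
func parseTS(ts uint64) (physicalMs uint64, logical uint64) {
	return ts >> logicalBits, ts & (1<<logicalBits - 1)
}

func main() {
	ts := composeTS(1700000000000, 7)
	p, l := parseTS(ts)
	fmt.Println(p, l) // 1700000000000 7
}
```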
|||
|
|||
###### A.6.2 Timestamp Oracle |
|||
|
|||
```go |
|||
type timestampOracle struct { |
|||
key string |
|||
txnkv kv.TxnKV |
|||
|
|||
saveInterval time.Duration |
|||
maxResetTSGap func() time.Duration |
|||
|
|||
TSO unsafe.Pointer |
|||
lastSavedTime atomic.Value |
|||
} |
|||
|
|||
func (t *timestampOracle) InitTimestamp() error |
|||
func (t *timestampOracle) ResetUserTimestamp(tso uint64) error |
|||
func (t *timestampOracle) UpdateTimestamp() error |
|||
func (t *timestampOracle) ResetTimestamp() |
|||
``` |
|||
|
|||
###### A.6.3 Timestamp Allocator |
|||
|
|||
```go |
|||
type TimestampAllocator struct { |
|||
Allocator |
|||
|
|||
rootCoordAddress string |
|||
rootCoordClient types.RootCoord |
|||
|
|||
countPerRPC uint32 |
|||
lastTsBegin Timestamp |
|||
lastTsEnd Timestamp |
|||
PeerID UniqueID |
|||
} |
|||
|
|||
func (ta *TimestampAllocator) Start() error |
|||
func (ta *TimestampAllocator) AllocOne() (UniqueID, error) |
|||
func (ta *TimestampAllocator) Alloc(count uint32) (UniqueID, UniqueID, error) |
|||
func (ta *TimestampAllocator) ClearCache() |
|||
|
|||
func NewTimestampAllocator(ctx context.Context, masterAddr string) (*TimestampAllocator, error) |
|||
``` |
|||
|
|||
- Batch Allocation of Timestamps |
|||
|
|||
- Expiration of Timestamps |
|||
|
|||
#### A.7 KV |
|||
|
|||
###### A.7.1 KV Base |
|||
|
|||
```go |
|||
type BaseKV interface { |
|||
Load(key string) (string, error) |
|||
MultiLoad(keys []string) ([]string, error) |
|||
LoadWithPrefix(key string) ([]string, []string, error) |
|||
Save(key, value string) error |
|||
MultiSave(kvs map[string]string) error |
|||
Remove(key string) error |
|||
MultiRemove(keys []string) error |
|||
RemoveWithPrefix(key string) error |
|||
|
|||
Close() |
|||
} |
|||
``` |
|||
|
|||
###### A.7.2 Txn Base |
|||
|
|||
```go |
|||
type TxnKV interface { |
|||
BaseKV |
|||
|
|||
MultiSaveAndRemove(saves map[string]string, removals []string) error |
|||
MultiRemoveWithPrefix(keys []string) error |
|||
MultiSaveAndRemoveWithPrefix(saves map[string]string, removals []string) error |
|||
} |
|||
``` |
|||
|
|||
###### A.7.3 MetaKv |
|||
|
|||
```go |
|||
// MetaKv is TxnKV for meta data. It should save data with lease. |
|||
type MetaKv interface { |
|||
TxnKV |
|||
GetPath(key string) string |
|||
LoadWithPrefix(key string) ([]string, []string, error) |
|||
CompareVersionAndSwap(key string, version int64, target string) error |
|||
WalkWithPrefix(prefix string, paginationSize int, fn func([]byte, []byte) error) error |
|||
} |
|||
|
|||
``` |
|||
|
|||
###### A.7.4 WatchKV |
|||
|
|||
```go |
|||
// WatchKV is watchable MetaKv. |
|||
// |
|||
//go:generate mockery --name=WatchKv --with-expecter |
|||
type WatchKV interface { |
|||
MetaKv |
|||
Watch(key string) clientv3.WatchChan |
|||
WatchWithPrefix(key string) clientv3.WatchChan |
|||
WatchWithRevision(key string, revision int64) clientv3.WatchChan |
|||
} |
|||
|
|||
``` |
|||
|
|||
###### A.7.5 SnapShotKv |
|||
|
|||
```go |
|||
// SnapShotKV is TxnKV for snapshot data. It must save timestamp. |
|||
type SnapShotKV interface { |
|||
Save(key string, value string, ts typeutil.Timestamp) error |
|||
Load(key string, ts typeutil.Timestamp) (string, error) |
|||
MultiSave(kvs map[string]string, ts typeutil.Timestamp, additions ...func(ts typeutil.Timestamp) (string, string, error)) error |
|||
LoadWithPrefix(key string, ts typeutil.Timestamp) ([]string, []string, error) |
|||
MultiSaveAndRemoveWithPrefix(saves map[string]string, removals []string, ts typeutil.Timestamp, additions ...func(ts typeutil.Timestamp) (string, string, error)) error |
|||
} |
|||
``` |
|||
|
|||
###### A.7.6 Etcd KV |
|||
|
|||
```go |
|||
type etcdKV struct { |
|||
client *clientv3.Client |
|||
rootPath string |
|||
} |
|||
|
|||
func (kv *etcdKV) Close() |
|||
func (kv *etcdKV) GetPath(key string) string |
|||
func (kv *etcdKV) LoadWithPrefix(key string) ([]string, []string, error) |
|||
func (kv *etcdKV) Load(key string) (string, error) |
|||
func (kv *etcdKV) GetCount(key string) (int64, error) |
|||
func (kv *etcdKV) MultiLoad(keys []string) ([]string, error) |
|||
func (kv *etcdKV) Save(key, value string) error |
|||
func (kv *etcdKV) MultiSave(kvs map[string]string) error |
|||
func (kv *etcdKV) RemoveWithPrefix(prefix string) error |
|||
func (kv *etcdKV) Remove(key string) error |
|||
func (kv *etcdKV) MultiRemove(keys []string) error |
|||
func (kv *etcdKV) MultiSaveAndRemove(saves map[string]string, removals []string) error |
|||
func (kv *etcdKV) Watch(key string) clientv3.WatchChan |
|||
func (kv *etcdKV) WatchWithPrefix(key string) clientv3.WatchChan |
|||
func (kv *etcdKV) WatchWithRevision(key string, revision int64) clientv3.WatchChan |
|||
|
|||
func NewEtcdKV(etcdAddr string, rootPath string) *etcdKV |
|||
``` |
|||
|
|||
etcdKV implements all _TxnKV_ interfaces. |
|||
|
|||
###### A.7.7 Memory KV |
|||
|
|||
```go |
|||
type MemoryKV struct { |
|||
sync.RWMutex |
|||
tree *btree.BTree |
|||
} |
|||
|
|||
func (s memoryKVItem) Less(than btree.Item) bool |
|||
func (kv *MemoryKV) Load(key string) (string, error) |
|||
func (kv *MemoryKV) LoadRange(key, endKey string, limit int) ([]string, []string, error) |
|||
func (kv *MemoryKV) Save(key, value string) error |
|||
func (kv *MemoryKV) Remove(key string) error |
|||
func (kv *MemoryKV) MultiLoad(keys []string) ([]string, error) |
|||
func (kv *MemoryKV) MultiSave(kvs map[string]string) error |
|||
func (kv *MemoryKV) MultiRemove(keys []string) error |
|||
func (kv *MemoryKV) MultiSaveAndRemove(saves map[string]string, removals []string) error |
|||
func (kv *MemoryKV) LoadWithPrefix(key string) ([]string, []string, error) |
|||
func (kv *MemoryKV) Close() |
|||
func (kv *MemoryKV) MultiRemoveWithPrefix(keys []string) error |
|||
func (kv *MemoryKV) MultiSaveAndRemoveWithPrefix(saves map[string]string, removals []string) error |
|||
``` |
|||
|
|||
MemoryKV implements all _TxnKV_ interfaces. |
|||
|
|||
###### A.7.8 MinIO KV |
|||
|
|||
```go |
|||
type MinIOKV struct { |
|||
ctx context.Context |
|||
minioClient *minio.Client |
|||
bucketName string |
|||
} |
|||
|
|||
func (kv *MinIOKV) LoadWithPrefix(key string) ([]string, []string, error) |
|||
func (kv *MinIOKV) Load(key string) (string, error) |
|||
func (kv *MinIOKV) MultiLoad(keys []string) ([]string, error) |
|||
func (kv *MinIOKV) Save(key, value string) error |
|||
func (kv *MinIOKV) MultiSave(kvs map[string]string) error |
|||
func (kv *MinIOKV) RemoveWithPrefix(key string) error |
|||
func (kv *MinIOKV) Remove(key string) error |
|||
func (kv *MinIOKV) MultiRemove(keys []string) error |
|||
func (kv *MinIOKV) Close() |
|||
``` |
|||
|
|||
MinIOKV implements all _KV_ interfaces. |
|||
|
|||
###### A.7.9 RocksdbKV KV |
|||
|
|||
```go |
|||
type RocksdbKV struct { |
|||
opts *gorocksdb.Options |
|||
db *gorocksdb.DB |
|||
writeOptions *gorocksdb.WriteOptions |
|||
readOptions *gorocksdb.ReadOptions |
|||
name string |
|||
} |
|||
|
|||
func (kv *RocksdbKV) Close() |
|||
func (kv *RocksdbKV) GetName() string |
|||
func (kv *RocksdbKV) Load(key string) (string, error) |
|||
func (kv *RocksdbKV) LoadWithPrefix(key string) ([]string, []string, error) |
|||
func (kv *RocksdbKV) MultiLoad(keys []string) ([]string, error) |
|||
func (kv *RocksdbKV) Save(key, value string) error |
|||
func (kv *RocksdbKV) MultiSave(kvs map[string]string) error |
|||
func (kv *RocksdbKV) RemoveWithPrefix(key string) error |
|||
func (kv *RocksdbKV) Remove(key string) error |
|||
func (kv *RocksdbKV) MultiRemove(keys []string) error |
|||
func (kv *RocksdbKV) MultiSaveAndRemove(saves map[string]string, removals []string) error |
|||
func (kv *RocksdbKV) MultiRemoveWithPrefix(keys []string) error |
|||
func (kv *RocksdbKV) MultiSaveAndRemoveWithPrefix(saves map[string]string, removals []string) error |
|||
``` |
|||
|
|||
RocksdbKV implements all _TxnKV_ interfaces. |
@ -0,0 +1,964 @@ |
|||
## Appendix B. API Reference |
|||
|
|||
In this section, we introduce the RPCs of milvus service. A brief description of the RPCs is listed as follows. |
|||
|
|||
| RPC | description | |
|||
| :---------------------- | --------------------------------------------------------------------------------------------- | |
|||
| CreateCollection | create a collection based on schema statement | |
|||
| DropCollection | drop a collection | |
|||
| HasCollection | check whether a collection exists | |
|||
| LoadCollection | load collection to memory for future search | |
|||
| ReleaseCollection | release the collection from memory | |
|||
| DescribeCollection | show a collection's schema and its descriptive statistics | |
|||
| GetCollectionStatistics | show a collection's statistics | |
|||
| ShowCollections | list all collections | |
|||
| CreatePartition | create a partition | |
|||
| DropPartition | drop a partition | |
|||
| HasPartition | check whether a partition exists | |
|||
| LoadPartition | load partition to memory for future search | |
|||
| ReleasePartitions | release partitions from memory | |
|||
| GetPartitionStatistics | show a partition's statistics | |
|||
| ShowPartitions | list a collection's all partitions | |
|||
| CreateIndex | create an index for a field in the collection | |
|||
| DescribeIndex | get index details for a field in the collection | |
|||
| GetIndexStates | get build index state | |
|||
| DropIndex | drop a specific index for a field in the collection | |
|||
| Insert | insert a batch of rows into a collection or a partition | |
|||
| Search | query the columns of a collection or a partition with ANNS statements and boolean expressions | |
|||
| Flush | Perform persistent storage of data in memory | |
|||
|
|||
**MsgBase** is a base struct in each request. |
|||
|
|||
```protobuf |
|||
message MsgBase { |
|||
MsgType msg_type = 1; |
|||
int64 msgID = 2; |
|||
uint64 timestamp = 3; |
|||
int64 sourceID = 4; |
|||
} |
|||
``` |
|||
|
|||
**MsgType** is the enum to distinguish different message types in the message queue, such as insert msg, search msg, etc. **msgID** is the unique identifier of the message. **timestamp** is the time when this message was generated. **sourceID** is the unique identifier of the source. |
|||
|
|||
#### 3.1 Definition Requests |
|||
|
|||
###### 3.1.1 CreateCollection |
|||
|
|||
**Interface:** |
|||
|
|||
``` |
|||
rpc CreateCollection(CreateCollectionRequest) returns (common.Status){} |
|||
``` |
|||
|
|||
**Description:** |
|||
|
|||
Create a collection through CreateCollectionRequest. |
|||
|
|||
**Parameters:** |
|||
|
|||
- **CreateCollectionRequest** |
|||
|
|||
CreateCollectionRequest struct is shown as follows: |
|||
|
|||
```protobuf |
|||
message CreateCollectionRequest { |
|||
common.MsgBase base = 1; |
|||
string db_name = 2; |
|||
string collection_name = 3; |
|||
// `schema` is the serialized `schema.CollectionSchema` |
|||
bytes schema = 4; |
|||
} |
|||
|
|||
message CollectionSchema { |
|||
string name = 1; |
|||
string description = 2; |
|||
bool autoID = 3; |
|||
repeated FieldSchema fields = 4; |
|||
} |
|||
``` |
|||
|
|||
CreateCollectionRequest contains **MsgBase**, **db_name**, **collection_name**, and the serialized collection schema **schema**. **db_name** and **collection_name** are plain strings; a collection with the specified **collection_name** is going to be created. |
|||
|
|||
Collection schema contains all the base information of a collection including **collection name**, **description**, **autoID** and **fields**. Collection description is defined by the database manager to describe the collection. **autoID** determines whether the ID of each row of data is user-defined. If **autoID** is true, our system will generate a unique ID for each data. If **autoID** is false, users need to give each entity an ID when inserting. |
|||
|
|||
**Fields** is a list of **FieldSchema**. Each schema should include Field **name**, **description**, **dataType**, **type_params** and **index_params**. |
|||
|
|||
FieldSchema struct is shown as follows: |
|||
|
|||
```protobuf |
|||
message FieldSchema { |
|||
int64 fieldID = 1; |
|||
string name = 2; |
|||
bool is_primary_key = 3; |
|||
string description = 4; |
|||
DataType data_type = 5; |
|||
repeated common.KeyValuePair type_params = 6; |
|||
repeated common.KeyValuePair index_params = 7; |
|||
} |
|||
``` |
|||
|
|||
**Field schema** contains all the base information of a field, including **fieldID**, **name**, **description**, **data_type**, **type_params** and **index_params**. **data_type** is an enum type to distinguish different data types. The complete enum is listed at the end of this document. |
|||
|
|||
**type_params** contains the detailed information of data_type. For example, vector data type should include dimension information. You can give a pair of <dim, 8> to let the field store an 8-dimension vector. |
|||
|
|||
**index_params**: For fast search, you can build an index for a field by specifying detailed index information here. Detailed information about indexes can be seen in chapter 2.2.3. |
|||
|
|||
**Returns:** |
|||
|
|||
- **common.Status** |
|||
|
|||
```protobuf |
|||
message Status { |
|||
ErrorCode error_code = 1; |
|||
string reason = 2; |
|||
} |
|||
``` |
|||
|
|||
**Status** represents the server error code. It doesn't contain grpc error but contains the server error code. We can get the executing result in common status. **error_code** is an enum type to distinguish the executing error type. The total Errorcode is shown in the last of this code. And the **reason** field is a string to describe the detailed error. |
|||
|
|||
###### 3.1.2 DropCollection |
|||
|
|||
**Interface:** |
|||
|
|||
``` |
|||
rpc DropCollection(DropCollectionRequest) returns (common.Status) {} |
|||
``` |
|||
|
|||
**Description:** |
|||
|
|||
This method is used to delete collection. |
|||
|
|||
**Parameters:** |
|||
|
|||
- **DropCollectionRequest** |
|||
|
|||
DropCollectionRequest struct is shown as follows: |
|||
|
|||
```protobuf |
|||
message DropCollectionRequest { |
|||
common.MsgBase base = 1; |
|||
string db_name = 2; |
|||
string collection_name = 3; |
|||
} |
|||
``` |
|||
|
|||
Collection with the same **collection_name** is going to be deleted. |
|||
|
|||
**Returns:** |
|||
|
|||
- **common.Status** |
|||
|
|||
```protobuf |
|||
message Status { |
|||
ErrorCode error_code = 1; |
|||
string reason = 2; |
|||
} |
|||
``` |
|||
|
|||
**Status** represents the server error code. It doesn't contain grpc error but the server error code. We can get the executing result in common status. **error_code** is an enum type to distinguish the executing error type. The total Errorcode is shown in the last of this code. And the **reason** field is a string to describe the detailed error. |
|||
|
|||
###### 3.1.3 HasCollection |
|||
|
|||
**Interface:** |
|||
|
|||
``` |
|||
rpc HasCollection(HasCollectionRequest) returns (BoolResponse) {} |
|||
``` |
|||
|
|||
**Description:** |
|||
|
|||
This method is used to test collection existence. |
|||
|
|||
**Parameters:** |
|||
|
|||
- **HasCollectionRequest** |
|||
|
|||
HasCollectionRequest struct is shown as follows: |
|||
|
|||
```protobuf |
|||
message HasCollectionRequest { |
|||
common.MsgBase base = 1; |
|||
string db_name = 2; |
|||
string collection_name = 3; |
|||
} |
|||
``` |
|||
|
|||
The server finds the collection through **collection_name** and checks whether the collection exists. |
|||
|
|||
**Returns:** |
|||
|
|||
- **BoolResponse** |
|||
|
|||
```protobuf |
|||
message BoolResponse { |
|||
common.Status status = 1; |
|||
bool value = 2; |
|||
} |
|||
``` |
|||
|
|||
**status** represents the server error code. It doesn't contain grpc error but contains the server error code. We can get the executing result in common status. **error_code** is an enum type to distinguish the executing error type. The total Errorcode is shown in the last of this code. And the **reason** field is a string to describe the detailed error. |
|||
|
|||
**value** represents whether the collection exists. If collection exists, value will be true. If collection doesn't exist, value will be false. |
|||
|
|||
###### 3.1.4 LoadCollection |
|||
|
|||
**Interface:** |
|||
|
|||
``` |
|||
rpc LoadCollection(LoadCollectionRequest) returns (common.Status) {} |
|||
``` |
|||
|
|||
**Description:** |
|||
|
|||
This method is used to load collection. |
|||
|
|||
**Parameters:** |
|||
|
|||
- **LoadCollectionRequest** |
|||
|
|||
LoadCollectionRequest struct is shown as follows: |
|||
|
|||
```protobuf |
|||
message LoadCollectionRequest { |
|||
common.MsgBase base = 1; |
|||
string db_name = 2; |
|||
string collection_name = 3; |
|||
} |
|||
``` |
|||
|
|||
Collection with the same **collection_name** is going to be loaded to memory. |
|||
|
|||
**Returns:** |
|||
|
|||
- **common.Status** |
|||
|
|||
```protobuf |
|||
message Status { |
|||
ErrorCode error_code = 1; |
|||
string reason = 2; |
|||
} |
|||
``` |
|||
|
|||
**Status** represents the server error code. It doesn't contain grpc error but contains the server error code. We can get the executing result in common status. **error_code** is an enum type to distinguish the executing error type. The total Errorcode is shown in the last of this code. And the **reason** field is a string to describe the detailed error. |
|||
|
|||
###### 3.1.5 ReleaseCollection |
|||
|
|||
**Interface:** |
|||
|
|||
``` |
|||
rpc ReleaseCollection(ReleaseCollectionRequest) returns (common.Status) {} |
|||
``` |
|||
|
|||
**Description:** |
|||
|
|||
This method is used to release collection. |
|||
|
|||
**Parameters:** |
|||
|
|||
- **ReleaseCollectionRequest** |
|||
|
|||
ReleaseCollectionRequest struct is shown as follows: |
|||
|
|||
```protobuf |
|||
message ReleaseCollectionRequest { |
|||
common.MsgBase base = 1; |
|||
string db_name = 2; |
|||
string collection_name = 3; |
|||
} |
|||
``` |
|||
|
|||
Collection with the same **collection_name** is going to be released from memory. |
|||
|
|||
**Returns:** |
|||
|
|||
- **common.Status** |
|||
|
|||
```protobuf |
|||
message Status { |
|||
ErrorCode error_code = 1; |
|||
string reason = 2; |
|||
} |
|||
``` |
|||
|
|||
**Status** represents the server error code. It doesn't contain grpc error but contains the server error code. We can get the executing result in common status. **error_code** is an enum type to distinguish the executing error type. The total Errorcode is shown in the last of this code. And the **reason** field is a string to describe the detailed error. |
|||
|
|||
###### 3.1.6 DescribeCollection |
|||
|
|||
**Interface:** |
|||
|
|||
``` |
|||
rpc DescribeCollection(DescribeCollectionRequest) returns (CollectionDescription) {} |
|||
``` |
|||
|
|||
**Description:** |
|||
|
|||
This method is used to get collection schema. |
|||
|
|||
**Parameters:** |
|||
|
|||
- **DescribeCollectionRequest** |
|||
|
|||
DescribeCollectionRequest struct is shown as follows: |
|||
|
|||
```protobuf |
|||
message DescribeCollectionRequest { |
|||
common.MsgBase base = 1; |
|||
string db_name = 2; |
|||
string collection_name = 3; |
|||
int64 collectionID = 4; |
|||
} |
|||
``` |
|||
|
|||
The server finds the collection through **collection_name** and gets detailed collection information. And **collectionID** is for internal component to get collection details. |
|||
|
|||
**Returns:** |
|||
|
|||
- **DescribeCollectionResponse** |
|||
|
|||
```protobuf |
|||
message DescribeCollectionResponse { |
|||
common.Status status = 1; |
|||
schema.CollectionSchema schema = 2; |
|||
int64 collectionID = 3; |
|||
} |
|||
``` |
|||
|
|||
**status** represents the server error code. It doesn't contain grpc error but contains the server error code. We can get the executing result in common status. **error_code** is an enum type to distinguish the executing error type. The total Errorcode is shown in the last of this code. And the **reason** field is a string to describe the detailed error. |
|||
|
|||
**schema** is collection schema same as the collection schema in [CreateCollection](#311-createcollection). |
|||
|
|||
###### 3.1.7 GetCollectionStatistics |
|||
|
|||
**Interface:** |
|||
|
|||
``` |
|||
rpc GetCollectionStatistics(GetCollectionStatisticsRequest) returns (GetCollectionStatisticsResponse) {} |
|||
``` |
|||
|
|||
**Description:** |
|||
|
|||
This method is used to get collection statistics. |
|||
|
|||
**Parameters:** |
|||
|
|||
- **GetCollectionStatisticsRequest** |
|||
|
|||
GetCollectionStatisticsRequest struct is shown as follows: |
|||
|
|||
```protobuf |
|||
message GetCollectionStatisticsRequest { |
|||
common.MsgBase base = 1; |
|||
string db_name = 2; |
|||
string collection_name = 3; |
|||
} |
|||
``` |
|||
|
|||
The server finds the collection through **collection_name** and gets detailed collection statistics. |
|||
|
|||
**Returns:** |
|||
|
|||
- **GetCollectionStatisticsResponse** |
|||
|
|||
```protobuf |
|||
message GetCollectionStatisticsResponse { |
|||
common.Status status = 1; |
|||
repeated common.KeyValuePair stats = 2; |
|||
} |
|||
``` |
|||
|
|||
**status** represents the server error code. It doesn't contain grpc error but contains the server error code. We can get the executing result in common status. **error_code** is an enum type to distinguish the executing error type. The total Errorcode is shown in the last of this code. And the **reason** field is a string to describe the error details. |
|||
|
|||
**stats** is a list of key-value pairs holding various statistics. For example, the row count of a collection is returned under the key 'row_count'.
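For illustration, the Go sketch below (a hypothetical helper, with a local KeyValuePair type mirroring common.KeyValuePair from this document) shows how a client could read 'row_count' out of **stats**:

```go
package main

import "fmt"

// KeyValuePair mirrors common.KeyValuePair (string key, string value) from this document.
type KeyValuePair struct {
	Key   string
	Value string
}

// statsValue looks up one statistic, e.g. "row_count", in the stats list of
// GetCollectionStatisticsResponse / GetPartitionStatisticsResponse.
func statsValue(stats []*KeyValuePair, key string) (string, bool) {
	for _, kv := range stats {
		if kv.Key == key {
			return kv.Value, true
		}
	}
	return "", false
}

func main() {
	stats := []*KeyValuePair{{Key: "row_count", Value: "10000"}}
	if v, ok := statsValue(stats, "row_count"); ok {
		fmt.Println("row_count =", v) // prints: row_count = 10000
	}
}
```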
|||
|
|||
###### 3.1.8 ShowCollections |
|||
|
|||
**Interface:** |
|||
|
|||
``` |
|||
rpc ShowCollections(ShowCollectionsRequest) returns (ShowCollectionsResponse) {} |
|||
``` |
|||
|
|||
**Description:** |
|||
|
|||
This method is used to list the names of all collections.
|||
|
|||
**Parameters:** None |
|||
|
|||
**Returns:** |
|||
|
|||
- **ShowCollectionsResponse** |
|||
|
|||
```protobuf |
|||
message ShowCollectionsResponse { |
|||
common.Status status = 1; |
|||
repeated string collection_names = 2; |
|||
} |
|||
``` |
|||
|
|||
**status** represents the server error code. It does not include gRPC errors; it carries only the server-side error code, from which the execution result can be determined. **error_code** is an enum that distinguishes the type of error (the full ErrorCode enum is listed in Appendix D), and **reason** is a string describing the error in detail.
|||
|
|||
**collection_names** is a list containing the names of all collections.
|||
|
|||
###### 3.1.9 CreatePartition |
|||
|
|||
**Interface:** |
|||
|
|||
``` |
|||
rpc CreatePartition(CreatePartitionRequest) returns (common.Status) {} |
|||
``` |
|||
|
|||
**Description:** |
|||
|
|||
This method is used to create a partition.
|||
|
|||
**Parameters:** |
|||
|
|||
- **CreatePartitionRequest** |
|||
|
|||
CreatePartitionRequest struct is shown as follows: |
|||
|
|||
```protobuf |
|||
message CreatePartitionRequest { |
|||
common.MsgBase base = 1; |
|||
string db_name = 2; |
|||
string collection_name = 3; |
|||
string partition_name = 4; |
|||
} |
|||
``` |
|||
|
|||
The server creates a partition named **partition_name** in the collection named **collection_name**.
|||
|
|||
**Returns:**
|||
|
|||
- **common.Status** |
|||
|
|||
```protobuf |
|||
message Status { |
|||
ErrorCode error_code = 1; |
|||
string reason = 2; |
|||
} |
|||
``` |
|||
|
|||
**Status** represents the server error code. It does not include gRPC errors; it carries only the server-side error code, from which the execution result can be determined. **error_code** is an enum that distinguishes the type of error (the full ErrorCode enum is listed in Appendix D), and **reason** is a string describing the error in detail.
|||
|
|||
###### 3.1.10 DropPartition |
|||
|
|||
**Interface:** |
|||
|
|||
``` |
|||
rpc DropPartition(DropPartitionRequest) returns (common.Status) {} |
|||
``` |
|||
|
|||
**Description:** |
|||
|
|||
This method is used to drop partition. |
|||
|
|||
**Parameters:** |
|||
|
|||
- **DropPartitionRequest** |
|||
|
|||
DropPartitionRequest struct is shown as follows: |
|||
|
|||
```protobuf |
|||
message DropPartitionRequest { |
|||
common.MsgBase base = 1; |
|||
string db_name = 2; |
|||
string collection_name = 3; |
|||
string partition_name = 4; |
|||
} |
|||
``` |
|||
|
|||
The partition named **partition_name** in the collection **collection_name** will be dropped.
|||
|
|||
**Returns:** |
|||
|
|||
- **common.Status** |
|||
|
|||
```protobuf |
|||
message Status { |
|||
ErrorCode error_code = 1; |
|||
string reason = 2; |
|||
} |
|||
``` |
|||
|
|||
**Status** represents the server error code. It does not include gRPC errors; it carries only the server-side error code, from which the execution result can be determined. **error_code** is an enum that distinguishes the type of error (the full ErrorCode enum is listed in Appendix D), and **reason** is a string describing the error in detail.
|||
|
|||
###### 3.1.11 HasPartition |
|||
|
|||
**Interface:** |
|||
|
|||
``` |
|||
rpc HasPartition(HasPartitionRequest) returns (BoolResponse) {} |
|||
``` |
|||
|
|||
**Description:** |
|||
|
|||
This method is used to test partition existence. |
|||
|
|||
**Parameters:** |
|||
|
|||
- **HasPartitionRequest** |
|||
|
|||
HasPartitionRequest struct is shown as follows: |
|||
|
|||
```protobuf |
|||
message HasPartitionRequest { |
|||
common.MsgBase base = 1; |
|||
string db_name = 2; |
|||
string collection_name = 3; |
|||
string partition_name = 4; |
|||
} |
|||
``` |
|||
|
|||
The server checks whether a partition named **partition_name** exists in the collection **collection_name**.
|||
|
|||
**Returns:** |
|||
|
|||
- **BoolResponse** |
|||
|
|||
```protobuf |
|||
message BoolResponse { |
|||
common.Status status = 1; |
|||
bool value = 2; |
|||
} |
|||
``` |
|||
|
|||
**status** represents the server error code. It does not include gRPC errors; it carries only the server-side error code, from which the execution result can be determined. **error_code** is an enum that distinguishes the type of error (the full ErrorCode enum is listed in Appendix D), and **reason** is a string describing the error in detail.
|||
|
|||
**value** indicates whether the partition exists: true if it does, false otherwise.
|||
|
|||
###### 3.1.12 LoadPartitions |
|||
|
|||
**Interface:** |
|||
|
|||
``` |
|||
rpc LoadPartitions(LoadPartitionsRequest) returns (common.Status) {} |
|||
``` |
|||
|
|||
**Description:** |
|||
|
|||
This method is used to load partitions into memory.
|||
|
|||
**Parameters:** |
|||
|
|||
- **LoadPartitionsRequest** |
|||
|
|||
LoadPartitionsRequest struct is shown as follows: |
|||
|
|||
```protobuf |
|||
message LoadPartitionsRequest { |
|||
common.MsgBase base = 1; |
|||
string db_name = 2; |
|||
string collection_name = 3; |
|||
repeated string partition_names = 4; |
|||
} |
|||
``` |
|||
|
|||
**partition_names** is a list of partition names. The listed partitions of the collection **collection_name** will be loaded into memory.
|||
|
|||
**Returns:** |
|||
|
|||
- **common.Status** |
|||
|
|||
```protobuf |
|||
message Status { |
|||
ErrorCode error_code = 1; |
|||
string reason = 2; |
|||
} |
|||
``` |
|||
|
|||
**Status** represents the server error code. It does not include gRPC errors; it carries only the server-side error code, from which the execution result can be determined. **error_code** is an enum that distinguishes the type of error (the full ErrorCode enum is listed in Appendix D), and **reason** is a string describing the error in detail.
|||
|
|||
###### 3.1.13 ReleasePartitions |
|||
|
|||
**Interface:** |
|||
|
|||
``` |
|||
rpc ReleasePartitions(ReleasePartitionsRequest) returns (common.Status) {} |
|||
``` |
|||
|
|||
**Description:** |
|||
|
|||
This method is used to release partitions from memory.
|||
|
|||
**Parameters:** |
|||
|
|||
- **ReleasePartitionsRequest** |
|||
|
|||
ReleasePartitionsRequest struct is shown as follows: |
|||
|
|||
```protobuf |
|||
message ReleasePartitionsRequest { |
|||
common.MsgBase base = 1; |
|||
string db_name = 2; |
|||
string collection_name = 3; |
|||
repeated string partition_names = 4; |
|||
} |
|||
``` |
|||
|
|||
**partition_names** is a list of partition names. The listed partitions of the collection **collection_name** will be released from memory.
|||
|
|||
**Returns:** |
|||
|
|||
- **common.Status** |
|||
|
|||
```protobuf |
|||
message Status { |
|||
ErrorCode error_code = 1; |
|||
string reason = 2; |
|||
} |
|||
``` |
|||
|
|||
**Status** represents the server error code. It does not include gRPC errors; it carries only the server-side error code, from which the execution result can be determined. **error_code** is an enum that distinguishes the type of error (the full ErrorCode enum is listed in Appendix D), and **reason** is a string describing the error in detail.
|||
|
|||
###### 3.1.14 GetPartitionStatistics |
|||
|
|||
**Interface:** |
|||
|
|||
``` |
|||
rpc GetPartitionStatistics(GetPartitionStatisticsRequest) returns (GetPartitionStatisticsResponse) {} |
|||
``` |
|||
|
|||
**Description:** |
|||
|
|||
This method is used to get partition statistics. |
|||
|
|||
**Parameters:** |
|||
|
|||
- **GetPartitionStatisticsRequest** |
|||
|
|||
GetPartitionStatisticsRequest struct is shown as follows: |
|||
|
|||
```protobuf |
|||
message GetPartitionStatisticsRequest { |
|||
common.MsgBase base = 1; |
|||
string db_name = 2; |
|||
string collection_name = 3; |
|||
string partition_name = 4; |
|||
} |
|||
``` |
|||
|
|||
The server finds the partition **partition_name** in the collection **collection_name** and returns its statistics.
|||
|
|||
**Returns:** |
|||
|
|||
- **GetPartitionStatisticsResponse**
|||
|
|||
```protobuf |
|||
message GetPartitionStatisticsResponse { |
|||
common.Status status = 1; |
|||
repeated common.KeyValuePair stats = 2; |
|||
} |
|||
``` |
|||
|
|||
**status** represents the server error code. It does not include gRPC errors; it carries only the server-side error code, from which the execution result can be determined. **error_code** is an enum that distinguishes the type of error (the full ErrorCode enum is listed in Appendix D), and **reason** is a string describing the error in detail.
|||
|
|||
**stats** is a list of key-value pairs holding various statistics. For example, the row count of a partition is returned under the key 'row_count'.
|||
|
|||
###### 3.1.15 ShowPartitions |
|||
|
|||
**Interface:** |
|||
|
|||
``` |
|||
rpc ShowPartitions(ShowPartitionsRequest) returns (ShowPartitionsResponse) {}
|||
``` |
|||
|
|||
**Description:** |
|||
|
|||
This method is used to list the partitions of a collection.
|||
|
|||
**Parameters:** |
|||
|
|||
- **ShowPartitionsRequest** |
|||
|
|||
ShowPartitionsRequest struct is shown as follows: |
|||
|
|||
```protobuf |
|||
message ShowPartitionsRequest { |
|||
common.MsgBase base = 1; |
|||
string db_name = 2; |
|||
string collection_name = 3; |
|||
int64 collectionID = 4; |
|||
} |
|||
``` |
|||
|
|||
All partitions of the collection **collection_name** will be listed.
|||
|
|||
**Returns:** |
|||
|
|||
- **ShowPartitionsResponse**
|||
|
|||
```protobuf |
|||
message ShowPartitionsResponse { |
|||
common.Status status = 1; |
|||
repeated string partition_names = 2; |
|||
repeated int64 partitionIDs = 3; |
|||
} |
|||
``` |
|||
|
|||
**status** represents the server error code. It does not include gRPC errors; it carries only the server-side error code, from which the execution result can be determined. **error_code** is an enum that distinguishes the type of error (the full ErrorCode enum is listed in Appendix D), and **reason** is a string describing the error in detail.
|||
|
|||
**partition_names** is a list containing the names of all partitions.
|||
**partitionIDs** is a list containing the IDs of all partitions. A partition appears at the same index in **partition_names** and **partitionIDs**.
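For illustration, a client can zip the two parallel lists into a name-to-ID map; this tiny Go helper is hypothetical, not part of the API:

```go
package main

import "fmt"

// partitionIDByName pairs the parallel lists returned by ShowPartitions
// (partition_names and partitionIDs share the same ordering).
func partitionIDByName(names []string, ids []int64) map[string]int64 {
	m := make(map[string]int64, len(names))
	for i, name := range names {
		m[name] = ids[i]
	}
	return m
}

func main() {
	fmt.Println(partitionIDByName([]string{"_default", "p1"}, []int64{1, 2}))
}
```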
|||
|
|||
#### 3.2 Manipulation Requests |
|||
|
|||
###### 3.2.1 Insert |
|||
|
|||
**Interface:** |
|||
|
|||
``` |
|||
rpc Insert(InsertRequest) returns (InsertResponse){} |
|||
``` |
|||
|
|||
**Description:** |
|||
|
|||
Insert a batch of rows into a collection or a partition.
|||
|
|||
**Parameters:** |
|||
|
|||
- **InsertRequest** |
|||
|
|||
InsertRequest struct is shown as follows: |
|||
|
|||
```protobuf |
|||
message InsertRequest { |
|||
common.MsgBase base = 1; |
|||
string db_name = 2; |
|||
string collection_name = 3; |
|||
string partition_name = 4; |
|||
repeated common.Blob row_data = 5; |
|||
repeated uint32 hash_keys = 6; |
|||
} |
|||
|
|||
message Blob { |
|||
bytes value = 1; |
|||
} |
|||
``` |
|||
|
|||
Insert a batch of **row_data** into the collection **collection_name** and partition **partition_name**. Each **Blob** carries the raw bytes of one row.
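As a rough Go sketch (local types mirroring the protobuf messages above; how each row is serialized into Blob bytes is application-specific and left abstract), an InsertRequest could be assembled like this:

```go
package example

// Blob and InsertRequest mirror the protobuf messages shown above.
type Blob struct {
	Value []byte
}

type InsertRequest struct {
	DbName         string
	CollectionName string
	PartitionName  string
	RowData        []*Blob
	HashKeys       []uint32
}

// newInsertRequest packs already-serialized rows into an InsertRequest.
// hashKeys carries one hash value per row; rows with the same hash are
// routed to the same WAL bucket (see the architecture overview).
func newInsertRequest(collection, partition string, rows [][]byte, hashKeys []uint32) *InsertRequest {
	blobs := make([]*Blob, 0, len(rows))
	for _, r := range rows {
		blobs = append(blobs, &Blob{Value: r})
	}
	return &InsertRequest{
		CollectionName: collection,
		PartitionName:  partition,
		RowData:        blobs,
		HashKeys:       hashKeys,
	}
}
```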
|||
|
|||
**Returns:** |
|||
|
|||
- **InsertResponse**
|||
|
|||
```protobuf |
|||
message InsertResponse { |
|||
common.Status status = 1; |
|||
int64 rowID_begin = 2; |
|||
int64 rowID_end = 3; |
|||
} |
|||
``` |
|||
|
|||
**Status** represents the server error code. It does not include gRPC errors; it carries only the server-side error code, from which the execution result can be determined. **error_code** is an enum that distinguishes the type of error (the full ErrorCode enum is listed in Appendix D), and **reason** is a string describing the error in detail.
|||
|
|||
**rowID_begin** and **rowID_end** mark the range of row IDs assigned to the inserted rows.
|||
|
|||
###### 3.2.2 Delete |
|||
|
|||
- DeleteByID |
|||
|
|||
#### 3.3 Query |
|||
|
|||
#### 3.3 Index |
|||
|
|||
###### 3.3.1 CreateIndex |
|||
|
|||
**Interface:** |
|||
|
|||
``` |
|||
rpc CreateIndex(CreateIndexRequest) returns (common.Status){} |
|||
``` |
|||
|
|||
**Description:** |
|||
|
|||
Create an index for a collection. |
|||
|
|||
**Parameters:** |
|||
|
|||
- **CreateIndexRequest** |
|||
|
|||
CreateIndexRequest struct is shown as follows: |
|||
|
|||
```protobuf |
|||
message CreateIndexRequest { |
|||
common.MsgBase base = 1; |
|||
string db_name = 2; |
|||
string collection_name = 3; |
|||
string field_name = 4; |
|||
repeated common.KeyValuePair extra_params = 5; |
|||
} |
|||
``` |
|||
|
|||
Create an index for the field **field_name** in the collection **collection_name**.
|||
|
|||
**extra_params**: the index parameters for the field. To speed up searches, you build an index on a field and specify its detailed parameters here. The available index types and parameters are described in chapter 2.2.3.
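For illustration only, and assuming each building parameter from chapter 2.2.3 is passed as its own key-value pair (the exact wire encoding is not specified here), the extra_params for an IVF_FLAT index might be assembled like this:

```go
package example

// KeyValuePair mirrors common.KeyValuePair.
type KeyValuePair struct {
	Key   string
	Value string
}

// ivfFlatParams builds hypothetical extra_params for an IVF_FLAT index,
// using the building parameters listed in chapter 2.2.3.
func ivfFlatParams(nlist string) []*KeyValuePair {
	return []*KeyValuePair{
		{Key: "index_type", Value: "IVF_FLAT"},
		{Key: "metric_type", Value: "L2"},
		{Key: "nlist", Value: nlist}, // e.g. "100", valid range 1~65536
	}
}
```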
|||
|
|||
**Returns:** |
|||
|
|||
- **common.Status** |
|||
|
|||
```protobuf |
|||
message Status { |
|||
ErrorCode error_code = 1; |
|||
string reason = 2; |
|||
} |
|||
``` |
|||
|
|||
**Status** represents the server error code. It does not include gRPC errors; it carries only the server-side error code, from which the execution result can be determined. **error_code** is an enum that distinguishes the type of error (the full ErrorCode enum is listed in Appendix D), and **reason** is a string describing the error in detail.
|||
|
|||
###### 3.3.2 DescribeIndex |
|||
|
|||
**Interface:** |
|||
|
|||
``` |
|||
rpc DescribeIndex(DescribeIndexRequest) returns (DescribeIndexResponse){}
|||
``` |
|||
|
|||
**Description:** |
|||
|
|||
Get detailed information about an index.
|||
|
|||
**Parameters:** |
|||
|
|||
- **DescribeIndexRequest** |
|||
|
|||
DescribeIndexRequest struct is shown as follows: |
|||
|
|||
```protobuf |
|||
message DescribeIndexRequest { |
|||
common.MsgBase base = 1; |
|||
string db_name = 2; |
|||
string collection_name = 3; |
|||
string field_name = 4; |
|||
string index_name = 5; |
|||
} |
|||
``` |
|||
|
|||
Get details of an index for the field with **field_name** in collection with **collection_name**. |
|||
|
|||
**index_name**: a field can have multiple indexes, and you can describe a specific index by its index_name.
|||
|
|||
**Returns:** |
|||
|
|||
- **DescribeIndexResponse**
|||
|
|||
```protobuf |
|||
message DescribeIndexResponse { |
|||
common.Status status = 1; |
|||
repeated IndexDescription index_descriptions = 2; |
|||
} |
|||
|
|||
message IndexDescription { |
|||
string index_name = 1; |
|||
int64 indexID = 2; |
|||
repeated common.KeyValuePair params = 3; |
|||
} |
|||
``` |
|||
|
|||
**Status** represents the server error code. It does not include gRPC errors; it carries only the server-side error code, from which the execution result can be determined. **error_code** is an enum that distinguishes the type of error (the full ErrorCode enum is listed in Appendix D), and **reason** is a string describing the error in detail.
|||
|
|||
**index_descriptions** is a list of index descriptions. If **index_name** is specified in the request, only the matching index is returned; if **index_name** is empty, the response returns all indexes built on the field.
|||
|
|||
**params**: the parameters the index was built with. The available index types and parameters are described in chapter 2.2.3.
|||
|
|||
###### 3.3.3 GetIndexStates |
|||
|
|||
**Interface:** |
|||
|
|||
``` |
|||
rpc GetIndexStates(GetIndexStatesRequest) returns (GetIndexStatesResponse){}
|||
``` |
|||
|
|||
**Description:** |
|||
|
|||
Get index build progress info. |
|||
|
|||
**Parameters:** |
|||
|
|||
- **GetIndexStatesRequest** |
|||
|
|||
GetIndexStatesRequest struct is shown as follows: |
|||
|
|||
```protobuf |
|||
message GetIndexStatesRequest { |
|||
common.MsgBase base = 1; |
|||
string db_name = 2; |
|||
string collection_name = 3; |
|||
string field_name = 4; |
|||
string index_name = 5; |
|||
} |
|||
``` |
|||
|
|||
Get index build progress info for the field with **field_name** in collection with **collection_name**. |
|||
|
|||
**index_name**: a field can have multiple indexes, and you can query the state of a specific index by its index_name.
|||
|
|||
**Returns:** |
|||
|
|||
- **GetIndexStatesResponse**
|||
|
|||
```protobuf |
|||
message GetIndexStatesResponse { |
|||
common.Status status = 1; |
|||
common.IndexState state = 2; |
|||
} |
|||
|
|||
enum IndexState { |
|||
IndexStateNone = 0; |
|||
Unissued = 1; |
|||
InProgress = 2; |
|||
Finished = 3; |
|||
Failed = 4; |
|||
Deleted = 5; |
|||
} |
|||
``` |
|||
|
|||
**Status** represents the server error code. It does not include gRPC errors; it carries only the server-side error code, from which the execution result can be determined. **error_code** is an enum that distinguishes the type of error (the full ErrorCode enum is listed in Appendix D), and **reason** is a string describing the error in detail.
|||
|
|||
**state** is an enum indicating the current stage of the index building process.
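Because index building is asynchronous, a client usually polls GetIndexStates until the state reaches Finished or Failed. The Go sketch below assumes a hypothetical `getState` closure wrapping the RPC; it is not part of any Milvus SDK.

```go
package example

import (
	"errors"
	"time"
)

// IndexState mirrors the enum shown above.
type IndexState int32

const (
	IndexStateNone IndexState = 0
	Unissued       IndexState = 1
	InProgress     IndexState = 2
	Finished       IndexState = 3
	Failed         IndexState = 4
	Deleted        IndexState = 5
)

// waitForIndex polls getState (a hypothetical wrapper around GetIndexStates)
// until the index is Finished, Failed, or the deadline is exceeded.
func waitForIndex(getState func() (IndexState, error), interval, timeout time.Duration) error {
	deadline := time.Now().Add(timeout)
	for time.Now().Before(deadline) {
		state, err := getState()
		if err != nil {
			return err
		}
		switch state {
		case Finished:
			return nil
		case Failed:
			return errors.New("index building failed")
		}
		time.Sleep(interval)
	}
	return errors.New("timed out waiting for index to finish")
}
```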
|||
|
|||
###### 3.3.4 DropIndex |
|||
|
|||
**Interface:** |
|||
|
|||
``` |
|||
rpc DropIndex(DropIndexRequest) returns (common.Status){} |
|||
``` |
|||
|
|||
**Description:** |
|||
|
|||
Drop an index for a collection. |
|||
|
|||
**Parameters:** |
|||
|
|||
- **DropIndexRequest** |
|||
|
|||
DropIndexRequest struct is shown as follows: |
|||
|
|||
```protobuf |
|||
message DropIndexRequest { |
|||
common.MsgBase base = 1; |
|||
string db_name = 2; |
|||
string collection_name = 3; |
|||
string field_name = 4; |
|||
string index_name = 5; |
|||
} |
|||
``` |
|||
|
|||
Drop the index of the field **field_name** in the collection **collection_name**.
|||
|
|||
**index_name**: a field can have multiple indexes, and you can drop a specific index by its index_name.
|||
|
|||
**Returns:** |
|||
|
|||
- **common.Status** |
|||
|
|||
```protobuf |
|||
message Status { |
|||
ErrorCode error_code = 1; |
|||
string reason = 2; |
|||
} |
|||
``` |
|||
|
|||
**Status** represents the server error code. It does not include gRPC errors; it carries only the server-side error code, from which the execution result can be determined. **error_code** is an enum that distinguishes the type of error (the full ErrorCode enum is listed in Appendix D), and **reason** is a string describing the error in detail.
|||
## System Configuration

Milvus can be configured through a configuration file, command-line options, and environment variables.

Priority order: command-line options > environment variables > configuration file > default values.

If a configuration file is provided, all other command-line options and environment variables are ignored.
For example, `milvus run rootcoord --config-file milvus.yaml --log-level debug` ignores the `--log-level` option.

### Syntax

Use the following syntax to run `milvus` commands from a console:

```shell
$ milvus [command] [server type] [flags]
```

For example:

```shell
$ MILVUS_CONFIG_FILE=/path/to/milvus/configs/milvus.yaml milvus run rootcoord
```

`command`, `server type`, and `flags` denote the following:

`command`: the operation to perform, e.g. `run`, `stop`

`server type`: the type of program to run. Available `server type` values are:

- `rootcoord`
- `proxy`
- `querycoord`
- `querynode`
- `datacoord`
- `datanode`
- `indexcoord`
- `indexnode`
- `standalone`
- `mixture`

`flags`: command-line options. For example, you can use the `-f` or `--config-file` option to specify the path of the configuration file.

When `server type` is `mixture`, one or more of the following flags must be provided to indicate which services are started within a single process:

- `-rootcoord`
- `-querycoord`
- `-datacoord`
- `-indexcoord`

> Getting help
>
> You can get help for the CLI tool using the `--help` flag, or `-h` for short.
>
> ```shell
> $ milvus run rootcoord --help
> ```

### Command-Line Options

**--version**

- Print the system version and component name, then exit

**--config-check**

- Check the validity of the configuration file, then exit
- Default: false

**--config-file**

- Load the system configuration from a file. If a configuration file is set, all other command-line options and environment variables are ignored.
- Default: ""
- Environment variable: MILVUS_CONFIG_FILE

**--log-level**

- Specify the log output level. Currently supports `debug`, `info`, `warning`, `error`
- Default: "info"
- Environment variable: "MILVUS_LOG_LEVEL"

**--log-path**

- Specify the log storage path.
- Default: "/var/lib/milvus/logs"
- Environment variable: "MILVUS_LOG_PATH"

### Configuration File Description

The configuration file supports more options than the command line. You can create a new configuration file milvus.yaml from the milvus.yaml.sample file as needed.

| Name                              | Description                                                                                                                                | Default                |
| --------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------ | ---------------------- |
| etcd.endpoints                    | etcd service endpoints                                                                                                                       | "localhost:2379"       |
| minio.address                     | minio service address                                                                                                                        | "localhost"            |
| minio.port                        | minio service port                                                                                                                           | 9000                   |
| pulsar.address                    | pulsar service address                                                                                                                       | "localhost"            |
| pulsar.port                       | pulsar service port                                                                                                                          | 6650                   |
| log.level                         | Log output level. Currently supports `debug`, `info`, `warning`, `error`                                                                    | "info"                 |
| log.format                        | Log output format. Currently supports `text` and `json`                                                                                     | "text"                 |
| log.file.rootPath                 | Log storage path                                                                                                                             | "/var/lib/milvus/logs" |
| log.file.maxSize                  | Size limit of a log file                                                                                                                     | 300MB                  |
| log.file.maxAge                   | Maximum number of days to retain logs. Old log files are not cleaned up by default. If set, log files older than `maxAge` days are removed. | 0                      |
| log.file.maxBackups               | Maximum number of retained old log files. All old log files are kept by default. If set to `7`, at most `7` old log files are retained.     | 0                      |
| msgChannel.chanNamePrefix.cluster | Topic name prefix in pulsar                                                                                                                  | "by-dev"               |
|||
## Appendix D. Error Code |
|||
|
|||
**ErrorCode** |
|||
|
|||
```protobuf |
|||
enum ErrorCode { |
|||
Success = 0; |
|||
UnexpectedError = 1; |
|||
ConnectFailed = 2; |
|||
PermissionDenied = 3; |
|||
CollectionNotExists = 4; |
|||
IllegalArgument = 5; |
|||
IllegalDimension = 7; |
|||
IllegalIndexType = 8; |
|||
IllegalCollectionName = 9; |
|||
IllegalTOPK = 10; |
|||
IllegalRowRecord = 11; |
|||
IllegalVectorID = 12; |
|||
IllegalSearchResult = 13; |
|||
FileNotFound = 14; |
|||
MetaFailed = 15; |
|||
CacheFailed = 16; |
|||
CannotCreateFolder = 17; |
|||
CannotCreateFile = 18; |
|||
CannotDeleteFolder = 19; |
|||
CannotDeleteFile = 20; |
|||
BuildIndexError = 21; |
|||
IllegalNLIST = 22; |
|||
IllegalMetricType = 23; |
|||
OutOfMemory = 24; |
|||
IndexNotExist = 25; |
|||
EmptyCollection = 26; |
|||
|
|||
// internal error code. |
|||
DDRequestRace = 1000; |
|||
} |
|||
``` |
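Callers typically convert a non-Success **Status** into a language-level error. Below is a minimal Go sketch with local copies of Status and ErrorCode; it is illustrative only and not part of the Milvus codebase.

```go
package example

import "fmt"

// ErrorCode and Status mirror the protobuf definitions in this appendix.
type ErrorCode int32

const Success ErrorCode = 0

type Status struct {
	ErrorCode ErrorCode
	Reason    string
}

// checkStatus maps a non-Success status to a Go error, keeping code and reason.
func checkStatus(s *Status) error {
	if s == nil || s.ErrorCode == Success {
		return nil
	}
	return fmt.Errorf("request failed: error_code=%d, reason=%s", s.ErrorCode, s.Reason)
}
```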
|||
## Appendix E. Statistics |
|||
## 1. System Overview |
|||
|
|||
In this section, we sketch the system design of Milvus, including the data model, data organization, architecture, and state synchronization. |
|||
|
|||
#### 1.1 Data Model |
|||
|
|||
Milvus exposes the following set of data features to applications: |
|||
|
|||
- a data model based on schematized relational tables, in which rows must have primary keys,
|||
|
|||
- a query language covering data definition, data manipulation, and data query: data definition includes create and drop; data manipulation includes insert, upsert, and delete; and data query falls into three types, primary-key search, approximate nearest neighbor search (ANNS), and ANNS with predicates.
|||
|
|||
Requests are executed strictly in the order of their issue time. We take Proxy's issue time as a request's issue time. For a batch request, all of its sub-requests share the same issue time. When there are multiple proxies, issue times from different proxies are regarded as coming from a central clock.
|||
|
|||
Transactions are currently not supported by Milvus.
|||
|
|||
A batch insert/delete is guaranteed to become visible atomically. |
|||
|
|||
#### 1.2 Data Organization |
|||
|
|||
<img src="./figs/data_organization.png" width=550> |
|||
|
|||
In Milvus, 'collection' refers to the concept of a table. A collection can be optionally divided into several 'partitions'. Both collections and partitions are basic execution scopes of queries. To use partitions, users should decide how a collection should be partitioned. In most cases, partitioning leads to more flexible data management and more efficient querying. For a partitioned collection, queries can be executed on the whole collection or on a set of specified partitions.
|||
|
|||
Each collection or partition contains a set of 'segment groups'. The Segment group is the basic unit of data-to-node mapping. It's also the basic unit of a replica. For instance, if a query node failed, its segment groups will be redistributed across the other nodes. If a query node is overloaded, part of its segment groups will be migrated to underloaded ones. If a hot collection/partition is detected, its segment groups will be replicated to smooth the system load skewness. |
|||
|
|||
'Segment' is the finest unit of data organization. It is where the data and indexes are actually kept. Each segment contains a set of rows. In order to reduce the memory footprint during query execution and to fully utilize SIMD, the physical data layout within segments is organized in a column-based manner. |
|||
|
|||
#### 1.3 Architecture Overview |
|||
|
|||
<img src="./figs/system_framework.png" width=800> |
|||
|
|||
The main components, Proxy, WAL, query node, and write node can scale to multiple instances. These components scale separately for a better tradeoff between availability and cost. |
|||
|
|||
The WAL forms a hash ring. Requests (i.e. inserts and deletes) from clients are repacked by Proxy. Operations sharing the same hash value (the hash of the primary key) are routed to the same hash bucket. In addition, Proxy performs some preprocessing work, such as static validity checks, primary key assignment (if not given by the user), and timestamp assignment.
|||
|
|||
The query/write nodes are linked to the hash ring, with each node covering some portion of the buckets. Once the hash function and bucket coverage are settled, the chain 'proxy -> WAL -> query/write node' will act as a producer-consumer pipeline. Logs in each bucket are a determined operation stream. Via performing the operation stream in order, the query nodes keep themselves up to date. |
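To make the routing concrete, here is a minimal Go sketch of the primary-key-based bucket assignment described above; the FNV hash and fixed bucket count are illustrative assumptions, not the actual Proxy repack implementation.

```go
package example

import "hash/fnv"

// bucketFor routes an operation to a WAL hash bucket by hashing its primary key.
// Operations on the same primary key therefore always land in the same bucket,
// which keeps each per-bucket log a deterministic operation stream.
func bucketFor(primaryKey int64, numBuckets uint32) uint32 {
	h := fnv.New32a()
	var buf [8]byte
	for i := 0; i < 8; i++ {
		buf[i] = byte(primaryKey >> (8 * i))
	}
	h.Write(buf[:])
	return h.Sum32() % numBuckets
}
```

Because the bucket index depends only on the primary key, all operations on a given key form a totally ordered sub-stream inside one bucket.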
|||
|
|||
The query nodes hold all the indexes in memory. Since building an index is time-consuming, the query nodes dump their indexes to disk (the storage engine) for fast failure recovery and cross-node index copying.
|||
|
|||
The write nodes are stateless. They simply transform the newly arrived WALs to binlog format, then append the binlog to the store engine. |
|||
|
|||
Note that not all the components are necessarily replicated. The system provides failure tolerance by maintaining multiple copies of the WAL and binlog. When there is no in-memory index replica and a query node fails, other query nodes will take over its indexes by loading the dumped index files, or by rebuilding them from the binlog and WAL. The links from query nodes to the hash ring will also be adjusted so that the failed node's input WAL stream can be properly handled by its neighbors.
|||
|
|||
#### 1.4 State Synchronization |
|||
|
|||
<img src="./figs/state_sync.png" width=800> |
|||
|
|||
Data in Milvus have three different forms, namely WAL, binlog, and index. As mentioned in the previous section, WAL can be viewed as a determined operation stream. The other two data forms keep themselves up to date by performing the operation stream in time order. |
|||
|
|||
Each WAL entry carries a timestamp, which is the time when the log is sent to the hash bucket. Binlog records, table rows, and index cells also keep that timestamp. In this way, the different data forms can offer consistent snapshots for a given time T. For example, requests such as "fetch binlogs before T for point-in-time recovery", "get the row with primary key K at time T", and "launch a similarity search at time T for vector V" are performed on the binlog, the table, and the index respectively. Although these three requests are performed on different data forms, they observe identical snapshots, namely all the state changes before T.
|||
|
|||
For better throughput, Milvus allows asynchronous state synchronization between the WAL and the index/binlog/table. Whenever the data is not fresh enough to satisfy a query, the query is suspended until the data catches up, or a timeout error is returned.
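A simplified Go sketch of that guard is shown below; the function and accessor names are assumptions for illustration, and the real synchronization logic lives in the query nodes. The idea is simply to compare the query's timestamp with the latest timestamp the node has applied from the WAL.

```go
package example

import (
	"errors"
	"time"
)

// waitUntilServiceable blocks until the node has applied all WAL entries with
// timestamps <= queryTs, or until the timeout elapses.
// appliedTs is a hypothetical accessor for the latest applied WAL timestamp.
func waitUntilServiceable(queryTs uint64, appliedTs func() uint64, timeout time.Duration) error {
	deadline := time.Now().Add(timeout)
	for appliedTs() < queryTs {
		if time.Now().After(deadline) {
			return errors.New("query timed out waiting for state synchronization")
		}
		time.Sleep(time.Millisecond)
	}
	return nil
}
```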
|||
|
|||
#### 1.5 Stream and Time |
|||
|
|||
In order to boost throughput, we model Milvus as a stream-driven system. |
|||
## 2. Schema |
|||
|
|||
#### 2.1 Collection Schema |
|||
|
|||
```go |
|||
type CollectionSchema struct { |
|||
Name string |
|||
Description string |
|||
AutoId bool |
|||
Fields []*FieldSchema |
|||
} |
|||
``` |
|||
|
|||
#### 2.2 Field Schema |
|||
|
|||
```go |
|||
type FieldSchema struct { |
|||
FieldID int64 |
|||
Name string |
|||
IsPrimaryKey bool |
|||
Description string |
|||
DataType DataType |
|||
TypeParams []*commonpb.KeyValuePair |
|||
IndexParams []*commonpb.KeyValuePair |
|||
AutoID bool |
|||
} |
|||
``` |
|||
|
|||
###### 2.2.1 Data Types |
|||
|
|||
**DataType** |
|||
|
|||
```protobuf |
|||
enum DataType { |
|||
NONE = 0; |
|||
BOOL = 1; |
|||
INT8 = 2; |
|||
INT16 = 3; |
|||
INT32 = 4; |
|||
INT64 = 5; |
|||
|
|||
FLOAT = 10; |
|||
DOUBLE = 11; |
|||
|
|||
STRING = 20; |
|||
|
|||
VECTOR_BINARY = 100; |
|||
VECTOR_FLOAT = 101; |
|||
} |
|||
``` |
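To show how the schema pieces fit together, here is a hedged Go sketch that builds a CollectionSchema with an INT64 primary key and a 128-dimensional float-vector field. The local type copies mirror sections 2.1 and 2.2; passing the vector dimension as a `dim` type param is an assumption made for illustration.

```go
package example

// Local copies of the schema types defined in sections 2.1 and 2.2.
type DataType int32

const (
	INT64        DataType = 5
	VECTOR_FLOAT DataType = 101
)

type KeyValuePair struct {
	Key   string
	Value string
}

type FieldSchema struct {
	FieldID      int64
	Name         string
	IsPrimaryKey bool
	Description  string
	DataType     DataType
	TypeParams   []*KeyValuePair
	IndexParams  []*KeyValuePair
	AutoID       bool
}

type CollectionSchema struct {
	Name        string
	Description string
	AutoId      bool
	Fields      []*FieldSchema
}

// newExampleSchema builds a collection with an INT64 primary key and a
// 128-dimensional float-vector field ("dim" as a type param is an assumption).
func newExampleSchema() *CollectionSchema {
	return &CollectionSchema{
		Name:        "example_collection",
		Description: "a collection with a primary key and a float vector field",
		Fields: []*FieldSchema{
			{FieldID: 100, Name: "id", IsPrimaryKey: true, DataType: INT64},
			{FieldID: 101, Name: "embedding", DataType: VECTOR_FLOAT,
				TypeParams: []*KeyValuePair{{Key: "dim", Value: "128"}}},
		},
	}
}
```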
|||
|
|||
###### 2.2.2 Type Params |
|||
|
|||
###### 2.2.3 Index Params |
|||
|
|||
# Intro to Index |
|||
|
|||
For more detailed information about indexes, please refer to [Milvus documentation index chapter.](https://milvus.io/docs/index.md) |
|||
|
|||
To learn how to choose an appropriate index for your application scenarios, please read [How to Select an Index in Milvus](https://medium.com/@milvusio/how-to-choose-an-index-in-milvus-4f3d15259212). |
|||
|
|||
To learn how to choose an appropriate index for a metric, see [Similarity Metrics](https://milvus.io/docs/metric.md). |
|||
|
|||
Different index types use different index params in construction and query. All index params are represented by the structure of the map. This doc shows the map code in python. |
|||
|
|||
[IVF_FLAT](#IVF_FLAT) |
|||
[BIN_IVF_FLAT](#BIN_IVF_FLAT) |
|||
[IVF_PQ](#IVF_PQ) |
|||
[IVF_SQ8](#IVF_SQ8) |
|||
[IVF_SQ8_HYBRID](#IVF_SQ8_HYBRID) |
|||
[ANNOY](#ANNOY) |
|||
[HNSW](#HNSW) |
|||
[RHNSW_PQ](#RHNSW_PQ) |
|||
[RHNSW_SQ](#RHNSW_SQ) |
|||
[NSG](#NSG) |
|||
|
|||
## IVF_FLAT |
|||
|
|||
**IVF** (_Inverted File_) is an index type based on quantization. It divides the points in space into `nlist` units using a clustering method. When searching vectors, it compares the distances between the target vector and the centers of all units, and then selects the `nprobe` nearest units. Afterwards, it compares all the vectors in these selected units to get the final result.
|||
|
|||
IVF_FLAT is the most basic IVF index, and the encoded data stored in each unit is consistent with the original data. |
|||
|
|||
- building parameters: |
|||
|
|||
**nlist**: Number of cluster units. |
|||
|
|||
```python |
|||
# IVF_FLAT |
|||
{ |
|||
"index_type": "IVF_FLAT", |
|||
"metric_type": "L2", # one of L2, IP |
|||
|
|||
#Special for IVF_FLAT |
|||
"nlist": 100 # int. 1~65536 |
|||
} |
|||
``` |
|||
|
|||
- search parameters: |
|||
|
|||
**nprobe**: Number of inverted file cells to probe. |
|||
|
|||
```python |
|||
# IVF_FLAT |
|||
{ |
|||
"topk": top_k, |
|||
"query": queries, |
|||
"metric_type": "L2", # one of L2, IP |
|||
|
|||
#Special for IVF_FLAT |
|||
"nprobe": 8 # int. 1~nlist(cpu), 1~min[2048, nlist](gpu) |
|||
} |
|||
``` |
|||
|
|||
## BIN_IVF_FLAT |
|||
|
|||
**BIN_IVF_FLAT** is a binary variant of IVF_FLAT. |
|||
|
|||
- building parameters: |
|||
|
|||
**nlist**: Number of cluster units. |
|||
|
|||
```python |
|||
# BIN_IVF_FLAT |
|||
{ |
|||
"index_type": "BIN_IVF_FLAT", |
|||
"metric_type": "jaccard", # one of jaccard, hamming, tanimoto |
|||
|
|||
#Special for BIN_IVF_FLAT |
|||
"nlist": 100 # int. 1~65536 |
|||
} |
|||
|
|||
``` |
|||
|
|||
- search parameters: |
|||
|
|||
**nprobe**: Number of inverted file cells to probe. |
|||
|
|||
```python |
|||
# BIN_IVF_FLAT |
|||
{ |
|||
"topk": top_k, |
|||
"query": queries, |
|||
|
|||
#Special for BIN_IVF_FLAT |
|||
"metric_type": "jaccard", # one of jaccard, hamming, tanimoto |
|||
"nprobe": 8 # int. 1~nlist(cpu), 1~min[2048, nlist](gpu) |
|||
} |
|||
``` |
|||
|
|||
## IVF_PQ |
|||
|
|||
**PQ** (_Product Quantization_) uniformly decomposes the original high-dimensional vector space into Cartesian products of `m` low-dimensional vector spaces and then quantizes the decomposed low-dimensional vector spaces. Instead of calculating the distances between the target vector and the center of all the units, product quantization enables the calculation of distances between the target vector and the clustering center of each low-dimensional space and greatly reduces the time complexity and space complexity of the algorithm. |
|||
|
|||
IVF_PQ performs IVF clustering and then applies product quantization to the vectors. Its index file is even smaller than that of IVF_SQ8, but it also causes a loss of accuracy during searching.
|||
|
|||
- building parameters: |
|||
|
|||
**nlist**: Number of cluster units. |
|||
|
|||
**m**: Number of factors of product quantization. **CPU-only** Milvus: `m ≡ dim (mod m)`; **GPU-enabled** Milvus: `m` ∈ {1, 2, 3, 4, 8, 12, 16, 20, 24, 28, 32, 40, 48, 56, 64, 96}, and (dim / m) ∈ {1, 2, 3, 4, 6, 8, 10, 12, 16, 20, 24, 28, 32}. (`m` x 1024) ≥ `MaxSharedMemPerBlock` of your graphics card. |
|||
|
|||
```python |
|||
# IVF_PQ |
|||
{ |
|||
"index_type": "IVF_PQ", |
|||
"metric_type": "L2", # one of L2, IP |
|||
|
|||
#Special for IVF_PQ |
|||
"nlist": 100, # int. 1~65536 |
|||
"m": 8 |
|||
} |
|||
``` |
|||
|
|||
- search parameters: |
|||
|
|||
**nprobe**: Number of inverted file cells to probe. |
|||
|
|||
```python |
|||
# IVF_PQ |
|||
{ |
|||
"topk": top_k, |
|||
"query": queries, |
|||
"metric_type": "L2", # one of L2, IP |
|||
|
|||
#Special for IVF_PQ |
|||
"nprobe": 8 # int. 1~nlist(cpu), 1~min[2048, nlist](gpu) |
|||
} |
|||
``` |
|||
|
|||
## IVF_SQ8 |
|||
|
|||
**IVF_SQ8** does scalar quantization for each vector placed in the unit based on IVF. Scalar quantization converts each dimension of the original vector from a 4-byte floating-point number to a 1-byte unsigned integer, so the IVF_SQ8 index file occupies much less space than the IVF_FLAT index file. However, scalar quantization results in a loss of accuracy during searching vectors. |
|||
|
|||
- building parameters: |
|||
|
|||
**nlist**: Number of cluster units. |
|||
|
|||
```python |
|||
# IVF_SQ8 |
|||
{ |
|||
"index_type": "IVF_SQ8", |
|||
"metric_type": "L2", # one of L2, IP |
|||
|
|||
#Special for IVF_SQ8 |
|||
"nlist": 100 # int. 1~65536 |
|||
} |
|||
``` |
|||
|
|||
- search parameters: |
|||
|
|||
**nprobe**: Number of inverted file cells to probe. |
|||
|
|||
```python |
|||
# IVF_SQ8 |
|||
{ |
|||
"topk": top_k, |
|||
"query": queries, |
|||
"metric_type": "L2", # one of L2, IP |
|||
|
|||
#Special for IVF_SQ8 |
|||
"nprobe": 8 # int. 1~nlist(cpu), 1~min[2048, nlist](gpu) |
|||
} |
|||
``` |
|||
|
|||
## IVF_SQ8_HYBRID |
|||
|
|||
An optimized version of IVF_SQ8 that requires both CPU and GPU to work. Unlike IVF_SQ8, IVF_SQ8H uses a GPU-based coarse quantizer, which greatly reduces the time to quantize. |
|||
|
|||
IVF_SQ8H is an IVF_SQ8 index that optimizes query execution. |
|||
|
|||
The query method is as follows: |
|||
|
|||
- If `nq` ≥ `gpu_search_threshold`, GPU handles the entire query task. |
|||
- If `nq` < `gpu_search_threshold`, GPU handles the task of retrieving the `nprobe` nearest unit in the IVF index file, and CPU handles the rest. |
|||
|
|||
- building parameters: |
|||
|
|||
**nlist**: Number of cluster units. |
|||
|
|||
```python |
|||
# IVF_SQ8_HYBRID |
|||
{ |
|||
"index_type": "IVF_SQ8_HYBRID", |
|||
"metric_type": "L2", # one of L2, IP |
|||
|
|||
#Special for IVF_SQ8_HYBRID |
|||
"nlist": 100 # int. 1~65536 |
|||
} |
|||
``` |
|||
|
|||
- search parameters: |
|||
|
|||
**nprobe**: Number of inverted file cells to probe. |
|||
|
|||
```python |
|||
# IVF_SQ8_HYBRID |
|||
{ |
|||
"topk": top_k, |
|||
"query": queries, |
|||
"metric_type": "L2", # one of L2, IP |
|||
|
|||
#Special for IVF_SQ8_HYBRID |
|||
"nprobe": 8 # int. 1~nlist(cpu), 1~min[2048, nlist](gpu) |
|||
} |
|||
``` |
|||
|
|||
## ANNOY |
|||
|
|||
**ANNOY** (_Approximate Nearest Neighbors Oh Yeah_) is an index that uses a hyperplane to divide a high-dimensional space into multiple subspaces, and then stores them in a tree structure. |
|||
|
|||
When searching for vectors, ANNOY follows the tree structure to find subspaces closer to the target vector, and then compares all the vectors in these subspaces (The number of vectors being compared should not be less than `search_k`) to obtain the final result. Obviously, when the target vector is close to the edge of a certain subspace, sometimes it is necessary to greatly increase the number of searched subspaces to obtain a high recall rate. Therefore, ANNOY uses `n_trees` different methods to divide the whole space, and searches all the dividing methods simultaneously to reduce the probability that the target vector is always at the edge of the subspace. |
|||
|
|||
- building parameters: |
|||
|
|||
**n_trees**: The number of methods of space division. |
|||
|
|||
```python |
|||
# ANNOY |
|||
{ |
|||
"index_type": "ANNOY", |
|||
"metric_type": "L2", # one of L2, IP |
|||
|
|||
#Special for ANNOY |
|||
"n_trees": 8 # int. 1~1024 |
|||
} |
|||
``` |
|||
|
|||
- search parameters: |
|||
|
|||
**search_k**: The number of nodes to search. -1 means 5% of the whole data. |
|||
|
|||
```python |
|||
# ANNOY |
|||
{ |
|||
"topk": top_k, |
|||
"query": queries, |
|||
"metric_type": "L2", # one of L2, IP |
|||
|
|||
#Special for ANNOY |
|||
"search_k": -1 # int. {-1} U [top_k, n*n_trees], n represents vectors count. |
|||
} |
|||
``` |
|||
|
|||
## HNSW |
|||
|
|||
**HNSW** (_Hierarchical Navigable Small World Graph_) is a graph-based indexing algorithm. It builds a multi-layer navigation graph over the data according to certain rules. In this structure, the upper layers are sparser and the distances between nodes are larger; the lower layers are denser and the distances between nodes are smaller. The search starts from the uppermost layer, finds the node closest to the target in this layer, and then enters the next layer to begin another search. After multiple iterations, it can quickly approach the target position.
|||
|
|||
To improve performance, HNSW limits the maximum degree of nodes on each layer of the graph to `M`. |
|||
In addition, you can use `efConstruction` (when building index) or `ef` (when searching targets) to specify a search range. |
|||
|
|||
- building parameters: |
|||
|
|||
**M**: Maximum degree of the node. |
|||
|
|||
**efConstruction**: takes effect during index construction; it controls the search scope used when building the graph.
|||
|
|||
```python |
|||
# HNSW |
|||
{ |
|||
"index_type": "HNSW", |
|||
"metric_type": "L2", # one of L2, IP |
|||
|
|||
#Special for HNSW |
|||
"M": 16, # int. 4~64 |
|||
"efConstruction": 40 # int. 8~512 |
|||
} |
|||
``` |
|||
|
|||
- search parameters: |
|||
|
|||
**ef**: takes effect at search time; it controls the search scope and should be larger than `top_k`.
|||
|
|||
```python |
|||
# HNSW |
|||
|
|||
{ |
|||
"topk": top_k, |
|||
"query": queries, |
|||
"metric_type": "L2", # one of L2, IP |
|||
|
|||
#Special for HNSW |
|||
"ef": 64 # int. top_k~32768 |
|||
} |
|||
``` |
|||
|
|||
## RHNSW_PQ |
|||
|
|||
**RHNSW_PQ** is a variant index type combining PQ and HNSW. It first uses PQ to quantize the vectors, then builds an HNSW graph on the PQ quantization result to get the index.
|||
|
|||
- building parameters: |
|||
|
|||
**M**: Maximum degree of the node. |
|||
|
|||
**efConstruction**: takes effect during index construction; it controls the search scope used when building the graph.
|||
|
|||
**PQM**: m for PQ. |
|||
|
|||
```python |
|||
# RHNSW_PQ |
|||
{ |
|||
"index_type": "RHNSW_PQ", |
|||
"metric_type": "L2", |
|||
|
|||
#Special for RHNSW_PQ |
|||
"M": 16, # int. 4~64 |
|||
"efConstruction": 40, # int. 8~512 |
|||
"PQM": 8, # int. CPU only. PQM = dim (mod m) |
|||
} |
|||
``` |
|||
|
|||
- search parameters: |
|||
|
|||
**ef**: takes effect at search time; it controls the search scope and should be larger than `top_k`.
|||
|
|||
```python |
|||
# RHNSW_PQ |
|||
{ |
|||
"topk": top_k, |
|||
"query": queries, |
|||
"metric_type": "L2", # one of L2, IP |
|||
|
|||
|
|||
#Special for RHNSW_PQ |
|||
"ef": 64 # int. top_k~32768 |
|||
} |
|||
``` |
|||
|
|||
## RHNSW_SQ |
|||
|
|||
**RHNSW_SQ** is a variant index type combining SQ and HNSW. It first uses SQ to quantize the vectors, then builds an HNSW graph on the SQ quantization result to get the index.
|||
|
|||
- building parameters: |
|||
|
|||
**M**: Maximum degree of the node. |
|||
|
|||
**efConstruction**: takes effect during index construction; it controls the search scope used when building the graph.
|||
|
|||
```python |
|||
# RHNSW_SQ |
|||
{ |
|||
"index_type": "RHNSW_SQ", |
|||
"metric_type": "L2", # one of L2, IP |
|||
|
|||
#Special for RHNSW_SQ |
|||
"M": 16, # int. 4~64 |
|||
"efConstruction": 40 # int. 8~512 |
|||
} |
|||
``` |
|||
|
|||
- search parameters: |
|||
|
|||
**ef**: takes effect at search time; it controls the search scope and should be larger than `top_k`.
|||
|
|||
```python |
|||
# RHNSW_SQ |
|||
{ |
|||
"topk": top_k, |
|||
"query": queries, |
|||
"metric_type": "L2", # one of L2, IP |
|||
|
|||
#Special for RHNSW_SQ |
|||
"ef": 64 # int. top_k~32768 |
|||
} |
|||
``` |
|||
|
|||
## NSG |
|||
|
|||
**NSG** (_Refined Navigating Spreading-out Graph_) is a graph-based indexing algorithm. It sets the center position of the whole graph as a navigation point, and then uses a specific edge-selection strategy to control the out-degree of each point (less than or equal to `out_degree`). Therefore, it can reduce memory usage and quickly locate positions near the target during vector searches.
|||
|
|||
The graph construction process of NSG is as follows: |
|||
|
|||
1. Find `knng` nearest neighbors for each point. |
|||
2. Iterate at least `search_length` times based on `knng` nearest neighbor nodes to select `candidate_pool_size` possible nearest neighbor nodes. |
|||
3. Construct the out-edge of each point in the selected `candidate_pool_size` nodes according to the edge selection strategy. |
|||
|
|||
The query process is similar to the graph building process. It starts from the navigation point and iterates at least `search_length` times to get the final result. |
|||
|
|||
- building parameters: |
|||
|
|||
**search_length**: Number of query iterations. |
|||
|
|||
**out_degree**: Maximum out-degree of the node. |
|||
|
|||
**candidate_pool_size**: Candidate pool size of the node. |
|||
|
|||
**knng**: Number of nearest neighbors |
|||
|
|||
```python |
|||
# NSG |
|||
{ |
|||
"index_type": "NSG", |
|||
"metric_type": "L2", |
|||
|
|||
    #Special for NSG
|||
"search_length": 60, # int. 10~300 |
|||
"out_degree": 30, # int. 5~300 |
|||
"candidate_pool_size": 300, # int. 50~1000 |
|||
"knng": 50 # int. 5~300 |
|||
} |
|||
``` |
|||
|
|||
- search parameters: |
|||
|
|||
**search_length**: Number of query iterations |
|||
|
|||
```python |
|||
# NSG |
|||
{ |
|||
"topk": top_k, |
|||
"query": queries, |
|||
"metric_type": "L2", # one of L2, IP |
|||
|
|||
    #Special for NSG
|||
"search_length": 100 # int. 10~300 |
|||
} |
|||
``` |
|||
## 3. Index Service |
|||
|
|||
#### 3.1 Overview |
|||
|
|||
<img src="./figs/index_coord.png" width=700> |
|||
|
|||
#### 3.2 Index Service Interface |
|||
|
|||
```go |
|||
type IndexCoord interface { |
|||
Component |
|||
// TimeTickProvider is the interface all services implement |
|||
TimeTickProvider |
|||
|
|||
// BuildIndex receives requests from RootCoordinator to build an index. |
|||
// Index building is asynchronous, so when an index building request comes, an IndexBuildID is assigned to the task and |
|||
// the task is recorded in Meta. The background process assignTaskLoop will find this task and assign it to IndexNode for |
|||
// execution. |
|||
BuildIndex(ctx context.Context, req *indexpb.BuildIndexRequest) (*indexpb.BuildIndexResponse, error) |
|||
|
|||
// DropIndex deletes indexes based on IndexID. One IndexID corresponds to the index of an entire column. A column is |
|||
// divided into many segments, and each segment corresponds to an IndexBuildID. IndexCoord uses IndexBuildID to record |
|||
  // index tasks. Therefore, when DropIndex is called, all tasks whose IndexBuildID belongs to the given IndexID are deleted.
|||
DropIndex(ctx context.Context, req *indexpb.DropIndexRequest) (*commonpb.Status, error) |
|||
|
|||
// GetIndexStates gets the index states of the IndexBuildIDs in the request from RootCoordinator. |
|||
GetIndexStates(ctx context.Context, req *indexpb.GetIndexStatesRequest) (*indexpb.GetIndexStatesResponse, error) |
|||
|
|||
// GetIndexFilePaths gets the index files of the IndexBuildIDs in the request from RootCoordinator. |
|||
GetIndexFilePaths(ctx context.Context, req *indexpb.GetIndexFilePathsRequest) (*indexpb.GetIndexFilePathsResponse, error) |
|||
|
|||
// GetMetrics gets the metrics about IndexCoord. |
|||
GetMetrics(ctx context.Context, req *milvuspb.GetMetricsRequest) (*milvuspb.GetMetricsResponse, error) |
|||
} |
|||
``` |
|||
|
|||
- _RegisterNode_ |
|||
|
|||
```go |
|||
type MsgBase struct { |
|||
MsgType MsgType |
|||
MsgID UniqueID |
|||
Timestamp uint64 |
|||
SourceID UniqueID |
|||
} |
|||
|
|||
type Address struct { |
|||
Ip string |
|||
Port int64 |
|||
} |
|||
|
|||
type RegisterNodeRequest struct { |
|||
Base *commonpb.MsgBase |
|||
Address *commonpb.Address |
|||
} |
|||
|
|||
type InitParams struct { |
|||
NodeID UniqueID |
|||
StartParams []*commonpb.KeyValuePair |
|||
} |
|||
|
|||
type RegisterNodeResponse struct { |
|||
InitParams *internalpb.InitParams |
|||
Status *commonpb.Status |
|||
} |
|||
``` |
|||
|
|||
- _BuildIndex_ |
|||
|
|||
```go |
|||
type KeyValuePair struct { |
|||
Key string |
|||
Value string |
|||
} |
|||
|
|||
type BuildIndexRequest struct { |
|||
IndexBuildID UniqueID |
|||
IndexName string |
|||
IndexID UniqueID |
|||
DataPaths []string |
|||
TypeParams []*commonpb.KeyValuePair |
|||
IndexParams []*commonpb.KeyValuePair |
|||
} |
|||
|
|||
type BuildIndexResponse struct { |
|||
Status *commonpb.Status |
|||
IndexBuildID UniqueID |
|||
} |
|||
``` |
|||
|
|||
- _DropIndex_ |
|||
|
|||
```go |
|||
type DropIndexRequest struct { |
|||
IndexID UniqueID |
|||
} |
|||
``` |
|||
|
|||
- _GetIndexStates_ |
|||
|
|||
```go |
|||
type GetIndexStatesRequest struct { |
|||
IndexBuildIDs []UniqueID |
|||
} |
|||
|
|||
const ( |
|||
IndexState_IndexStateNone IndexState = 0 |
|||
IndexState_Unissued IndexState = 1 |
|||
IndexState_InProgress IndexState = 2 |
|||
IndexState_Finished IndexState = 3 |
|||
IndexState_Failed IndexState = 4 |
|||
IndexState_Deleted IndexState = 5 |
|||
) |
|||
|
|||
type IndexInfo struct { |
|||
State commonpb.IndexState |
|||
IndexBuildID UniqueID |
|||
IndexID UniqueID |
|||
IndexName string |
|||
Reason string |
|||
} |
|||
|
|||
type GetIndexStatesResponse struct { |
|||
Status *commonpb.Status |
|||
States []*IndexInfo |
|||
} |
|||
``` |
|||
|
|||
- _GetIndexFilePaths_ |
|||
|
|||
```go |
|||
type GetIndexFilePathsRequest struct { |
|||
IndexBuildIDs []UniqueID |
|||
} |
|||
|
|||
type IndexFilePathInfo struct { |
|||
Status *commonpb.Status |
|||
IndexBuildID UniqueID |
|||
IndexFilePaths []string |
|||
} |
|||
|
|||
type GetIndexFilePathsResponse struct { |
|||
Status *commonpb.Status |
|||
FilePaths []*IndexFilePathInfo |
|||
} |
|||
|
|||
``` |
|||
|
|||
- _NotifyBuildIndex_ |
|||
|
|||
```go |
|||
type NotifyBuildIndexRequest struct { |
|||
Status *commonpb.Status |
|||
IndexBuildID UniqueID |
|||
IndexFilePaths []string |
|||
NodeID UniqueID |
|||
} |
|||
``` |
|||
|
|||
#### 3.3 Index Node Interface |
|||
|
|||
```go |
|||
type IndexNode interface { |
|||
Component |
|||
// TimeTickProvider is the interface all services implement |
|||
TimeTickProvider |
|||
|
|||
// CreateIndex receives requests from IndexCoordinator to build an index. |
|||
// Index building is asynchronous, so when an index building request comes, IndexNode records the task and returns. |
|||
BuildIndex(ctx context.Context, req *indexpb.BuildIndexRequest) (*commonpb.Status, error) |
|||
// GetMetrics gets the metrics about IndexNode. |
|||
DropIndex(ctx context.Context, req *indexpb.DropIndexRequest) (*commonpb.Status, error) |
|||
} |
|||
``` |
|||
|
|||
- _BuildIndex_ |
|||
|
|||
```go |
|||
|
|||
type KeyValuePair struct { |
|||
Key string |
|||
Value string |
|||
} |
|||
|
|||
type BuildIndexRequest struct { |
|||
IndexBuildID UniqueID |
|||
IndexName string |
|||
IndexID UniqueID |
|||
DataPaths []string |
|||
TypeParams []*commonpb.KeyValuePair |
|||
IndexParams []*commonpb.KeyValuePair |
|||
} |
|||
``` |
|||
|
|||
- _DropIndex_ |
|||
|
|||
```go |
|||
type DropIndexRequest struct { |
|||
IndexID UniqueID |
|||
} |
|||
``` |
|||
## 8. Message Stream |
|||
|
|||
// TODO remove? |
|||
|
|||
#### 8.2 Message Stream Service API |
|||
|
|||
```go |
|||
type Client interface { |
|||
CreateChannels(req CreateChannelRequest) (CreateChannelResponse, error) |
|||
DestroyChannels(req DestroyChannelRequest) error |
|||
DescribeChannels(req DescribeChannelRequest) (DescribeChannelResponse, error) |
|||
} |
|||
``` |
|||
|
|||
- _CreateChannels_ |
|||
|
|||
```go |
|||
type OwnerDescription struct { |
|||
Role string |
|||
Address string |
|||
//Token string |
|||
DescriptionText string |
|||
} |
|||
|
|||
type CreateChannelRequest struct { |
|||
OwnerDescription OwnerDescription |
|||
NumChannels int |
|||
} |
|||
|
|||
type CreateChannelResponse struct { |
|||
ChannelNames []string |
|||
} |
|||
``` |
|||
|
|||
- _DestroyChannels_ |
|||
|
|||
```go |
|||
type DestroyChannelRequest struct { |
|||
ChannelNames []string |
|||
} |
|||
``` |
|||
|
|||
- _DescribeChannels_ |
|||
|
|||
```go |
|||
type DescribeChannelRequest struct { |
|||
ChannelNames []string |
|||
} |
|||
|
|||
type ChannelDescription struct { |
|||
ChannelName string |
|||
Owner OwnerDescription |
|||
} |
|||
|
|||
type DescribeChannelResponse struct { |
|||
Descriptions []ChannelDescription |
|||
} |
|||
``` |
|||
|
|||
#### A.3 Message Stream |
|||
|
|||
- Overview |
|||
|
|||
<img src="./figs/msg_stream_input_output.jpeg" width=700> |
|||
|
|||
- Interface |
|||
|
|||
```go |
|||
// Msg |
|||
|
|||
type MsgType uint32 |
|||
const ( |
|||
MsgType_Undefined MsgType = 0 |
|||
// DEFINITION REQUESTS: COLLECTION |
|||
MsgType_CreateCollection MsgType = 100 |
|||
MsgType_DropCollection MsgType = 101 |
|||
MsgType_HasCollection MsgType = 102 |
|||
MsgType_DescribeCollection MsgType = 103 |
|||
MsgType_ShowCollections MsgType = 104 |
|||
MsgType_GetSystemConfigs MsgType = 105 |
|||
MsgType_LoadCollection MsgType = 106 |
|||
MsgType_ReleaseCollection MsgType = 107 |
|||
MsgType_CreateAlias MsgType = 108 |
|||
MsgType_DropAlias MsgType = 109 |
|||
MsgType_AlterAlias MsgType = 110 |
|||
// DEFINITION REQUESTS: PARTITION |
|||
MsgType_CreatePartition MsgType = 200 |
|||
MsgType_DropPartition MsgType = 201 |
|||
MsgType_HasPartition MsgType = 202 |
|||
MsgType_DescribePartition MsgType = 203 |
|||
MsgType_ShowPartitions MsgType = 204 |
|||
MsgType_LoadPartitions MsgType = 205 |
|||
MsgType_ReleasePartitions MsgType = 206 |
|||
// DEFINE REQUESTS: SEGMENT |
|||
MsgType_ShowSegments MsgType = 250 |
|||
MsgType_DescribeSegment MsgType = 251 |
|||
MsgType_LoadSegments MsgType = 252 |
|||
MsgType_ReleaseSegments MsgType = 253 |
|||
MsgType_HandoffSegments MsgType = 254 |
|||
MsgType_LoadBalanceSegments MsgType = 255 |
|||
// DEFINITION REQUESTS: INDEX |
|||
MsgType_CreateIndex MsgType = 300 |
|||
MsgType_DescribeIndex MsgType = 301 |
|||
MsgType_DropIndex MsgType = 302 |
|||
// MANIPULATION REQUESTS |
|||
MsgType_Insert MsgType = 400 |
|||
MsgType_Delete MsgType = 401 |
|||
MsgType_Flush MsgType = 402 |
|||
// QUERY |
|||
MsgType_Search MsgType = 500 |
|||
MsgType_SearchResult MsgType = 501 |
|||
MsgType_GetIndexState MsgType = 502 |
|||
MsgType_GetIndexBuildProgress MsgType = 503 |
|||
MsgType_GetCollectionStatistics MsgType = 504 |
|||
MsgType_GetPartitionStatistics MsgType = 505 |
|||
MsgType_Retrieve MsgType = 506 |
|||
MsgType_RetrieveResult MsgType = 507 |
|||
MsgType_WatchDmChannels MsgType = 508 |
|||
MsgType_RemoveDmChannels MsgType = 509 |
|||
MsgType_WatchQueryChannels MsgType = 510 |
|||
MsgType_RemoveQueryChannels MsgType = 511 |
|||
// DATA SERVICE |
|||
MsgType_SegmentInfo MsgType = 600 |
|||
MsgType_SystemInfo MsgType = 601 |
|||
// SYSTEM CONTROL |
|||
MsgType_TimeTick MsgType = 1200 |
|||
MsgType_QueryNodeStats MsgType = 1201 |
|||
MsgType_LoadIndex MsgType = 1202 |
|||
MsgType_RequestID MsgType = 1203 |
|||
MsgType_RequestTSO MsgType = 1204 |
|||
MsgType_AllocateSegment MsgType = 1205 |
|||
MsgType_SegmentStatistics MsgType = 1206 |
|||
MsgType_SegmentFlushDone MsgType = 1207 |
|||
MsgType_DataNodeTt MsgType = 1208 |
|||
) |
|||
|
|||
type MsgPosition struct{ |
|||
ChannelName string |
|||
MsgID []byte |
|||
MsgGroup string |
|||
Timestamp uint64 |
|||
} |
|||
|
|||
type MsgPack struct { |
|||
BeginTs Timestamp |
|||
EndTs Timestamp |
|||
Msgs []TsMsg |
|||
StartPositions []*MsgPosition |
|||
EndPositions []*MsgPosition |
|||
} |
|||
|
|||
type TsMsg interface { |
|||
TraceCtx() context.Context |
|||
SetTraceCtx(ctx context.Context) |
|||
ID() UniqueID |
|||
BeginTs() Timestamp |
|||
EndTs() Timestamp |
|||
Type() MsgType |
|||
SourceID() int64 |
|||
HashKeys() []uint32 |
|||
Marshal(TsMsg) (MarshalType, error) |
|||
Unmarshal(MarshalType) (TsMsg, error) |
|||
Position() *MsgPosition |
|||
SetPosition(*MsgPosition) |
|||
} |
|||
|
|||
type RepackFunc func(msgs []TsMsg, hashKeys [][]int32) (map[int32]*MsgPack, error) |
|||
``` |
|||
|
|||
```go |
|||
// Unmarshal |
|||
|
|||
// Interface |
|||
type UnmarshalFunc func(interface{}) (TsMsg, error) |
|||
|
|||
// UnmarshalDispatcher is an interface that contains method Unmarshal |
|||
type UnmarshalDispatcher interface { |
|||
Unmarshal(input interface{}, msgType commonpb.MsgType) (TsMsg, error) |
|||
AddMsgTemplate(msgType commonpb.MsgType, unmarshalFunc UnmarshalFunc) |
|||
} |
|||
|
|||
type UnmarshalDispatcherFactory interface { |
|||
NewUnmarshalDispatcher() *UnmarshalDispatcher |
|||
} |
|||
|
|||
// Proto & Mem Implementation |
|||
type ProtoUDFactory struct {} |
|||
func (pudf *ProtoUDFactory) NewUnmarshalDispatcher() *ProtoUnmarshalDispatcher |
|||
|
|||
// TODO |
|||
type MemUDFactory struct {} |
|||
func (mudf *MemUDFactory) NewUnmarshalDispatcher() *UnmarshalDispatcher |
|||
``` |
|||
|
|||
```go |
|||
// MsgStream is an interface that can be used to produce and consume messages on a message queue
|||
|
|||
// Interface |
|||
type MsgStream interface { |
|||
Start() |
|||
Close() |
|||
Chan() <-chan *MsgPack |
|||
AsProducer(channels []string) |
|||
AsConsumer(channels []string, subName string) |
|||
SetRepackFunc(repackFunc RepackFunc) |
|||
ComputeProduceChannelIndexes(tsMsgs []TsMsg) [][]int32 |
|||
GetProduceChannels() []string |
|||
Produce(*MsgPack) error |
|||
Broadcast(*MsgPack) error |
|||
BroadcastMark(*MsgPack) (map[string][]MessageID, error) |
|||
Consume() *MsgPack |
|||
Seek(offset []*MsgPosition) error |
|||
} |
|||
|
|||
type Factory interface { |
|||
Init(params *paramtable.ComponentParam) error |
|||
NewMsgStream(ctx context.Context) (MsgStream, error) |
|||
NewTtMsgStream(ctx context.Context) (MsgStream, error) |
|||
} |
|||
|
|||
// Pulsar |
|||
type PmsFactory struct { |
|||
dispatcherFactory ProtoUDFactory |
|||
// the following members must be public, so that mapstructure.Decode() can access them |
|||
PulsarAddress string |
|||
ReceiveBufSize int64 |
|||
PulsarBufSize int64 |
|||
} |
|||
|
|||
// RmsFactory |
|||
type RmsFactory struct { |
|||
dispatcherFactory ProtoUDFactory |
|||
ReceiveBufSize int64 |
|||
RmqBufSize int64 |
|||
} |
|||
``` |
|||
|
|||
```go
|||
|
|||
// mqMsgStream |
|||
type mqMsgStream struct { |
|||
ctx context.Context |
|||
client mqclient.Client |
|||
producers map[string]mqclient.Producer |
|||
producerChannels []string |
|||
consumers map[string]mqclient.Consumer |
|||
consumerChannels []string |
|||
repackFunc RepackFunc |
|||
unmarshal UnmarshalDispatcher |
|||
receiveBuf chan *MsgPack |
|||
wait *sync.WaitGroup |
|||
streamCancel func() |
|||
bufSize int64 |
|||
producerLock *sync.Mutex |
|||
consumerLock *sync.Mutex |
|||
} |
|||
|
|||
|
|||
|
|||
#### A.4 RocksMQ |
|||
|
|||
RocksMQ is a RocksDB-based messaging/streaming library. |
|||
|
|||
```go
|||
// All the following UniqueIDs are 64-bit integers, composed of a timestamp and an increasing counter
|||
|
|||
type ProducerMessage struct { |
|||
payload []byte |
|||
} |
|||
|
|||
type ConsumerMessage struct { |
|||
msgID UniqueID |
|||
payload []byte |
|||
} |
|||
|
|||
type IDAllocator interface { |
|||
Alloc(count uint32) (UniqueID, UniqueID, error) |
|||
AllocOne() (UniqueID, error) |
|||
UpdateID() error |
|||
} |
|||
|
|||
// Every collection has its RocksMQ |
|||
type RocksMQ struct { |
|||
store *gorocksdb.DB |
|||
kv kv.Base |
|||
idAllocator IDAllocator |
|||
produceMu sync.Mutex |
|||
consumeMu sync.Mutex |
|||
} |
|||
|
|||
func (rmq *RocksMQ) CreateChannel(channelName string) error |
|||
func (rmq *RocksMQ) DestroyChannel(channelName string) error |
|||
func (rmq *RocksMQ) CreateConsumerGroup(groupName string) error |
|||
func (rmq *RocksMQ) DestroyConsumerGroup(groupName string) error |
|||
func (rmq *RocksMQ) Produce(channelName string, messages []ProducerMessage) error |
|||
func (rmq *RocksMQ) Consume(groupName string, channelName string, n int) ([]ConsumerMessage, error) |
|||
func (rmq *RocksMQ) Seek(groupName string, channelName string, msgID MessageID) error |
|||
|
|||
func NewRocksMQ(name string, idAllocator IDAllocator) (*RocksMQ, error) |
|||
```
|||
|
|||
##### A.4.1 Meta (stored in etcd) |
|||
|
|||
```go |
|||
// channel meta |
|||
"$(channel_name)/begin_id", UniqueID |
|||
"$(channel_name)/end_id", UniqueID |
|||
|
|||
// consumer group meta |
|||
"$(group_name)/$(channel_name)/current_id", UniqueID |
|||
``` |
|||
|
|||
##### A.4.2 Data (stored in RocksDB) |
|||
|
|||
- data |
|||
|
|||
```go |
|||
"$(channel_name)/$(unique_id)", []byte |
|||
``` |
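
To make the key layout concrete, here is a minimal, illustrative sketch (not the actual RocksMQ code) of how the produce path could map each message onto the `"$(channel_name)/$(unique_id)"` layout; the `kvStore` interface, the `produce` helper, and the `UniqueID` alias are assumptions introduced for this example.

```go
import "fmt"

type UniqueID = int64

// kvStore is a stand-in for the underlying RocksDB handle used in this sketch.
type kvStore interface {
	Put(key string, value []byte) error
}

// produce assigns one UniqueID per payload (ids are assumed to be contiguous,
// e.g. the range returned by IDAllocator.Alloc) and stores every payload under
// the "$(channel_name)/$(unique_id)" key.
func produce(store kvStore, channelName string, idBegin UniqueID, payloads [][]byte) error {
	for i, payload := range payloads {
		key := fmt.Sprintf("%s/%d", channelName, idBegin+UniqueID(i))
		if err := store.Put(key, payload); err != nil {
			return err
		}
	}
	return nil
}
```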
|||
## 5. Proxy |
|||
|
|||
<img src="./figs/proxy.png" width=700> |
|||
|
|||
#### 5.0 Proxy Service Interface |
|||
|
|||
```go |
|||
type ProxyService interface { |
|||
Component |
|||
TimeTickProvider |
|||
|
|||
RegisterNode(ctx context.Context, request *proxypb.RegisterNodeRequest) (*proxypb.RegisterNodeResponse, error) |
|||
InvalidateCollectionMetaCache(ctx context.Context, request *proxypb.InvalidateCollMetaCacheRequest) (*commonpb.Status, error) |
|||
} |
|||
``` |
|||
|
|||
- _MsgBase_ |
|||
|
|||
```go |
|||
|
|||
type MsgBase struct { |
|||
MsgType MsgType |
|||
MsgID UniqueID |
|||
Timestamp uint64 |
|||
SourceID UniqueID |
|||
} |
|||
``` |
|||
|
|||
- _RegisterNode_ |
|||
|
|||
```go |
|||
type Address struct { |
|||
Ip string |
|||
Port int64 |
|||
} |
|||
|
|||
type RegisterNodeRequest struct { |
|||
Base *commonpb.MsgBase |
|||
Address string |
|||
Port int64 |
|||
} |
|||
|
|||
type InitParams struct { |
|||
NodeID UniqueID |
|||
StartParams []*commonpb.KeyValuePair |
|||
} |
|||
|
|||
type RegisterNodeResponse struct { |
|||
InitParams *internalpb.InitParams |
|||
Status *commonpb.Status |
|||
} |
|||
``` |
|||
|
|||
- _InvalidateCollectionMetaCache_ |
|||
|
|||
```go |
|||
type InvalidateCollMetaCacheRequest struct { |
|||
Base *commonpb.MsgBase |
|||
DbName string |
|||
CollectionName string |
|||
} |
|||
``` |
|||
|
|||
#### 5.1 Proxy Node Interface |
|||
|
|||
```go |
|||
type Proxy interface { |
|||
Component |
|||
|
|||
// InvalidateCollectionMetaCache notifies Proxy to clear all the meta cache of specific collection. |
|||
InvalidateCollectionMetaCache(ctx context.Context, request *proxypb.InvalidateCollMetaCacheRequest) (*commonpb.Status, error) |
|||
} |
|||
``` |
|||
|
|||
- _InvalidateCollectionMetaCache_ |
|||
|
|||
```go |
|||
type InvalidateCollMetaCacheRequest struct { |
|||
Base *commonpb.MsgBase |
|||
DbName string |
|||
CollectionName string |
|||
} |
|||
``` |
|||
|
|||
#### 5.2 Milvus Service Interface |
|||
|
|||
Proxy also implements the MilvusService interface to receive gRPC calls from clients.
|||
|
|||
```go |
|||
type MilvusService interface { |
|||
// CreateCollection creates a collection |
|||
CreateCollection(ctx context.Context, request *milvuspb.CreateCollectionRequest) (*commonpb.Status, error) |
|||
DropCollection(ctx context.Context, request *milvuspb.DropCollectionRequest) (*commonpb.Status, error) |
|||
HasCollection(ctx context.Context, request *milvuspb.HasCollectionRequest) (*milvuspb.BoolResponse, error) |
|||
LoadCollection(ctx context.Context, request *milvuspb.LoadCollectionRequest) (*commonpb.Status, error) |
|||
ReleaseCollection(ctx context.Context, request *milvuspb.ReleaseCollectionRequest) (*commonpb.Status, error) |
|||
DescribeCollection(ctx context.Context, request *milvuspb.DescribeCollectionRequest) (*milvuspb.DescribeCollectionResponse, error) |
|||
GetCollectionStatistics(ctx context.Context, request *milvuspb.CollectionStatsRequest) (*milvuspb.CollectionStatsResponse, error) |
|||
ShowCollections(ctx context.Context, request *milvuspb.ShowCollectionRequest) (*milvuspb.ShowCollectionResponse, error) |
|||
|
|||
CreateAlias(ctx context.Context, request *milvuspb.CreateAliasRequest) (*commonpb.Status, error) |
|||
DropAlias(ctx context.Context, request *milvuspb.DropAliasRequest) (*commonpb.Status, error) |
|||
AlterAlias(ctx context.Context, request *milvuspb.AlterAliasRequest) (*commonpb.Status, error) |
|||
|
|||
CreatePartition(ctx context.Context, request *milvuspb.CreatePartitionRequest) (*commonpb.Status, error) |
|||
DropPartition(ctx context.Context, request *milvuspb.DropPartitionRequest) (*commonpb.Status, error) |
|||
HasPartition(ctx context.Context, request *milvuspb.HasPartitionRequest) (*milvuspb.BoolResponse, error) |
|||
LoadPartitions(ctx context.Context, request *milvuspb.LoadPartitionRequest) (*commonpb.Status, error) |
|||
ReleasePartitions(ctx context.Context, request *milvuspb.ReleasePartitionRequest) (*commonpb.Status, error) |
|||
GetPartitionStatistics(ctx context.Context, request *milvuspb.PartitionStatsRequest) (*milvuspb.PartitionStatsResponse, error) |
|||
ShowPartitions(ctx context.Context, request *milvuspb.ShowPartitionRequest) (*milvuspb.ShowPartitionResponse, error) |
|||
|
|||
CreateIndex(ctx context.Context, request *milvuspb.CreateIndexRequest) (*commonpb.Status, error) |
|||
DescribeIndex(ctx context.Context, request *milvuspb.DescribeIndexRequest) (*milvuspb.DescribeIndexResponse, error) |
|||
GetIndexState(ctx context.Context, request *milvuspb.IndexStateRequest) (*milvuspb.IndexStateResponse, error) |
|||
DropIndex(ctx context.Context, request *milvuspb.DropIndexRequest) (*commonpb.Status, error) |
|||
|
|||
Insert(ctx context.Context, request *milvuspb.InsertRequest) (*milvuspb.InsertResponse, error) |
|||
Search(ctx context.Context, request *milvuspb.SearchRequest) (*milvuspb.SearchResults, error) |
|||
Flush(ctx context.Context, request *milvuspb.FlushRequest) (*commonpb.Status, error) |
|||
|
|||
GetDdChannel(ctx context.Context, request *commonpb.Empty) (*milvuspb.StringResponse, error) |
|||
|
|||
GetQuerySegmentInfo(ctx context.Context, req *milvuspb.QuerySegmentInfoRequest) (*milvuspb.QuerySegmentInfoResponse, error) |
|||
GetPersistentSegmentInfo(ctx context.Context, req *milvuspb.PersistentSegmentInfoRequest) (*milvuspb.PersistentSegmentInfoResponse, error) |
|||
}
|||
``` |
|||
|
|||
- _CreateCollection_ |
|||
|
|||
See _Master API_ for detailed definitions. |
|||
|
|||
- _DropCollection_ |
|||
|
|||
See _Master API_ for detailed definitions. |
|||
|
|||
- _HasCollection_ |
|||
|
|||
See _Master API_ for detailed definitions. |
|||
|
|||
- _LoadCollection_ |
|||
|
|||
```go |
|||
type LoadCollectionRequest struct { |
|||
Base *commonpb.MsgBase |
|||
DbName string |
|||
CollectionName string |
|||
} |
|||
``` |
|||
|
|||
- _ReleaseCollection_ |
|||
|
|||
```go |
|||
type ReleaseCollectionRequest struct { |
|||
Base *commonpb.MsgBase |
|||
DbName string |
|||
CollectionName string |
|||
} |
|||
``` |
|||
|
|||
- _DescribeCollection_ |
|||
|
|||
```go |
|||
type DescribeCollectionRequest struct { |
|||
Base *commonpb.MsgBase |
|||
DbName string |
|||
CollectionName string |
|||
CollectionID int64 |
|||
TimeStamp uint64 |
|||
} |
|||
``` |
|||
|
|||
See _Master API_ for detailed definitions. |
|||
|
|||
- _GetCollectionStatistics_
|||
|
|||
```go |
|||
type GetCollectionStatisticsRequest struct { |
|||
Base *commonpb.MsgBase |
|||
DbName string |
|||
CollectionName string |
|||
} |
|||
``` |
|||
|
|||
See _Master API_ for detailed definitions. |
|||
|
|||
- _ShowCollections_ |
|||
|
|||
See _Master API_ for detailed definitions. |
|||
|
|||
- _CreateAlias_ |
|||
|
|||
See _Master API_ for detailed definitions. |
|||
|
|||
- _DropAlias_ |
|||
|
|||
See _Master API_ for detailed definitions. |
|||
|
|||
- _AlterAlias_ |
|||
|
|||
See _Master API_ for detailed definitions. |
|||
|
|||
- _CreatePartition_ |
|||
|
|||
See _Master API_ for detailed definitions. |
|||
|
|||
- _DropPartition_ |
|||
|
|||
See _Master API_ for detailed definitions. |
|||
|
|||
- _HasPartition_ |
|||
|
|||
See _Master API_ for detailed definitions. |
|||
|
|||
- _LoadPartitions_ |
|||
|
|||
```go |
|||
type CollectionSchema struct { |
|||
Name string |
|||
Description string |
|||
AutoID bool |
|||
Fields []*FieldSchema |
|||
} |
|||
|
|||
type LoadPartitionRequest struct { |
|||
Base *commonpb.MsgBase |
|||
DbID UniqueID |
|||
CollectionID UniqueID |
|||
PartitionIDs []UniqueID |
|||
Schema *schemapb.CollectionSchema |
|||
} |
|||
``` |
|||
|
|||
- _ReleasePartitions_ |
|||
|
|||
```go |
|||
type ReleasePartitionRequest struct { |
|||
Base *commonpb.MsgBase |
|||
DbName string |
|||
CollectionName string |
|||
PartitionNames []string |
|||
} |
|||
``` |
|||
|
|||
- _GetPartitionStatistics_ |
|||
|
|||
See _Master API_ for detailed definitions. |
|||
|
|||
- _ShowPartitions_ |
|||
|
|||
See _Master API_ for detailed definitions. |
|||
|
|||
- _CreateIndex_ |
|||
|
|||
See _Master API_ for detailed definitions. |
|||
|
|||
- _DescribeIndex_ |
|||
|
|||
See _Master API_ for detailed definitions. |
|||
|
|||
- _DropIndex_ |
|||
|
|||
See _Master API_ for detailed definitions. |
|||
|
|||
- _Insert_ |
|||
|
|||
```go |
|||
type InsertRequest struct { |
|||
Base *commonpb.MsgBase |
|||
DbName string |
|||
CollectionName string |
|||
PartitionName string |
|||
RowData []Blob |
|||
HashKeys []uint32 |
|||
} |
|||
|
|||
type InsertResponse struct { |
|||
Status *commonpb.Status |
|||
RowIDBegin int64 |
|||
RowIDEnd int64 |
|||
} |
|||
``` |
|||
|
|||
- _Search_ |
|||
|
|||
```go |
|||
type SearchRequest struct { |
|||
Base *commonpb.MsgBase |
|||
DbName string |
|||
CollectionName string |
|||
PartitionNames []string |
|||
Dsl string |
|||
PlaceholderGroup []byte |
|||
} |
|||
|
|||
type SearchResults struct { |
|||
Status commonpb.Status |
|||
	Hits [][]byte
|||
} |
|||
``` |
|||
|
|||
- _Flush_ |
|||
|
|||
```go |
|||
type FlushRequest struct { |
|||
Base *commonpb.MsgBase |
|||
DbName string |
|||
CollectionName string |
|||
} |
|||
``` |
|||
|
|||
- _GetPersistentSegmentInfo_ |
|||
|
|||
```go |
|||
type PersistentSegmentInfoRequest struct {
|||
Base *commonpb.MsgBase |
|||
DbName string |
|||
CollectionName string |
|||
} |
|||
|
|||
type SegmentState int32 |
|||
|
|||
const ( |
|||
SegmentState_SegmentNone SegmentState = 0 |
|||
SegmentState_SegmentNotExist SegmentState = 1 |
|||
SegmentState_SegmentGrowing SegmentState = 2 |
|||
SegmentState_SegmentSealed SegmentState = 3 |
|||
SegmentState_SegmentFlushed SegmentState = 4 |
|||
) |
|||
|
|||
type PersistentSegmentInfo struct { |
|||
SegmentID UniqueID |
|||
CollectionID UniqueID |
|||
PartitionID UniqueID |
|||
OpenTime Timestamp |
|||
SealedTime Timestamp |
|||
FlushedTime Timestamp |
|||
NumRows int64 |
|||
MemSize int64 |
|||
State SegmentState |
|||
} |
|||
|
|||
type PersistentSegmentInfoResponse struct {
|||
	Infos []*milvuspb.SegmentInfo
|||
} |
|||
|
|||
``` |
|||
|
|||
#### 5.3 Proxy Instance |
|||
|
|||
```go |
|||
type Proxy struct { |
|||
ctx context.Context |
|||
cancel func() |
|||
wg sync.WaitGroup |
|||
|
|||
initParams *internalpb.InitParams |
|||
ip string |
|||
port int |
|||
|
|||
stateCode commonpb.StateCode |
|||
|
|||
rootCoordClient RootCoordClient |
|||
indexCoordClient IndexCoordClient |
|||
dataCoordClient DataCoordClient |
|||
queryCoordClient QueryCoordClient |
|||
|
|||
sched *TaskScheduler |
|||
tick *timeTick |
|||
|
|||
idAllocator *allocator.IDAllocator |
|||
tsoAllocator *allocator.TimestampAllocator |
|||
segAssigner *SegIDAssigner |
|||
|
|||
manipulationMsgStream msgstream.MsgStream |
|||
queryMsgStream msgstream.MsgStream |
|||
msFactory msgstream.Factory |
|||
|
|||
// Add callback functions at different stages |
|||
startCallbacks []func() |
|||
closeCallbacks []func() |
|||
} |
|||
|
|||
func (node *NodeImpl) Init() error |
|||
func (node *NodeImpl) Start() error |
|||
func (node *NodeImpl) Stop() error |
|||
func (node *NodeImpl) AddStartCallback(callbacks ...func()) |
|||
func (node *NodeImpl) waitForServiceReady(ctx context.Context, service Component, serviceName string) error |
|||
func (node *NodeImpl) lastTick() Timestamp |
|||
func (node *NodeImpl) AddCloseCallback(callbacks ...func()) |
|||
func (node *NodeImpl) SetRootCoordClient(cli RootCoordClient) |
|||
func (node *NodeImpl) SetIndexCoordClient(cli IndexCoordClient) |
|||
func (node *NodeImpl) SetDataCoordClient(cli DataCoordClient) |
|||
func (node *NodeImpl) SetProxyCoordClient(cli ProxyCoordClient) |
|||
func (node *NodeImpl) SetQueryCoordClient(cli QueryCoordClient) |
|||
|
|||
func NewProxyImpl(ctx context.Context, factory msgstream.Factory) (*NodeImpl, error) |
|||
``` |
|||
|
|||
#### Global Parameter Table |
|||
|
|||
```go |
|||
type GlobalParamsTable struct { |
|||
paramtable.BaseTable |
|||
|
|||
NetworkPort int |
|||
IP string |
|||
NetworkAddress string |
|||
|
|||
MasterAddress string |
|||
PulsarAddress string |
|||
RocksmqPath string |
|||
|
|||
RocksmqRetentionTimeInMinutes int64 |
|||
RocksmqRetentionSizeInMB int64 |
|||
|
|||
ProxyID UniqueID |
|||
TimeTickInterval time.Duration |
|||
InsertChannelNames []string |
|||
DeleteChannelNames []string |
|||
K2SChannelNames []string |
|||
SearchChannelNames []string |
|||
SearchResultChannelNames []string |
|||
ProxySubName string |
|||
ProxyTimeTickChannelNames []string |
|||
DataDefinitionChannelNames []string |
|||
MsgStreamTimeTickBufSize int64 |
|||
MaxNameLength int64 |
|||
MaxFieldNum int64 |
|||
MaxDimension int64 |
|||
DefaultPartitionName string |
|||
DefaultIndexName string |
|||
} |
|||
|
|||
var Params GlobalParamsTable
|||
``` |
|||
|
|||
#### 5.4 Task |
|||
|
|||
```go |
|||
type task interface { |
|||
TraceCtx() context.Context |
|||
ID() UniqueID // return ReqID |
|||
SetID(uid UniqueID) // set ReqID |
|||
Name() string |
|||
Type() commonpb.MsgType |
|||
BeginTs() Timestamp |
|||
EndTs() Timestamp |
|||
SetTs(ts Timestamp) |
|||
OnEnqueue() error |
|||
PreExecute(ctx context.Context) error |
|||
Execute(ctx context.Context) error |
|||
PostExecute(ctx context.Context) error |
|||
WaitToFinish() error |
|||
Notify(err error) |
|||
} |
|||
``` |
|||
|
|||
#### 5.5 Task Scheduler |
|||
|
|||
- Base Task Queue |
|||
|
|||
```go |
|||
type TaskQueue interface { |
|||
utChan() <-chan int |
|||
UTEmpty() bool |
|||
utFull() bool |
|||
addUnissuedTask(t task) error |
|||
FrontUnissuedTask() task |
|||
PopUnissuedTask() task |
|||
AddActiveTask(t task) |
|||
PopActiveTask(ts Timestamp) task |
|||
getTaskByReqID(reqID UniqueID) task |
|||
TaskDoneTest(ts Timestamp) bool |
|||
Enqueue(t task) error |
|||
} |
|||
|
|||
type baseTaskQueue struct { |
|||
unissuedTasks *list.List |
|||
activeTasks map[Timestamp]task |
|||
utLock sync.Mutex |
|||
atLock sync.Mutex |
|||
|
|||
maxTaskNum int64 |
|||
|
|||
utBufChan chan int |
|||
|
|||
sched *TaskScheduler |
|||
} |
|||
``` |
|||
|
|||
_addUnissuedTask(t task)_ will push a new task into _unissuedTasks_, while keeping the list ordered by timestamp.

_TaskDoneTest(ts Timestamp)_ will check both _unissuedTasks_ and _activeTasks_. If no task is found before _ts_, the function returns _true_, which indicates that all the tasks before _ts_ are completed.
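
A sketch of this check, under the assumption that _unissuedTasks_ is kept ordered by timestamp and _activeTasks_ is keyed by timestamp (an illustration of the described semantics, not the exact implementation):

```go
// TaskDoneTest returns true only if neither queue still holds a task that
// began before ts.
func (queue *baseTaskQueue) TaskDoneTest(ts Timestamp) bool {
	queue.utLock.Lock()
	defer queue.utLock.Unlock()
	for e := queue.unissuedTasks.Front(); e != nil; e = e.Next() {
		if e.Value.(task).BeginTs() < ts {
			return false
		}
	}

	queue.atLock.Lock()
	defer queue.atLock.Unlock()
	for ats := range queue.activeTasks {
		if ats < ts {
			return false
		}
	}
	return true
}
```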
|||
|
|||
- Data Definition Task Queue |
|||
|
|||
```go |
|||
type ddTaskQueue struct { |
|||
baseTaskQueue |
|||
lock sync.Mutex |
|||
} |
|||
func (queue *ddTaskQueue) Enqueue(t task) error
|||
|
|||
func newDdTaskQueue() *ddTaskQueue |
|||
``` |
|||
|
|||
Data definition tasks (e.g. _CreateCollectionTask_) will be pushed into _DdTaskQueue_. When a task is enqueued, _Enqueue(t task)_ will set _Ts_, _ReqId_, _ProxyId_, then push it into the queue. The timestamps of the enqueued tasks should be strictly monotonically increasing. Since _Enqueue(t task)_ may be called in parallel, setting the timestamp and inserting into the queue need to be done atomically, as sketched below.
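
A sketch of how the extra _lock_ in _ddTaskQueue_ could be used to keep timestamp allocation and queue insertion atomic; the `AllocOne` allocator helpers are assumptions made for this illustration:

```go
func (queue *ddTaskQueue) Enqueue(t task) error {
	// hold the DD-specific lock across allocation and insertion so that tasks
	// enter the queue in strictly increasing timestamp order
	queue.lock.Lock()
	defer queue.lock.Unlock()

	ts, err := queue.sched.tsoAllocator.AllocOne() // assumed allocator helper
	if err != nil {
		return err
	}
	t.SetTs(ts)

	reqID, err := queue.sched.idAllocator.AllocOne() // assumed allocator helper
	if err != nil {
		return err
	}
	t.SetID(reqID)
	// ProxyId would be filled from the global parameter table (omitted here)

	return queue.addUnissuedTask(t)
}
```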
|||
|
|||
- Data Manipulation Task Queue |
|||
|
|||
```go |
|||
type dmTaskQueue struct { |
|||
baseTaskQueue |
|||
} |
|||
func (queue *dmTaskQueue) Enqueue(t task) error
|||
|
|||
func newDmTaskQueue() *dmTaskQueue |
|||
``` |
|||
|
|||
Insert tasks and delete tasks will be pushed into _DmTaskQueue_. |
|||
|
|||
When an _insertTask_ is enqueued, _Enqueue(t task)_ will set _Ts_, _ReqId_, _ProxyId_, _SegIdAssigner_, _RowIdAllocator_, then push it into the queue. The _SegIdAssigner_ and _RowIdAllocator_ will later be used in the task's execution phase.
|||
|
|||
- Data Query Task Queue |
|||
|
|||
```go |
|||
type dqTaskQueue struct { |
|||
baseTaskQueue |
|||
} |
|||
func (queue *dqTaskQueue) Enqueue(t task) error
|||
|
|||
func newDqTaskQueue() *dqTaskQueue |
|||
``` |
|||
|
|||
Queries will be pushed into _DqTaskQueue_. |
|||
|
|||
- Task Scheduler |
|||
|
|||
```go |
|||
type taskScheduler struct { |
|||
DdQueue TaskQueue |
|||
DmQueue TaskQueue |
|||
DqQueue TaskQueue |
|||
|
|||
idAllocator *allocator.IDAllocator |
|||
tsoAllocator *allocator.TimestampAllocator |
|||
|
|||
wg sync.WaitGroup |
|||
ctx context.Context |
|||
cancel context.CancelFunc |
|||
|
|||
msFactory msgstream.Factory |
|||
} |
|||
|
|||
func (sched *taskScheduler) scheduleDdTask() *task |
|||
func (sched *taskScheduler) scheduleDmTask() *task |
|||
func (sched *taskScheduler) scheduleDqTask() *task |
|||
func (sched *taskScheduler) getTaskByReqID(reqID UniqueID) task
|||
func (sched *taskScheduler) processTask(t task, q TaskQueue)
|||
|
|||
func (sched *taskScheduler) Start() error |
|||
func (sched *taskScheduler) TaskDoneTest(ts Timestamp) bool |
|||
|
|||
func NewTaskScheduler(ctx context.Context, idAllocator *allocator.IDAllocator, tsoAllocator *allocator.TimestampAllocator, |
|||
	factory msgstream.Factory) (*taskScheduler, error)
|||
``` |
|||
|
|||
_scheduleDdTask()_ selects tasks in a FIFO manner, so time order is guaranteed.

The policy of _scheduleDmTask()_ should target throughput, not the tasks' time order. Note that the time order of the tasks' execution will later be guaranteed by the timestamp & time tick mechanism.

The policy of _scheduleDqTask()_ should also target throughput. In addition, it should take visibility into consideration. For example, if an insert task and a query arrive in the same time tick and the query comes after the insert, the query should be scheduled in the next tick so that it can see the insert.

_TaskDoneTest(ts Timestamp)_ will check all three task queues. If no task is found before _ts_, the function returns _true_, which indicates that all the tasks before _ts_ are completed. A sketch of the scheduling loop follows.
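
The sketch below shows one way the declared _processTask_ and a DD scheduling loop could be wired to the interfaces above (assumed implementation; the DM and DQ loops would look similar but may pop more than one task per wakeup to improve throughput):

```go
func (sched *taskScheduler) processTask(t task, q TaskQueue) {
	err := t.PreExecute(sched.ctx)
	defer func() {
		t.Notify(err) // wake up the caller blocked in WaitToFinish()
	}()
	if err != nil {
		return
	}

	q.AddActiveTask(t)
	defer q.PopActiveTask(t.EndTs())

	if err = t.Execute(sched.ctx); err != nil {
		return
	}
	err = t.PostExecute(sched.ctx)
}

func (sched *taskScheduler) definitionLoop() {
	defer sched.wg.Done()
	for {
		select {
		case <-sched.ctx.Done():
			return
		case <-sched.DdQueue.utChan(): // a new unissued task is available
			if !sched.DdQueue.UTEmpty() {
				// FIFO: always take the front unissued DD task
				t := sched.DdQueue.PopUnissuedTask()
				sched.processTask(t, sched.DdQueue)
			}
		}
	}
}
```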
|||
|
|||
- Statistics |
|||
|
|||
// TODO |
|||
|
|||
```go |
|||
// ActiveComponent interfaces |
|||
func (sched *taskScheduler) ID() string
|||
func (sched *taskScheduler) Status() Status |
|||
func (sched *taskScheduler) Clean() Status |
|||
func (sched *taskScheduler) Restart() Status |
|||
func (sched *taskScheduler) heartbeat() |
|||
|
|||
// protobuf |
|||
message taskSchedulerHeartbeat { |
|||
string id |
|||
uint64 dd_queue_length |
|||
uint64 dm_queue_length |
|||
uint64 dq_queue_length |
|||
uint64 num_dd_done |
|||
uint64 num_dm_done |
|||
uint64 num_dq_done |
|||
} |
|||
``` |
|||
|
|||
// TODO |
|||
|
|||
#### 5.6 Time Tick |
|||
|
|||
- Time Tick |
|||
|
|||
```go |
|||
type timeTick struct { |
|||
lastTick Timestamp |
|||
currentTick Timestamp |
|||
wallTick Timestamp |
|||
tickStep Timestamp |
|||
syncInterval Timestamp |
|||
|
|||
tsAllocator *TimestampAllocator |
|||
scheduler *taskScheduler |
|||
ttStream *MessageStream |
|||
|
|||
ctx context.Context |
|||
} |
|||
|
|||
func (tt *timeTick) Start() error |
|||
func (tt *timeTick) synchronize() error |
|||
|
|||
func newTimeTick(ctx context.Context, tickStep Timestamp, syncInterval Timestamp, tsAllocator *TimestampAllocator, scheduler *taskScheduler, ttStream *MessageStream) *timeTick |
|||
``` |
|||
|
|||
_Start()_ will enter a loop. On each _tickStep_, it tries to send a _TIME_TICK_ typed _TsMsg_ into _ttStream_. After each _syncInterval_, it synchronizes its _wallTick_ with _tsAllocator_ by calling _synchronize()_. When _currentTick + tickStep < wallTick_ holds, it will update _currentTick_ with _wallTick_ on the next tick; otherwise, it will update _currentTick_ with _currentTick + tickStep_. The tick-advance rule is sketched below.
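
A sketch of that rule, written against the _timeTick_ fields shown earlier (assumed wiring; message construction and stream production are elided):

```go
func (tt *timeTick) tick() {
	if tt.currentTick+tt.tickStep < tt.wallTick {
		// the wall clock has run ahead of the local ticks: jump to it
		tt.currentTick = tt.wallTick
	} else {
		tt.currentTick += tt.tickStep
	}
	tt.lastTick = tt.currentTick
	// here a TIME_TICK-typed TsMsg carrying currentTick would be produced
	// into ttStream (omitted in this sketch)
}
```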
|||
|
|||
- Statistics |
|||
|
|||
```go |
|||
// ActiveComponent interfaces |
|||
func (tt *timeTick) ID() string
|||
func (tt *timeTick) Status() Status |
|||
func (tt *timeTick) Clean() Status |
|||
func (tt *timeTick) Restart() Status |
|||
func (tt *timeTick) heartbeat() |
|||
|
|||
// protobuf |
|||
message TimeTickHeartbeat { |
|||
string id |
|||
uint64 last_tick |
|||
} |
|||
``` |
|||
## 6. Root Coordinator |
|||
|
|||
<img src="./figs/root_coord.png"> |
|||
|
|||
#### 6.1 Root Coordinator Interface |
|||
|
|||
```go |
|||
type RootCoord interface { |
|||
Component |
|||
TimeTickProvider |
|||
|
|||
// DDL request |
|||
// CreateCollection notifies RootCoord to create a collection |
|||
CreateCollection(ctx context.Context, req *milvuspb.CreateCollectionRequest) (*commonpb.Status, error) |
|||
// DropCollection notifies RootCoord to drop a collection |
|||
DropCollection(ctx context.Context, req *milvuspb.DropCollectionRequest) (*commonpb.Status, error) |
|||
// HasCollection notifies RootCoord to check a collection's existence at specified timestamp |
|||
HasCollection(ctx context.Context, req *milvuspb.HasCollectionRequest) (*milvuspb.BoolResponse, error) |
|||
// DescribeCollection notifies RootCoord to get all information about this collection at specified timestamp |
|||
DescribeCollection(ctx context.Context, req *milvuspb.DescribeCollectionRequest) (*milvuspb.DescribeCollectionResponse, error) |
|||
// ShowCollections notifies RootCoord to list all collection names and other info in database at specified timestamp |
|||
ShowCollections(ctx context.Context, req *milvuspb.ShowCollectionsRequest) (*milvuspb.ShowCollectionsResponse, error) |
|||
// CreatePartition notifies RootCoord to create a partition |
|||
CreatePartition(ctx context.Context, req *milvuspb.CreatePartitionRequest) (*commonpb.Status, error) |
|||
// DropPartition notifies RootCoord to drop a partition |
|||
DropPartition(ctx context.Context, req *milvuspb.DropPartitionRequest) (*commonpb.Status, error) |
|||
// HasPartition notifies RootCoord to check if a partition with specified name exists in the collection |
|||
HasPartition(ctx context.Context, req *milvuspb.HasPartitionRequest) (*milvuspb.BoolResponse, error) |
|||
// ShowPartitions notifies RootCoord to list all partition names and other info in the collection |
|||
ShowPartitions(ctx context.Context, req *milvuspb.ShowPartitionsRequest) (*milvuspb.ShowPartitionsResponse, error) |
|||
|
|||
//index builder service |
|||
// CreateIndex notifies RootCoord to create an index for the specified field in the collection |
|||
CreateIndex(ctx context.Context, req *milvuspb.CreateIndexRequest) (*commonpb.Status, error) |
|||
// DescribeIndex notifies RootCoord to get specified index information for specified field |
|||
DescribeIndex(ctx context.Context, req *milvuspb.DescribeIndexRequest) (*milvuspb.DescribeIndexResponse, error) |
|||
// DropIndex notifies RootCoord to drop the specified index for the specified field |
|||
DropIndex(ctx context.Context, req *milvuspb.DropIndexRequest) (*commonpb.Status, error) |
|||
|
|||
//global timestamp allocator |
|||
// AllocTimestamp notifies RootCoord to alloc timestamps |
|||
AllocTimestamp(ctx context.Context, req *rootcoordpb.AllocTimestampRequest) (*rootcoordpb.AllocTimestampResponse, error) |
|||
// AllocID notifies RootCoord to alloc IDs |
|||
AllocID(ctx context.Context, req *rootcoordpb.AllocIDRequest) (*rootcoordpb.AllocIDResponse, error) |
|||
// UpdateChannelTimeTick notifies RootCoord to update each Proxy's safe timestamp |
|||
UpdateChannelTimeTick(ctx context.Context, req *internalpb.ChannelTimeTickMsg) (*commonpb.Status, error) |
|||
|
|||
//segment |
|||
// DescribeSegment notifies RootCoord to get specified segment information in the collection |
|||
DescribeSegment(ctx context.Context, req *milvuspb.DescribeSegmentRequest) (*milvuspb.DescribeSegmentResponse, error) |
|||
// ShowSegments notifies RootCoord to list all segment ids in the collection or partition |
|||
ShowSegments(ctx context.Context, req *milvuspb.ShowSegmentsRequest) (*milvuspb.ShowSegmentsResponse, error) |
|||
// ReleaseDQLMessageStream notifies RootCoord to release and close the search message stream of specific collection. |
|||
ReleaseDQLMessageStream(ctx context.Context, in *proxypb.ReleaseDQLMessageStreamRequest) (*commonpb.Status, error) |
|||
|
|||
// SegmentFlushCompleted notifies RootCoord that specified segment has been flushed |
|||
SegmentFlushCompleted(ctx context.Context, in *datapb.SegmentFlushCompletedMsg) (*commonpb.Status, error) |
|||
// GetMetrics notifies RootCoord to collect metrics for specified component |
|||
GetMetrics(ctx context.Context, req *milvuspb.GetMetricsRequest) (*milvuspb.GetMetricsResponse, error) |
|||
} |
|||
``` |
|||
|
|||
- _MsgBase_ |
|||
|
|||
```go |
|||
type MsgBase struct { |
|||
MsgType MsgType |
|||
MsgID UniqueID |
|||
Timestamp Timestamp |
|||
SourceID UniqueID |
|||
} |
|||
``` |
|||
|
|||
- _CreateCollection_ |
|||
|
|||
<img src="./figs/root_coord_create_collection.png"> |
|||
|
|||
```go |
|||
type CreateCollectionRequest struct { |
|||
Base *commonpb.MsgBase |
|||
DbName string |
|||
CollectionName string |
|||
Schema []byte |
|||
ShardsNum int32 |
|||
} |
|||
``` |
|||
|
|||
- _DropCollection_ |
|||
|
|||
```go |
|||
type DropCollectionRequest struct { |
|||
Base *commonpb.MsgBase |
|||
DbName string |
|||
CollectionName string |
|||
} |
|||
``` |
|||
|
|||
- _HasCollection_ |
|||
|
|||
```go |
|||
type HasCollectionRequest struct { |
|||
Base *commonpb.MsgBase |
|||
DbName string |
|||
CollectionName string |
|||
TimeStamp Timestamp |
|||
} |
|||
``` |
|||
|
|||
- _DescribeCollection_ |
|||
|
|||
```go |
|||
type DescribeCollectionRequest struct { |
|||
Base *commonpb.MsgBase |
|||
DbName string |
|||
CollectionName string |
|||
CollectionID UniqueID |
|||
TimeStamp Timestamp |
|||
} |
|||
|
|||
type CollectionSchema struct { |
|||
Name string |
|||
Description string |
|||
AutoID bool |
|||
Fields []*FieldSchema |
|||
} |
|||
|
|||
type DescribeCollectionResponse struct { |
|||
Status *commonpb.Status |
|||
Schema *schemapb.CollectionSchema |
|||
CollectionID UniqueID |
|||
} |
|||
``` |
|||
|
|||
- _ShowCollections_ |
|||
|
|||
```go |
|||
type ShowCollectionsRequest struct { |
|||
Base *commonpb.MsgBase |
|||
DbName string |
|||
Timestamp Timestamp |
|||
Type ShowCollectionsType |
|||
} |
|||
|
|||
type ShowCollectionResponse struct { |
|||
Status *commonpb.Status |
|||
CollectionNames []string |
|||
CollectionIds []UniqueID |
|||
} |
|||
``` |
|||
|
|||
- _CreatePartition_ |
|||
|
|||
```go |
|||
type CreatePartitionRequest struct { |
|||
Base *commonpb.MsgBase |
|||
DbName string |
|||
CollectionName string |
|||
PartitionName string |
|||
} |
|||
``` |
|||
|
|||
- _DropPartition_ |
|||
|
|||
```go |
|||
type DropPartitionRequest struct { |
|||
Base *commonpb.MsgBase |
|||
DbName string |
|||
CollectionName string |
|||
PartitionName string |
|||
} |
|||
``` |
|||
|
|||
- _HasPartition_ |
|||
|
|||
```go |
|||
type HasPartitionRequest struct { |
|||
Base *commonpb.MsgBase |
|||
DbName string |
|||
CollectionName string |
|||
PartitionName string |
|||
} |
|||
``` |
|||
|
|||
- _ShowPartitions_ |
|||
|
|||
```go |
|||
type ShowPartitionRequest struct { |
|||
Base *commonpb.MsgBase |
|||
DbName string |
|||
CollectionName string |
|||
CollectionID UniqueID |
|||
} |
|||
|
|||
type ShowPartitionResponse struct { |
|||
Status *commonpb.Status |
|||
PartitionNames []string |
|||
PartitionIDs []UniqueID |
|||
} |
|||
``` |
|||
|
|||
- _DescribeSegment_ |
|||
|
|||
```go |
|||
type DescribeSegmentRequest struct { |
|||
Base *commonpb.MsgBase |
|||
CollectionID UniqueID |
|||
SegmentID UniqueID |
|||
} |
|||
|
|||
type DescribeSegmentResponse struct { |
|||
Status *commonpb.Status |
|||
IndexID UniqueID |
|||
BuildID UniqueID |
|||
EnableIndex bool |
|||
} |
|||
``` |
|||
|
|||
- _ShowSegments_ |
|||
|
|||
```go |
|||
type ShowSegmentsRequest struct { |
|||
Base *commonpb.MsgBase |
|||
CollectionID UniqueID |
|||
PartitionID UniqueID |
|||
} |
|||
|
|||
type ShowSegmentsResponse struct { |
|||
Status *commonpb.Status |
|||
SegmentIDs []UniqueID |
|||
} |
|||
``` |
|||
|
|||
- _ReleaseDQLMessageStream_ |
|||
|
|||
```go |
|||
type ReleaseDQLMessageStreamRequest struct { |
|||
Base *commonpb.MsgBase |
|||
DbID UniqueID |
|||
CollectionID UniqueID |
|||
} |
|||
|
|||
``` |
|||
|
|||
- _CreateIndex_ |
|||
<img src="./figs/root_coord_create_index.png"> |
|||
|
|||
```go |
|||
type CreateIndexRequest struct { |
|||
Base *commonpb.MsgBase |
|||
DbName string |
|||
CollectionName string |
|||
FieldName string |
|||
ExtraParams []*commonpb.KeyValuePair |
|||
} |
|||
``` |
|||
|
|||
- _DescribeIndex_ |
|||
|
|||
```go |
|||
type DescribeIndexRequest struct { |
|||
Base *commonpb.MsgBase |
|||
DbName string |
|||
CollectionName string |
|||
FieldName string |
|||
IndexName string |
|||
} |
|||
|
|||
type IndexDescription struct { |
|||
IndexName string |
|||
IndexID UniqueID |
|||
Params []*commonpb.KeyValuePair |
|||
FieldName string |
|||
} |
|||
|
|||
type DescribeIndexResponse struct { |
|||
Status *commonpb.Status |
|||
IndexDescriptions []*IndexDescription |
|||
} |
|||
``` |
|||
|
|||
- _DropIndex_ |
|||
|
|||
```go |
|||
type DropIndexRequest struct { |
|||
Base *commonpb.MsgBase |
|||
DbName string |
|||
CollectionName string |
|||
FieldName string |
|||
IndexName string |
|||
} |
|||
``` |
|||
|
|||
- _AllocTimestamp_ |
|||
|
|||
```go |
|||
type AllocTimestampRequest struct { |
|||
Base *commonpb.MsgBase |
|||
Count uint32 |
|||
} |
|||
|
|||
type AllocTimestampResponse struct { |
|||
Status *commonpb.Status |
|||
Timestamp UniqueID |
|||
Count uint32 |
|||
} |
|||
``` |
|||
|
|||
- _AllocID_ |
|||
|
|||
```go |
|||
type AllocIDRequest struct { |
|||
Base *commonpb.MsgBase |
|||
Count uint32 |
|||
} |
|||
|
|||
type AllocIDResponse struct { |
|||
Status *commonpb.Status |
|||
ID UniqueID |
|||
Count uint32 |
|||
} |
|||
``` |
|||
|
|||
- _UpdateChannelTimeTick_ |
|||
|
|||
```go |
|||
type ChannelTimeTickMsg struct { |
|||
Base *commonpb.MsgBase |
|||
ChannelNames []string |
|||
Timestamps []Timestamp |
|||
DefaultTimestamp Timestamp |
|||
} |
|||
``` |
|||
|
|||
#### 6.2 Dd (Data definitions) Message |
|||
|
|||
`RootCoord` would put `Dd Message` into the `DML MsgStreams`
|||
|
|||
- _BaseMsg_ |
|||
|
|||
```go |
|||
type BaseMsg struct { |
|||
Ctx context.Context |
|||
BeginTimestamp Timestamp |
|||
EndTimestamp Timestamp |
|||
HashValues []uint32 |
|||
MsgPosition *MsgPosition |
|||
} |
|||
``` |
|||
|
|||
- _CreateCollectionMsg_ |
|||
|
|||
```go |
|||
type CreateCollectionRequest struct { |
|||
Base *commonpb.MsgBase |
|||
DbName string |
|||
CollectionName string |
|||
DbID UniqueID |
|||
CollectionID UniqueID |
|||
Schema []byte |
|||
VirtualChannelNames []string |
|||
PhysicalChannelNames []string |
|||
} |
|||
|
|||
type CreateCollectionMsg struct { |
|||
BaseMsg |
|||
msgpb.CreateCollectionRequest |
|||
} |
|||
``` |
|||
|
|||
- _DropCollectionMsg_ |
|||
|
|||
```go |
|||
type DropCollectionRequest struct { |
|||
Base *commonpb.MsgBase |
|||
DbName string |
|||
CollectionName string |
|||
DbID UniqueID |
|||
CollectionID UniqueID |
|||
} |
|||
|
|||
type DropCollectionMsg struct { |
|||
BaseMsg |
|||
DropCollectionRequest |
|||
} |
|||
``` |
|||
|
|||
- _CreatePartitionMsg_ |
|||
|
|||
```go |
|||
type CreatePartitionRequest struct { |
|||
Base *commonpb.MsgBase |
|||
DbName string |
|||
CollectionName string |
|||
PartitionName string |
|||
DbID UniqueID |
|||
CollectionID UniqueID |
|||
PartitionID UniqueID |
|||
} |
|||
|
|||
type CreatePartitionMsg struct { |
|||
BaseMsg |
|||
CreatePartitionRequest |
|||
} |
|||
``` |
|||
|
|||
- _DropPartitionMsg_ |
|||
|
|||
```go |
|||
type DropPartitionRequest struct { |
|||
Base *commonpb.MsgBase |
|||
DbName string |
|||
CollectionName string |
|||
PartitionName string |
|||
DbID UniqueID |
|||
CollectionID UniqueID |
|||
PartitionID UniqueID |
|||
} |
|||
|
|||
type DropPartitionMsg struct { |
|||
BaseMsg |
|||
DropPartitionRequest |
|||
} |
|||
``` |
|||
|
|||
#### 6.3 Create Index automatically |
|||
|
|||
`RootCoord` would notify `IndexCoord (Index Coordinator)` to build the index automatically once a segment has been flushed.
|||
<img src="./figs/root_coord_create_index_automatically.png"> |
|||
|
|||
#### 6.4 RootCoord Instance |
|||
|
|||
```go |
|||
type Core struct { |
|||
MetaTable *metaTable |
|||
//id allocator |
|||
IDAllocator func(count uint32) (typeutil.UniqueID, typeutil.UniqueID, error) |
|||
IDAllocatorUpdate func() error |
|||
|
|||
//tso allocator |
|||
TSOAllocator func(count uint32) (typeutil.Timestamp, error) |
|||
TSOAllocatorUpdate func() error |
|||
|
|||
//inner members |
|||
ctx context.Context |
|||
cancel context.CancelFunc |
|||
etcdCli *clientv3.Client |
|||
kvBase *etcdkv.etcdKV |
|||
|
|||
//setMsgStreams, send time tick into dd channel and time tick channel |
|||
SendTimeTick func(t typeutil.Timestamp) error |
|||
|
|||
//setMsgStreams, send create collection into dd channel |
|||
SendDdCreateCollectionReq func(ctx context.Context, req *msgpb.CreateCollectionRequest, channelNames []string) error |
|||
|
|||
//setMsgStreams, send drop collection into dd channel, and notify the proxy to delete this collection |
|||
SendDdDropCollectionReq func(ctx context.Context, req *msgpb.DropCollectionRequest, channelNames []string) error |
|||
|
|||
//setMsgStreams, send create partition into dd channel |
|||
SendDdCreatePartitionReq func(ctx context.Context, req *msgpb.CreatePartitionRequest, channelNames []string) error |
|||
|
|||
//setMsgStreams, send drop partition into dd channel |
|||
SendDdDropPartitionReq func(ctx context.Context, req *msgpb.DropPartitionRequest, channelNames []string) error |
|||
|
|||
// if RootCoord create segment, DataCoord will put segment msg into this channel |
|||
DataCoordSegmentChan <-chan *ms.MsgPack |
|||
|
|||
// if segment flush completed, DataNode would put segment msg into this channel |
|||
DataNodeFlushedSegmentChan <-chan *ms.MsgPack |
|||
|
|||
//get binlog file path from data service, |
|||
CallGetBinlogFilePathsService func(segID typeutil.UniqueID, fieldID typeutil.UniqueID) ([]string, error) |
|||
CallGetNumRowsService func(segID typeutil.UniqueID, isFromFlushedChan bool) (int64, error) |
|||
|
|||
//call index builder's client to build index, return build id |
|||
CallBuildIndexService func(ctx context.Context, binlog []string, field *schemapb.FieldSchema, idxInfo *etcdpb.IndexInfo) (typeutil.UniqueID, error) |
|||
CallDropIndexService func(ctx context.Context, indexID typeutil.UniqueID) error |
|||
|
|||
NewProxyClient func(sess *sessionutil.Session) (types.Proxy, error) |
|||
|
|||
//query service interface, notify query service to release collection |
|||
CallReleaseCollectionService func(ctx context.Context, ts typeutil.Timestamp, dbID typeutil.UniqueID, collectionID typeutil.UniqueID) error |
|||
|
|||
//dd request scheduler |
|||
ddReqQueue chan reqTask //dd request will be push into this chan |
|||
|
|||
//dml channels |
|||
dmlChannels *dmlChannels |
|||
|
|||
//Proxy manager |
|||
proxyManager *proxyManager |
|||
|
|||
// proxy clients |
|||
proxyClientManager *proxyClientManager |
|||
|
|||
// channel timetick |
|||
chanTimeTick *timetickSync |
|||
|
|||
//time tick loop |
|||
lastTimeTick typeutil.Timestamp |
|||
|
|||
//states code |
|||
stateCode atomic.Value |
|||
|
|||
//call once |
|||
initOnce sync.Once |
|||
startOnce sync.Once |
|||
//isInit atomic.Value |
|||
|
|||
session *sessionutil.Session |
|||
sessCloseCh <-chan bool |
|||
|
|||
msFactory ms.Factory |
|||
} |
|||
``` |
|||
|
|||
#### 6.5 Data definition Request Scheduler |
|||
|
|||
###### 6.5.1 Task |
|||
|
|||
RootCoord receives data definition requests via gRPC. Each request (described by a proto) will be wrapped as a task for further scheduling. The task interface is:
|||
|
|||
```go |
|||
type reqTask interface { |
|||
Ctx() context.Context |
|||
Type() commonpb.MsgType |
|||
Execute(ctx context.Context) error |
|||
WaitToFinish() error |
|||
Notify(err error) |
|||
} |
|||
``` |
|||
|
|||
A task example is shown below. In this example, we wrap a CreateCollectionRequest (a proto) as a CreateCollectionReqTask. The wrapper needs to implement the task interface.
|||
|
|||
```go |
|||
type CreateCollectionReqTask struct { |
|||
baseReqTask |
|||
Req *milvuspb.CreateCollectionRequest |
|||
} |
|||
|
|||
// Task interfaces |
|||
func (t *CreateCollectionReqTask) Ctx() context.Context |
|||
func (t *CreateCollectionReqTask) Type() commonpb.MsgType |
|||
func (t *CreateCollectionReqTask) Execute(ctx context.Context) error |
|||
func (t *CreateCollectionReqTask) WaitToFinish() error |
|||
func (t *CreateCollectionReqTask) Notify(err error) |
|||
``` |
|||
|
|||
In most cases, a data definition task needs to (a minimal sketch follows this list):

- update the system's metadata (via $metaTable$),
- send a `DD Message` into the related `DML MsgStream`, so that the `Data Node` and `Query Node` will consume it.
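
The sketch below illustrates the two steps for the create-collection case, reusing the `Core` members listed in section 6.4. `executeCreateCollection` is a hypothetical helper written for this document, not the actual RootCoord code; ID/timestamp allocation and schema decoding are elided.

```go
func executeCreateCollection(ctx context.Context, core *Core,
	collInfo *pb.CollectionInfo, partInfo *pb.PartitionInfo, idxInfo []*pb.IndexInfo,
	ddOp func(ts typeutil.Timestamp) (string, error),
	ddReq *msgpb.CreateCollectionRequest) error {

	// 1. update the system's metadata; metaTable persists it into etcd
	if _, err := core.MetaTable.AddCollection(collInfo, partInfo, idxInfo, ddOp); err != nil {
		return err
	}

	// 2. broadcast the DD message into the collection's physical DML channels so
	//    that Data Nodes and Query Nodes observe the new collection
	return core.SendDdCreateCollectionReq(ctx, ddReq, collInfo.PhysicalChannelNames)
}
```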
|||
|
|||
#### 6.6 Meta Table |
|||
|
|||
###### 6.6.1 Meta |
|||
|
|||
- Proxy Meta |
|||
|
|||
```protobuf |
|||
message ProxyMeta { |
|||
uint64 id = 1; |
|||
common.Address address = 2; |
|||
repeated string result_channel_names = 3; |
|||
} |
|||
``` |
|||
|
|||
- Collection Meta |
|||
|
|||
```protobuf |
|||
message PartitionInfo { |
|||
string partition_name = 1; |
|||
int64 partitionID = 2; |
|||
repeated int64 segmentIDs = 3; |
|||
} |
|||
|
|||
message IndexInfo { |
|||
string index_name = 1; |
|||
int64 indexID = 2; |
|||
repeated common.KeyValuePair index_params = 3; |
|||
} |
|||
|
|||
message FieldIndexInfo{ |
|||
int64 filedID = 1; |
|||
int64 indexID = 2; |
|||
} |
|||
|
|||
message CollectionInfo { |
|||
int64 ID = 1; |
|||
schema.CollectionSchema schema = 2; |
|||
uint64 create_time = 3; |
|||
repeated int64 partitionIDs = 4; |
|||
repeated FieldIndexInfo field_indexes = 5; |
|||
repeated string virtual_channel_names = 6; |
|||
repeated string physical_channel_names = 7; |
|||
} |
|||
``` |
|||
|
|||
- Segment Meta |
|||
|
|||
```protobuf |
|||
message SegmentIndexInfo { |
|||
int64 segmentID = 1; |
|||
int64 fieldID = 2; |
|||
int64 indexID = 3; |
|||
int64 buildID = 4; |
|||
bool enable_index = 5; |
|||
} |
|||
``` |
|||
|
|||
###### 6.6.2 KV pairs in etcdKV |
|||
|
|||
```go |
|||
"proxy/$proxyId" string -> proxyMetaBlob string |
|||
"collection/$collectionId" string -> collectionInfoBlob string |
|||
"partition/$collectionId/$partitionId" string -> partitionInfoBlob string |
|||
"index/$collectionId/$indexId" string -> IndexInfoBlob string |
|||
"segment-index/$collectionId/$indexId/$partitionId/$segmentId" -> segmentIndexInfoBlog string |
|||
``` |
|||
|
|||
Note that _tenantId_, _proxyId_, _collectionId_, _partitionId_, _indexId_, _segmentId_ are unique strings converted from int64.

_proxyMetaBlob_, _collectionInfoBlob_, _partitionInfoBlob_, _IndexInfoBlob_, _segmentIndexInfoBlob_ are serialized protos.
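
For illustration only, composing such keys from int64 IDs could look like this (the helper names are assumptions made for the example):

```go
import "fmt"

func collectionKey(collectionID typeutil.UniqueID) string {
	return fmt.Sprintf("collection/%d", collectionID)
}

func segmentIndexKey(collectionID, indexID, partitionID, segmentID typeutil.UniqueID) string {
	return fmt.Sprintf("segment-index/%d/%d/%d/%d", collectionID, indexID, partitionID, segmentID)
}
```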
|||
|
|||
###### 6.6.3 Meta Table |
|||
|
|||
```go |
|||
type metaTable struct { |
|||
txn kv.TxnKV // client of a reliable txnkv service, i.e. etcd client |
|||
snapshot kv.SnapShotKV // client of a reliable snapshotkv service, i.e. etcd client |
|||
collID2Meta map[typeutil.UniqueID]pb.CollectionInfo // collection_id -> meta |
|||
collName2ID map[string]typeutil.UniqueID // collection name to collection id |
|||
collAlias2ID map[string]typeutil.UniqueID // collection alias to collection id |
|||
partID2SegID map[typeutil.UniqueID]map[typeutil.UniqueID]bool // partition_id -> segment_id -> bool |
|||
segID2IndexMeta map[typeutil.UniqueID]map[typeutil.UniqueID]pb.SegmentIndexInfo // collection_id/index_id/partition_id/segment_id -> meta |
|||
indexID2Meta map[typeutil.UniqueID]pb.IndexInfo // collection_id/index_id -> meta |
|||
|
|||
proxyLock sync.RWMutex |
|||
ddLock sync.RWMutex |
|||
} |
|||
|
|||
func NewMetaTable(kv kv.SnapShotKV) (*metaTable, error) |
|||
|
|||
func (mt *metaTable) AddCollection(coll *pb.CollectionInfo, part *pb.PartitionInfo, idx []*pb.IndexInfo, ddOpStr func(ts typeutil.Timestamp) (string, error)) (typeutil.Timestamp, error) |
|||
func (mt *metaTable) DeleteCollection(collID typeutil.UniqueID, ddOpStr func(ts typeutil.Timestamp) (string, error)) (typeutil.Timestamp, error) |
|||
func (mt *metaTable) HasCollection(collID typeutil.UniqueID, ts typeutil.Timestamp) bool |
|||
func (mt *metaTable) GetCollectionByID(collectionID typeutil.UniqueID, ts typeutil.Timestamp) (*pb.CollectionInfo, error) |
|||
func (mt *metaTable) GetCollectionByName(collectionName string, ts typeutil.Timestamp) (*pb.CollectionInfo, error) |
|||
func (mt *metaTable) GetCollectionBySegmentID(segID typeutil.UniqueID) (*pb.CollectionInfo, error) |
|||
func (mt *metaTable) ListCollections(ts typeutil.Timestamp) (map[string]typeutil.UniqueID, error) |
|||
func (mt *metaTable) ListCollectionVirtualChannels() []string |
|||
func (mt *metaTable) ListCollectionPhysicalChannels() []string |
|||
func (mt *metaTable) AddPartition(collID typeutil.UniqueID, partitionName string, partitionID typeutil.UniqueID, ddOpStr func(ts typeutil.Timestamp) (string, error)) (typeutil.Timestamp, error) |
|||
func (mt *metaTable) GetPartitionByName(collID typeutil.UniqueID, partitionName string, ts typeutil.Timestamp) (pb.PartitionInfo, error) |
|||
func (mt *metaTable) HasPartition(collID typeutil.UniqueID, partitionName string, ts typeutil.Timestamp) bool |
|||
func (mt *metaTable) DeletePartition(collID typeutil.UniqueID, partitionName string, ddOpStr func(ts typeutil.Timestamp) (string, error)) (typeutil.Timestamp, typeutil.UniqueID, error) |
|||
func (mt *metaTable) GetPartitionByID(collID typeutil.UniqueID, partitionID typeutil.UniqueID, ts typeutil.Timestamp) (pb.PartitionInfo, error) |
|||
func (mt *metaTable) AddSegment(segInfos []*datapb.SegmentInfo, msgStartPos string, msgEndPos string) (typeutil.Timestamp, error) |
|||
func (mt *metaTable) AddIndex(segIdxInfos []*pb.SegmentIndexInfo, msgStartPos string, msgEndPos string) (typeutil.Timestamp, error) |
|||
func (mt *metaTable) DropIndex(collName, fieldName, indexName string) (typeutil.Timestamp, typeutil.UniqueID, bool, error) |
|||
func (mt *metaTable) GetSegmentIndexInfoByID(segID typeutil.UniqueID, filedID int64, idxName string) (pb.SegmentIndexInfo, error) |
|||
func (mt *metaTable) GetFieldSchema(collName string, fieldName string) (schemapb.FieldSchema, error) |
|||
func (mt *metaTable) IsSegmentIndexed(segID typeutil.UniqueID, fieldSchema *schemapb.FieldSchema, indexParams []*commonpb.KeyValuePair) bool |
|||
func (mt *metaTable) GetNotIndexedSegments(collName string, fieldName string, idxInfo *pb.IndexInfo) ([]typeutil.UniqueID, schemapb.FieldSchema, error) |
|||
func (mt *metaTable) GetIndexByName(collName, indexName string) (pb.CollectionInfo, []pb.IndexInfo, error) |
|||
func (mt *metaTable) GetIndexByID(indexID typeutil.UniqueID) (*pb.IndexInfo, error) |
|||
func (mt *metaTable) AddFlushedSegment(segID typeutil.UniqueID) error |
|||
``` |
|||
|
|||
- _metaTable_ maintains meta both in memory and in _etcdKV_. It keeps the meta consistent on both sides. All its member functions may be called concurrently.

- For _HasCollection_, _GetCollectionByID_, _GetCollectionByName_ and _ListCollections_, if the `ts` argument is non-zero, _metaTable_ returns the meta as of timestamp `ts`; if `ts` is zero, _metaTable_ returns the latest meta. A sketch of this `ts` dispatch follows.
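
A sketch of the dispatch for one of the getters (assumed implementation; `loadCollectionSnapshot` is a hypothetical helper standing in for the snapshot-kv lookup and deserialization):

```go
import "fmt"

func (mt *metaTable) GetCollectionByName(collectionName string, ts typeutil.Timestamp) (*pb.CollectionInfo, error) {
	if ts == 0 {
		// zero ts: serve the latest meta from the in-memory maps
		mt.ddLock.RLock()
		defer mt.ddLock.RUnlock()
		collID, ok := mt.collName2ID[collectionName]
		if !ok {
			return nil, fmt.Errorf("collection %s not found", collectionName)
		}
		coll := mt.collID2Meta[collID]
		return &coll, nil
	}
	// non-zero ts: load the historical value from the snapshot kv
	return mt.loadCollectionSnapshot(collectionName, ts) // hypothetical helper
}
```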
|||
|
|||
#### 6.7 System Time Synchronization |
|||
|
|||
<img src="./figs/root_coord_time_sync.png"> |
|||
|
|||
```go |
|||
type timetickSync struct { |
|||
core *Core |
|||
lock sync.Mutex |
|||
proxyTimeTick map[typeutil.UniqueID]*channelTimeTickMsg |
|||
sendChan chan map[typeutil.UniqueID]*channelTimeTickMsg |
|||
|
|||
// record ddl timetick info |
|||
ddlLock sync.RWMutex |
|||
ddlMinTs typeutil.Timestamp |
|||
ddlTsSet map[typeutil.Timestamp]struct{} |
|||
} |
|||
|
|||
func newTimeTickSync(core *Core) *timetickSync |
|||
|
|||
func (t *timetickSync) UpdateTimeTick(in *internalpb.ChannelTimeTickMsg) error |
|||
func (t *timetickSync) DelProxy(sess *sessionutil.Session) |
|||
func (t *timetickSync) GetProxy(sess []*sessionutil.Session) |
|||
func (t *timetickSync) StartWatch() |
|||
func (t *timetickSync) SendChannelTimeTick(chanName string, ts typeutil.Timestamp) error |
|||
func (t *timetickSync) GetProxyNum() |
|||
func (t *timetickSync) GetChanNum() int |
|||
``` |
|||
## 7. Query Coordinator |
|||
|
|||
#### 7.1 Overview |
|||
|
|||
<img src="./figs/query_coord.png" width=500> |
|||
|
|||
#### 7.2 Query Coordinator Interface |
|||
|
|||
```go |
|||
type QueryCoord interface { |
|||
Component |
|||
TimeTickProvider |
|||
|
|||
// ShowCollections notifies RootCoord to list all collection names and other info in database at specified timestamp |
|||
ShowCollections(ctx context.Context, req *querypb.ShowCollectionsRequest) (*querypb.ShowCollectionsResponse, error) |
|||
// LoadCollection notifies Proxy to load a collection's data |
|||
LoadCollection(ctx context.Context, req *querypb.LoadCollectionRequest) (*commonpb.Status, error) |
|||
// ReleaseCollection notifies Proxy to release a collection's data |
|||
ReleaseCollection(ctx context.Context, req *querypb.ReleaseCollectionRequest) (*commonpb.Status, error) |
|||
// ShowPartitions notifies RootCoord to list all partition names and other info in the collection |
|||
ShowPartitions(ctx context.Context, req *querypb.ShowPartitionsRequest) (*querypb.ShowPartitionsResponse, error) |
|||
// LoadPartitions notifies Proxy to load partition's data |
|||
LoadPartitions(ctx context.Context, req *querypb.LoadPartitionsRequest) (*commonpb.Status, error) |
|||
// ReleasePartitions notifies Proxy to release collection's data |
|||
ReleasePartitions(ctx context.Context, req *querypb.ReleasePartitionsRequest) (*commonpb.Status, error) |
|||
// CreateQueryChannel creates the channels for querying in QueryCoord. |
|||
CreateQueryChannel(ctx context.Context) (*querypb.CreateQueryChannelResponse, error) |
|||
GetPartitionStates(ctx context.Context, req *querypb.GetPartitionStatesRequest) (*querypb.GetPartitionStatesResponse, error) |
|||
// GetSegmentInfo requests segment info |
|||
GetSegmentInfo(ctx context.Context, req *querypb.GetSegmentInfoRequest) (*querypb.GetSegmentInfoResponse, error) |
|||
// GetMetrics gets the metrics about QueryCoord. |
|||
GetMetrics(ctx context.Context, req *milvuspb.GetMetricsRequest) (*milvuspb.GetMetricsResponse, error) |
|||
} |
|||
``` |
|||
|
|||
- _MsgBase_ |
|||
|
|||
```go |
|||
type MsgBase struct { |
|||
MsgType MsgType |
|||
MsgID UniqueID |
|||
Timestamp Timestamp |
|||
SourceID UniqueID |
|||
} |
|||
``` |
|||
|
|||
- _ShowCollections_ |
|||
|
|||
```go |
|||
type ShowCollectionRequest struct { |
|||
Base *commonpb.MsgBase |
|||
DbID UniqueID |
|||
CollectionIDs []int64 |
|||
} |
|||
|
|||
type ShowCollectionResponse struct { |
|||
Status *commonpb.Status |
|||
CollectionIDs []UniqueID |
|||
InMemoryPercentages []int64 |
|||
} |
|||
``` |
|||
|
|||
- _LoadCollection_ |
|||
|
|||
```go |
|||
type LoadCollectionRequest struct { |
|||
Base *commonpb.MsgBase |
|||
DbID UniqueID |
|||
CollectionID UniqueID |
|||
schema *schemapb.CollectionSchema |
|||
} |
|||
``` |
|||
|
|||
- _ReleaseCollection_ |
|||
|
|||
```go |
|||
type ReleaseCollectionRequest struct { |
|||
Base *commonpb.MsgBase |
|||
DbID UniqueID |
|||
CollectionID UniqueID |
|||
} |
|||
``` |
|||
|
|||
- _ShowPartitions_ |
|||
|
|||
```go |
|||
type ShowPartitionRequest struct { |
|||
Base *commonpb.MsgBase |
|||
DbID UniqueID |
|||
CollectionID UniqueID |
|||
PartitionIDs []int64 |
|||
} |
|||
|
|||
type ShowPartitionResponse struct { |
|||
Status *commonpb.Status |
|||
PartitionIDs []UniqueID |
|||
InMemoryPercentages []int64 |
|||
} |
|||
``` |
|||
|
|||
- _GetPartitionStates_ |
|||
|
|||
```go |
|||
type PartitionState = int |
|||
|
|||
const ( |
|||
PartitionState_NotExist PartitionState = 0 |
|||
PartitionState_NotPresent PartitionState = 1 |
|||
PartitionState_OnDisk PartitionState = 2 |
|||
PartitionState_PartialInMemory PartitionState = 3 |
|||
PartitionState_InMemory PartitionState = 4 |
|||
PartitionState_PartialInGPU PartitionState = 5 |
|||
PartitionState_InGPU PartitionState = 6 |
|||
) |
|||
|
|||
type PartitionStatesRequest struct { |
|||
Base *commonpb.MsgBase |
|||
DbID UniqueID |
|||
CollectionID UniqueID |
|||
PartitionIDs []UniqueID |
|||
} |
|||
|
|||
type PartitionStates struct { |
|||
PartitionID UniqueID |
|||
State PartitionState |
|||
} |
|||
|
|||
type PartitionStatesResponse struct { |
|||
Status *commonpb.Status |
|||
PartitionDescriptions []*PartitionStates |
|||
} |
|||
``` |
|||
|
|||
- _LoadPartitions_ |
|||
|
|||
```go |
|||
type LoadPartitionRequest struct { |
|||
Base *commonpb.MsgBase |
|||
DbID UniqueID |
|||
CollectionID UniqueID |
|||
PartitionIDs []UniqueID |
|||
Schema *schemapb.CollectionSchema |
|||
} |
|||
``` |
|||
|
|||
- _ReleasePartitions_ |
|||
|
|||
```go |
|||
type ReleasePartitionRequest struct { |
|||
Base *commonpb.MsgBase |
|||
DbID UniqueID |
|||
CollectionID UniqueID |
|||
PartitionIDs []UniqueID |
|||
} |
|||
``` |
|||
|
|||
- _CreateQueryChannel_ |
|||
|
|||
```go |
|||
type CreateQueryChannelResponse struct { |
|||
Status *commonpb.Status |
|||
RequestChannelName string |
|||
ResultChannelName string |
|||
} |
|||
``` |
|||
|
|||
- _GetSegmentInfo_ \* |
|||
|
|||
```go |
|||
type GetSegmentInfoRequest struct { |
|||
Base *commonpb.MsgBase |
|||
SegmentIDs []UniqueID |
|||
} |
|||
|
|||
type SegmentInfo struct { |
|||
SegmentID UniqueID |
|||
CollectionID UniqueID |
|||
PartitionID UniqueID |
|||
MemSize UniqueID |
|||
NumRows UniqueID |
|||
IndexName string |
|||
IndexID UniqueID |
|||
} |
|||
|
|||
type GetSegmentInfoResponse struct { |
|||
Status *commonpb.Status |
|||
Infos []*SegmentInfo |
|||
} |
|||
``` |
|||
|
|||
#### 7.3 Query Channel |
|||
|
|||
- _SearchMsg_ |
|||
|
|||
```go |
|||
type SearchRequest struct { |
|||
Base *commonpb.MsgBase |
|||
ResultChannelID string |
|||
DbID int64 |
|||
CollectionID int64 |
|||
PartitionIDs []int64 |
|||
Dsl string |
|||
PlaceholderGroup []byte |
|||
DslType commonpb.DslType |
|||
SerializedExprPlan []byte |
|||
OutputFieldsId []int64 |
|||
TravelTimestamp uint64 |
|||
GuaranteeTimestamp uint64 |
|||
} |
|||
|
|||
type SearchMsg struct { |
|||
BaseMsg |
|||
SearchRequest |
|||
} |
|||
``` |
|||
|
|||
- _RetrieveMsg_ |
|||
|
|||
```go |
|||
type RetrieveRequest struct { |
|||
Base *commonpb.MsgBase |
|||
ResultChannelID string |
|||
DbID int64 |
|||
CollectionID int64 |
|||
PartitionIDs []int64 |
|||
SerializedExprPlan []byte |
|||
OutputFieldsId []int64 |
|||
TravelTimestamp uint64 |
|||
GuaranteeTimestamp uint64 |
|||
} |
|||
|
|||
type RetrieveMsg struct { |
|||
BaseMsg |
|||
RetrieveRequest |
|||
} |
|||
``` |
|||
|
|||
#### 7.4 Query Node Interface |
|||
|
|||
```go |
|||
type QueryNode interface { |
|||
Component |
|||
TimeTickProvider |
|||
|
|||
// AddQueryChannel notifies QueryNode to subscribe a query channel and be a producer of a query result channel. |
|||
AddQueryChannel(ctx context.Context, req *querypb.AddQueryChannelRequest) (*commonpb.Status, error) |
|||
// RemoveQueryChannel removes the query channel for QueryNode component. |
|||
RemoveQueryChannel(ctx context.Context, req *querypb.RemoveQueryChannelRequest) (*commonpb.Status, error) |
|||
// WatchDmChannels watches the channels about data manipulation. |
|||
WatchDmChannels(ctx context.Context, req *querypb.WatchDmChannelsRequest) (*commonpb.Status, error) |
|||
	// LoadSegments notifies QueryNode to load the sealed segments from storage. The load tasks are synchronous with this
	// RPC; QueryNode will return only after all the sealed segments are loaded.
|||
LoadSegments(ctx context.Context, req *querypb.LoadSegmentsRequest) (*commonpb.Status, error) |
|||
// ReleaseCollection notifies Proxy to release a collection's data |
|||
ReleaseCollection(ctx context.Context, req *querypb.ReleaseCollectionRequest) (*commonpb.Status, error) |
|||
// ReleasePartitions notifies Proxy to release partitions' data |
|||
ReleasePartitions(ctx context.Context, req *querypb.ReleasePartitionsRequest) (*commonpb.Status, error) |
|||
// ReleaseSegments releases the data of the specified segments in QueryNode. |
|||
ReleaseSegments(ctx context.Context, req *querypb.ReleaseSegmentsRequest) (*commonpb.Status, error) |
|||
// GetSegmentInfo requests segment info |
|||
GetSegmentInfo(ctx context.Context, req *querypb.GetSegmentInfoRequest) (*querypb.GetSegmentInfoResponse, error) |
|||
// GetMetrics gets the metrics about QueryNode. |
|||
GetMetrics(ctx context.Context, in *milvuspb.GetMetricsRequest, opts ...grpc.CallOption) (*milvuspb.GetMetricsResponse, error) |
|||
} |
|||
``` |
|||
|
|||
- _AddQueryChannel_ |
|||
|
|||
```go |
|||
type AddQueryChannelRequest struct { |
|||
Base *commonpb.MsgBase |
|||
NodeID int64 |
|||
CollectionID int64 |
|||
RequestChannelID string |
|||
ResultChannelID string |
|||
} |
|||
``` |
|||
|
|||
- _RemoveQueryChannel_ |
|||
|
|||
```go |
|||
type RemoveQueryChannelRequest struct { |
|||
Base *commonpb.MsgBase |
|||
NodeID int64 |
|||
CollectionID int64 |
|||
RequestChannelID string |
|||
ResultChannelID string |
|||
} |
|||
``` |
|||
|
|||
- _WatchDmChannels_ |
|||
|
|||
```go |
|||
|
|||
type WatchDmChannelsRequest struct { |
|||
Base *commonpb.MsgBase |
|||
NodeID int64 |
|||
CollectionID int64 |
|||
PartitionID int64 |
|||
Infos []*datapb.VchannelInfo |
|||
Schema *schemapb.CollectionSchema |
|||
ExcludeInfos []*datapb.SegmentInfo |
|||
} |
|||
``` |
|||
|
|||
- _LoadSegments_ |
|||
|
|||
```go |
|||
type LoadSegmentsRequest struct { |
|||
Base *commonpb.MsgBase |
|||
NodeID int64 |
|||
Infos []*SegmentLoadInfo |
|||
Schema *schemapb.CollectionSchema |
|||
LoadCondition TriggerCondition |
|||
} |
|||
``` |
|||
|
|||
- _ReleaseCollection_ |
|||
|
|||
```go |
|||
type ReleaseCollectionRequest struct { |
|||
Base *commonpb.MsgBase |
|||
DbID UniqueID |
|||
CollectionID UniqueID |
|||
NodeID int64 |
|||
} |
|||
``` |
|||
|
|||
- _ReleasePartitions_ |
|||
|
|||
```go |
|||
type ReleasePartitionsRequest struct { |
|||
Base *commonpb.MsgBase |
|||
DbID UniqueID |
|||
CollectionID UniqueID |
|||
PartitionIDs []UniqueID |
|||
NodeID int64 |
|||
} |
|||
``` |
|||
|
|||
- _ReleaseSegments_ |
|||
|
|||
```go |
|||
type ReleaseSegmentsRequest struct { |
|||
Base *commonpb.MsgBase |
|||
NodeID int64 |
|||
DbID UniqueID |
|||
CollectionID UniqueID |
|||
PartitionIDs []UniqueID |
|||
SegmentIDs []UniqueID |
|||
} |
|||
``` |
|||
|
|||
- _GetSegmentInfo_ |
|||
|
|||
```go |
|||
type GetSegmentInfoRequest struct { |
|||
Base *commonpb.MsgBase |
|||
	SegmentIDs []UniqueID
|||
} |
|||
|
|||
type GetSegmentInfoResponse struct { |
|||
Status *commonpb.Status |
|||
Infos []*SegmentInfo |
|||
} |
|||
``` |
|||
|
|||
//TODO |
|||
|
|||
#### 7.5 Collection Replica |
|||
|
|||
$collectionReplica$ contains an in-memory local copy of persistent collections. In common cases, the system has multiple query nodes. The data of a collection will be distributed across all the available query nodes, and each query node's $collectionReplica$ will maintain its own share (only part of the collection).
Every replica tracks a value called tSafe, which is the maximum timestamp up to which the replica is guaranteed to be up-to-date.
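
A minimal sketch of such a tSafe holder (illustrative only; the real tSafer also lets consumers register watchers that are signalled when tSafe advances):

```go
import "sync"

type tSafe struct {
	mu sync.RWMutex
	ts Timestamp
}

func (t *tSafe) Get() Timestamp {
	t.mu.RLock()
	defer t.mu.RUnlock()
	return t.ts
}

// Set moves tSafe forward; a query with guarantee timestamp gts can only be
// served once Get() >= gts.
func (t *tSafe) Set(ts Timestamp) {
	t.mu.Lock()
	defer t.mu.Unlock()
	if ts > t.ts {
		t.ts = ts
	}
}
```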
|||
|
|||
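As a rough illustration of the tSafe mechanism described above, the sketch below shows a minimal watermark type; the type and method names are assumptions for this example and do not mirror the actual implementation.

```go
// Minimal tSafe sketch: the watermark is the largest timestamp up to which this
// replica has applied the incoming insert/delete stream. A search task that must
// be consistent at timestamp ts can wait until get() >= ts before executing.
type tSafe struct {
  mu sync.RWMutex
  ts Timestamp
}

func (t *tSafe) set(ts Timestamp) {
  t.mu.Lock()
  defer t.mu.Unlock()
  if ts > t.ts {
    t.ts = ts
  }
}

func (t *tSafe) get() Timestamp {
  t.mu.RLock()
  defer t.mu.RUnlock()
  return t.ts
}
```
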
- _Collection Replica_

```go
type collectionReplica struct {
  tSafes map[UniqueID]tSafer // map[collectionID]tSafer

  mu sync.RWMutex // guards all
  collections map[UniqueID]*Collection
  partitions map[UniqueID]*Partition
  segments map[UniqueID]*Segment

  excludedSegments map[UniqueID][]*datapb.SegmentInfo // map[collectionID]excludedSegmentInfos
}
```

- _Collection_

```go
type FieldSchema struct {
  FieldID int64
  Name string
  IsPrimaryKey bool
  Description string
  DataType DataType
  TypeParams []*commonpb.KeyValuePair
  IndexParams []*commonpb.KeyValuePair
}

type CollectionSchema struct {
  Name string
  Description string
  AutoID bool
  Fields []*FieldSchema
}

type Collection struct {
  collectionPtr C.CCollection
  id UniqueID
  partitionIDs []UniqueID
  schema *schemapb.CollectionSchema
  vChannels []Channel
  pChannels []Channel
  loadType loadType

  releaseMu sync.RWMutex
  releasedPartitions map[UniqueID]struct{}
  releaseTime Timestamp
}
```

- _Partition_

```go
type Partition struct {
  collectionID UniqueID
  partitionID UniqueID
  segmentIDs []UniqueID
}
```

- _Segment_

```go
type segmentType int32

const (
  segmentTypeInvalid segmentType = iota
  segmentTypeGrowing
  segmentTypeSealed
  segmentTypeIndexing
)

type indexParam = map[string]string

type Segment struct {
  segmentPtr C.CSegmentInterface

  segmentID UniqueID
  partitionID UniqueID
  collectionID UniqueID

  onService bool

  vChannelID Channel
  lastMemSize int64
  lastRowCount int64

  once sync.Once // guards enableIndex
  enableIndex bool

  rmMutex sync.Mutex // guards recentlyModified
  recentlyModified bool

  typeMu sync.Mutex // guards segmentType
  segmentType segmentType

  paramMutex sync.RWMutex // guards indexInfos
  indexInfos map[FieldID]*indexInfo

  idBinlogRowSizes []int64

  vectorFieldMutex sync.RWMutex // guards vectorFieldInfos
  vectorFieldInfos map[UniqueID]*VectorFieldInfo

  pkFilter *bloom.BloomFilter // bloom filter of pk inside a segment
}
```

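The `pkFilter` field above keeps a per-segment bloom filter over primary keys. The sketch below shows the kind of membership check it enables, assuming the bits-and-blooms/bloom package (the successor of willf/bloom) and an int64 primary key; the helper name is illustrative.

```go
import (
  "encoding/binary"

  "github.com/bits-and-blooms/bloom/v3"
)

// mayContainPK reports whether the segment might contain the given primary key.
// A bloom filter can return false positives but never false negatives, so a
// "false" answer lets a delete or a point query skip this segment entirely.
func mayContainPK(pkFilter *bloom.BloomFilter, pk int64) bool {
  buf := make([]byte, 8)
  binary.LittleEndian.PutUint64(buf, uint64(pk))
  return pkFilter.Test(buf)
}
```
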
- _Data Sync Service_

```go
type dataSyncService struct {
  ctx context.Context

  mu sync.Mutex // guards FlowGraphs
  collectionFlowGraphs map[UniqueID]map[Channel]*queryNodeFlowGraph // map[collectionID]flowGraphs
  partitionFlowGraphs map[UniqueID]map[Channel]*queryNodeFlowGraph // map[partitionID]flowGraphs

  streamingReplica ReplicaInterface
  tSafeReplica TSafeReplicaInterface
  msFactory msgstream.Factory
}
```

## 8 Binlog

There are three kinds of binlog: InsertBinlog, DeleteBinlog, and DDLBinlog.

Binlog is stored in a columnar storage format: every column in the schema is stored in an individual file.
Timestamp, schema, row ID, and the primary key allocated by the system are four special columns.
The schema column records the DDL of the collection.

A binlog file consists of a 4-byte magic number followed by a series of events. The first event must be a descriptor event.

### 8.1 Event format

```
+=====================================+=====================================================================+
| event  | Timestamp 0 : 8            | create timestamp                                                    |
| header +----------------------------+---------------------------------------------------------------------+
|        | TypeCode 8 : 1             | event type code                                                     |
|        +----------------------------+---------------------------------------------------------------------+
|        | EventLength 9 : 4          | length of event, including header and data                          |
|        +----------------------------+---------------------------------------------------------------------+
|        | NextPosition 13 : 4        | offset of next event from the start of file                         |
+=====================================+=====================================================================+
| event  | fixed part 17 : x          |                                                                     |
| data   +----------------------------+---------------------------------------------------------------------+
|        | variable part              |                                                                     |
+=====================================+=====================================================================+
```

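Read back as Go, the header above could look like the sketch below. The struct layout follows the table; treating the integers as little-endian is an assumption of this example.

```go
import (
  "encoding/binary"
  "io"
)

// eventHeader mirrors the fixed-size header in the table above.
type eventHeader struct {
  Timestamp    uint64 // 0  : 8, create timestamp
  TypeCode     int8   // 8  : 1, event type code
  EventLength  int32  // 9  : 4, length of event, including header and data
  NextPosition int32  // 13 : 4, offset of next event from the start of file
}

// readEventHeader decodes one header from r (for example, right after the
// 4-byte magic number at the start of a binlog file).
func readEventHeader(r io.Reader) (*eventHeader, error) {
  h := &eventHeader{}
  if err := binary.Read(r, binary.LittleEndian, h); err != nil {
    return nil, err
  }
  return h, nil
}
```
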
### 8.2 Descriptor Event format

```
+=====================================+=====================================================================+
| event  | Timestamp 0 : 8            | create timestamp                                                    |
| header +----------------------------+---------------------------------------------------------------------+
|        | TypeCode 8 : 1             | event type code                                                     |
|        +----------------------------+---------------------------------------------------------------------+
|        | EventLength 9 : 4          | length of event, including header and data                          |
|        +----------------------------+---------------------------------------------------------------------+
|        | NextPosition 13 : 4        | offset of next event from the start of file                         |
+=====================================+=====================================================================+
| event  | CollectionID 17 : 8        | collection id                                                       |
| data   +----------------------------+---------------------------------------------------------------------+
|        | PartitionID 25 : 8         | partition id (schema column does not need)                          |
|        +----------------------------+---------------------------------------------------------------------+
|        | SegmentID 33 : 8           | segment id (schema column does not need)                            |
|        +----------------------------+---------------------------------------------------------------------+
|        | FieldID 41 : 8             | field id (schema column does not need)                              |
|        +----------------------------+---------------------------------------------------------------------+
|        | StartTimestamp 49 : 8      | minimum timestamp allocated by master of all events in this file    |
|        +----------------------------+---------------------------------------------------------------------+
|        | EndTimestamp 57 : 8        | maximum timestamp allocated by master of all events in this file    |
|        +----------------------------+---------------------------------------------------------------------+
|        | PayloadDataType 65 : 4     | data type of payload                                                |
|        +----------------------------+---------------------------------------------------------------------+
|        | PostHeaderLengths n : n    | header lengths for all event types                                  |
|        +----------------------------+---------------------------------------------------------------------+
|        | ExtraLength 69 : 4         | length of extra information                                         |
|        +----------------------------+---------------------------------------------------------------------+
|        | ExtraBytes 73 : n          | extra information in json format                                    |
+=====================================+=====================================================================+
```

`ExtraBytes` is a JSON-encoded blob that stores extra information about the binlog file.

The fixed part of a binlog file already covers the common fields, such as `CollectionID` and `PartitionID`. However, different kinds of binlog files carry additional information that differs from one kind to another, and `ExtraBytes` is where this information goes. For example, an index binlog file stores `indexID`, `indexBuildID`, and other index-related information in `ExtraBytes`.

`ExtraBytes` is also an extension point: new features can be added to the binlog file format without breaking compatibility. For example, the memory size of the original content (before encoding) is stored in `ExtraBytes` under the key `original_size`. For now, `original_size` is required, not optional.

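A sketch of how the extra information might be assembled; the `descriptorEvent` type and its field names are placeholders that mirror the layout above, and storing `original_size` as a string is an assumption of this example.

```go
import (
  "encoding/json"
  "strconv"
)

// descriptorEvent is a placeholder for the descriptor event being written.
type descriptorEvent struct {
  ExtraLength int32
  ExtraBytes  []byte
}

// setOriginalSize records the pre-encoding payload size in ExtraBytes.
func setOriginalSize(ev *descriptorEvent, originalSize int) error {
  extra := map[string]interface{}{
    "original_size": strconv.Itoa(originalSize),
  }
  b, err := json.Marshal(extra)
  if err != nil {
    return err
  }
  ev.ExtraBytes = b
  ev.ExtraLength = int32(len(b))
  return nil
}
```
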
### 8.3 Type code

```
DESCRIPTOR_EVENT
INSERT_EVENT
DELETE_EVENT
CREATE_COLLECTION_EVENT
DROP_COLLECTION_EVENT
CREATE_PARTITION_EVENT
DROP_PARTITION_EVENT
INDEX_FILE_EVENT
```

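In Go, these type codes naturally map to a small enum such as the sketch below; the concrete numeric values shown here are an assumption of this example rather than a statement about the on-disk format.

```go
// EventTypeCode identifies the kind of event stored in a binlog file.
type EventTypeCode int8

const (
  DescriptorEventType EventTypeCode = iota
  InsertEventType
  DeleteEventType
  CreateCollectionEventType
  DropCollectionEventType
  CreatePartitionEventType
  DropPartitionEventType
  IndexFileEventType
)
```
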
DESCRIPTOR_EVENT must appear in every column file and is always the first event.

INSERT_EVENT may appear in any column binlog except DDL binlog files.

DELETE_EVENT can only be used in the primary key's binlog files (currently we can only delete by primary key).

CREATE_COLLECTION_EVENT, DROP_COLLECTION_EVENT, CREATE_PARTITION_EVENT, and DROP_PARTITION_EVENT only appear in DDL binlog files.

### 8.4 Event data part

```
event data part

INSERT_EVENT:
+================================================+==========================================================+
| event  | fixed  | StartTimestamp x : 8         | min timestamp in this event                              |
| data   | part   +------------------------------+----------------------------------------------------------+
|        |        | EndTimestamp x+8 : 8         | max timestamp in this event                              |
|        +--------+------------------------------+----------------------------------------------------------+
|        |variable| parquet payload              | payload in parquet format                                |
|        |part    |                              |                                                          |
+================================================+==========================================================+

Other events are similar to INSERT_EVENT.
```

### 8.5 Example

Schema

string | int | float(optional) | vector(512)

Requests:

InsertRequest with 10,000 rows

DeleteRequest pk=1

DropPartition partitionTag="abc"

Insert binlogs:

rowid, pk, ts, string, int, float, vector - 6 files

All events are INSERT_EVENT.
The float column file contains some NULL values.

Delete binlogs:

pk, ts - 2 files

pk's events are DELETE_EVENT, ts's events are INSERT_EVENT.

DDL binlogs:

ddl, ts

ddl's event is DROP_PARTITION_EVENT, ts's event is INSERT_EVENT.

C++ interface

```c++
typedef void* CPayloadWriter;
typedef void* CPayloadReader;

typedef struct CBuffer {
  char* data;
  int length;
} CBuffer;

typedef struct CStatus {
  int error_code;
  const char* error_msg;
} CStatus;

// writer
CPayloadWriter NewPayloadWriter(int columnType);
CStatus AddBooleanToPayload(CPayloadWriter payloadWriter, bool *values, int length);
CStatus AddInt8ToPayload(CPayloadWriter payloadWriter, int8_t *values, int length);
CStatus AddInt16ToPayload(CPayloadWriter payloadWriter, int16_t *values, int length);
CStatus AddInt32ToPayload(CPayloadWriter payloadWriter, int32_t *values, int length);
CStatus AddInt64ToPayload(CPayloadWriter payloadWriter, int64_t *values, int length);
CStatus AddFloatToPayload(CPayloadWriter payloadWriter, float *values, int length);
CStatus AddDoubleToPayload(CPayloadWriter payloadWriter, double *values, int length);
CStatus AddOneStringToPayload(CPayloadWriter payloadWriter, char *cstr, int str_size);
CStatus AddBinaryVectorToPayload(CPayloadWriter payloadWriter, uint8_t *values, int dimension, int length);
CStatus AddFloatVectorToPayload(CPayloadWriter payloadWriter, float *values, int dimension, int length);

CStatus FinishPayloadWriter(CPayloadWriter payloadWriter);
CBuffer GetPayloadBufferFromWriter(CPayloadWriter payloadWriter);
int GetPayloadLengthFromWriter(CPayloadWriter payloadWriter);
CStatus ReleasePayloadWriter(CPayloadWriter handler);

// reader
CPayloadReader NewPayloadReader(int columnType, uint8_t *buffer, int64_t buf_size);
CStatus GetBoolFromPayload(CPayloadReader payloadReader, bool **values, int *length);
CStatus GetInt8FromPayload(CPayloadReader payloadReader, int8_t **values, int *length);
CStatus GetInt16FromPayload(CPayloadReader payloadReader, int16_t **values, int *length);
CStatus GetInt32FromPayload(CPayloadReader payloadReader, int32_t **values, int *length);
CStatus GetInt64FromPayload(CPayloadReader payloadReader, int64_t **values, int *length);
CStatus GetFloatFromPayload(CPayloadReader payloadReader, float **values, int *length);
CStatus GetDoubleFromPayload(CPayloadReader payloadReader, double **values, int *length);
CStatus GetOneStringFromPayload(CPayloadReader payloadReader, int idx, char **cstr, int *str_size);
CStatus GetBinaryVectorFromPayload(CPayloadReader payloadReader, uint8_t **values, int *dimension, int *length);
CStatus GetFloatVectorFromPayload(CPayloadReader payloadReader, float **values, int *dimension, int *length);

int GetPayloadLengthFromReader(CPayloadReader payloadReader);
CStatus ReleasePayloadReader(CPayloadReader payloadReader);
```
## 9 Data Service

#### 9.1 Overview

<img src="./figs/data_coord.png" width=700>

#### 9.2 Data Service Interface

```go
type DataCoord interface {
  Component
  TimeTickProvider

  // Flush notifies DataCoord to flush all current growing segments of the specified Collection
  Flush(ctx context.Context, req *datapb.FlushRequest) (*datapb.FlushResponse, error)
  // AssignSegmentID applies allocations for the specified Collection/Partition and related Channel Name (Virtual Channel)
  AssignSegmentID(ctx context.Context, req *datapb.AssignSegmentIDRequest) (*datapb.AssignSegmentIDResponse, error)
  // GetSegmentStates requests segment state information
  GetSegmentStates(ctx context.Context, req *datapb.GetSegmentStatesRequest) (*datapb.GetSegmentStatesResponse, error)
  // GetInsertBinlogPaths requests binlog paths for the specified segment
  GetInsertBinlogPaths(ctx context.Context, req *datapb.GetInsertBinlogPathsRequest) (*datapb.GetInsertBinlogPathsResponse, error)
  // GetSegmentInfoChannel legacy API, returns segment info statistics channel
  GetSegmentInfoChannel(ctx context.Context) (*milvuspb.StringResponse, error)
  // GetCollectionStatistics requests collection statistics
  GetCollectionStatistics(ctx context.Context, req *datapb.GetCollectionStatisticsRequest) (*datapb.GetCollectionStatisticsResponse, error)
  // GetPartitionStatistics requests partition statistics
  GetPartitionStatistics(ctx context.Context, req *datapb.GetPartitionStatisticsRequest) (*datapb.GetPartitionStatisticsResponse, error)
  // GetSegmentInfo requests segment info
  GetSegmentInfo(ctx context.Context, req *datapb.GetSegmentInfoRequest) (*datapb.GetSegmentInfoResponse, error)
  // GetRecoveryInfo requests segment recovery info of a collection/partition
  GetRecoveryInfo(ctx context.Context, req *datapb.GetRecoveryInfoRequest) (*datapb.GetRecoveryInfoResponse, error)
  // SaveBinlogPaths updates segment binlogs (including insert binlogs, stats logs and delta logs)
  SaveBinlogPaths(ctx context.Context, req *datapb.SaveBinlogPathsRequest) (*commonpb.Status, error)
  // GetFlushedSegments returns the flushed segment list of the requested collection/partition
  GetFlushedSegments(ctx context.Context, req *datapb.GetFlushedSegmentsRequest) (*datapb.GetFlushedSegmentsResponse, error)
  // GetMetrics gets the metrics about DataCoord
  GetMetrics(ctx context.Context, req *milvuspb.GetMetricsRequest) (*milvuspb.GetMetricsResponse, error)
  // CompleteCompaction completes a compaction with the result
  CompleteCompaction(ctx context.Context, req *datapb.CompactionResult) (*commonpb.Status, error)
}
```

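The sketch below shows one way a caller could combine `Flush` and `GetSegmentStates` to wait until a set of segments is flushed. It is illustrative only: the polling loop, the retry interval, and the helper name are assumptions, and the segment IDs are supplied by the caller.

```go
// flushAndWait asks DataCoord to flush a collection, then polls the states of
// the given segments until every one of them reports Flushed.
func flushAndWait(ctx context.Context, dc DataCoord, collectionID UniqueID, segmentIDs []int64) error {
  if _, err := dc.Flush(ctx, &datapb.FlushRequest{CollectionID: collectionID}); err != nil {
    return err
  }
  for {
    resp, err := dc.GetSegmentStates(ctx, &datapb.GetSegmentStatesRequest{SegmentIDs: segmentIDs})
    if err != nil {
      return err
    }
    flushed := true
    for _, s := range resp.GetStates() {
      if s.GetState() != commonpb.SegmentState_Flushed {
        flushed = false
        break
      }
    }
    if flushed {
      return nil
    }
    select {
    case <-ctx.Done():
      return ctx.Err()
    case <-time.After(time.Second): // retry interval chosen arbitrarily
    }
  }
}
```
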
- _MsgBase_

```go
type MsgBase struct {
  MsgType MsgType
  MsgID UniqueID
  Timestamp Timestamp
  SourceID UniqueID
}
```

- _Flush_

```go
type FlushRequest struct {
  Base *commonpb.MsgBase
  DbID UniqueID
  CollectionID UniqueID
}
```

- _AssignSegmentID_

```go
type SegmentIDRequest struct {
  Count uint32
  ChannelName string
  CollectionID UniqueID
  PartitionID UniqueID
}

type AssignSegmentIDRequest struct {
  NodeID int64
  PeerRole string
  SegmentIDRequests []*SegmentIDRequest
}

type SegmentIDAssignment struct {
  SegID UniqueID
  ChannelName string
  Count uint32
  CollectionID UniqueID
  PartitionID UniqueID
  ExpireTime uint64
  Status *commonpb.Status
}

type AssignSegmentIDResponse struct {
  SegIDAssignments []*SegmentIDAssignment
  Status *commonpb.Status
}
```

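As an illustration of the allocation flow, the sketch below requests room for a batch of rows on one virtual channel and returns the resulting assignment; the helper name and the error handling are assumptions of this example.

```go
func assignSegment(ctx context.Context, dc DataCoord, collectionID, partitionID UniqueID,
  channelName string, rows uint32) (*datapb.SegmentIDAssignment, error) {
  resp, err := dc.AssignSegmentID(ctx, &datapb.AssignSegmentIDRequest{
    SegmentIDRequests: []*datapb.SegmentIDRequest{{
      Count:        rows,
      ChannelName:  channelName,
      CollectionID: collectionID,
      PartitionID:  partitionID,
    }},
  })
  if err != nil {
    return nil, err
  }
  assigns := resp.GetSegIDAssignments()
  if len(assigns) == 0 {
    return nil, fmt.Errorf("no assignment returned for channel %s", channelName)
  }
  // The caller may keep writing into assigns[0].SegID until ExpireTime passes.
  return assigns[0], nil
}
```
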
- _GetSegmentStates_

```go
type GetSegmentStatesRequest struct {
  Base *commonpb.MsgBase
  SegmentIDs []int64
}

type SegmentState int32

const (
  SegmentState_SegmentStateNone SegmentState = 0
  SegmentState_NotExist SegmentState = 1
  SegmentState_Growing SegmentState = 2
  SegmentState_Sealed SegmentState = 3
  SegmentState_Flushed SegmentState = 4
  SegmentState_Flushing SegmentState = 5
  SegmentState_Dropped SegmentState = 6
)

type SegmentStateInfo struct {
  SegmentID UniqueID
  State commonpb.SegmentState
  StartPosition *msgpb.MsgPosition
  EndPosition *msgpb.MsgPosition
  Status *commonpb.Status
}

type GetSegmentStatesResponse struct {
  Status *commonpb.Status
  States []*SegmentStateInfo
}
```

- _GetInsertBinlogPaths_

```go
type GetInsertBinlogPathsRequest struct {
  Base *commonpb.MsgBase
  SegmentID UniqueID
}

type GetInsertBinlogPathsResponse struct {
  FieldIDs []int64
  Paths []*internalpb.StringList
  Status *commonpb.Status
}
```

- _GetCollectionStatistics_

```go
type GetCollectionStatisticsRequest struct {
  Base *commonpb.MsgBase
  DbID int64
  CollectionID int64
}

type GetCollectionStatisticsResponse struct {
  Stats []*commonpb.KeyValuePair
  Status *commonpb.Status
}
```

- _GetPartitionStatistics_

```go
type GetPartitionStatisticsRequest struct {
  Base *commonpb.MsgBase
  DbID UniqueID
  CollectionID UniqueID
  PartitionID UniqueID
}

type GetPartitionStatisticsResponse struct {
  Stats []*commonpb.KeyValuePair
  Status *commonpb.Status
}
```

- _GetSegmentInfo_

```go
type GetSegmentInfoRequest struct {
  Base *commonpb.MsgBase
  SegmentIDs []UniqueID
}

type SegmentInfo struct {
  ID int64
  CollectionID int64
  PartitionID int64
  InsertChannel string
  NumOfRows int64
  State commonpb.SegmentState
  DmlPosition *msgpb.MsgPosition
  MaxRowNum int64
  LastExpireTime uint64
  StartPosition *msgpb.MsgPosition
  Binlogs []*FieldBinlog
}

type GetSegmentInfoResponse struct {
  Status *commonpb.Status
  Infos []*SegmentInfo
}
```

- _GetRecoveryInfo_

```go
type GetRecoveryInfoRequest struct {
  Base *commonpb.MsgBase
  CollectionID int64
  PartitionID int64
}

type VchannelInfo struct {
  CollectionID int64
  ChannelName string
  SeekPosition *msgpb.MsgPosition
  UnflushedSegments []*SegmentInfo
  FlushedSegments []int64
}

type SegmentBinlogs struct {
  SegmentID int64
  FieldBinlogs []*FieldBinlog
}

type GetRecoveryInfoResponse struct {
  Status *commonpb.Status
  Channels []*VchannelInfo
  Binlogs []*SegmentBinlogs
}
```

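To show how the recovery info is meant to be consumed, here is an illustrative sketch in which a coordinator seeks each vchannel to its checkpoint and loads the binlogs of flushed segments; `seekChannel` and `loadSegment` are hypothetical placeholders, not real helpers.

```go
func recoverCollection(ctx context.Context, dc DataCoord, collectionID, partitionID UniqueID) error {
  resp, err := dc.GetRecoveryInfo(ctx, &datapb.GetRecoveryInfoRequest{
    CollectionID: collectionID,
    PartitionID:  partitionID,
  })
  if err != nil {
    return err
  }
  for _, ch := range resp.GetChannels() {
    // Resume stream consumption of this vchannel from its last checkpoint.
    if err := seekChannel(ch.GetChannelName(), ch.GetSeekPosition()); err != nil {
      return err
    }
  }
  for _, b := range resp.GetBinlogs() {
    // Load historical data of each flushed segment from its field binlogs.
    if err := loadSegment(b.GetSegmentID(), b.GetFieldBinlogs()); err != nil {
      return err
    }
  }
  return nil
}
```
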
- _SaveBinlogPaths_

```go
type SegmentStartPosition struct {
  StartPosition *msgpb.MsgPosition
  SegmentID int64
}

type SaveBinlogPathsRequest struct {
  Base *commonpb.MsgBase
  SegmentID int64
  CollectionID int64
  Field2BinlogPaths []*ID2PathList
  CheckPoints []*CheckPoint
  StartPositions []*SegmentStartPosition
  Flushed bool
}
```

#### 9.3 Insert Channel

- _InsertMsg_

```go
type InsertRequest struct {
  Base *commonpb.MsgBase
  DbName string
  CollectionName string
  PartitionName string
  DbID UniqueID
  CollectionID UniqueID
  PartitionID UniqueID
  SegmentID UniqueID
  ChannelID string
  Timestamps []uint64
  RowIDs []int64
  RowData []*commonpb.Blob
}

type InsertMsg struct {
  BaseMsg
  InsertRequest
}
```

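For illustration, the sketch below assembles an `InsertMsg` for one batch of rows. The `BaseMsg` timestamp fields and the exact package layout are assumptions based on the surrounding design, not exact code.

```go
func buildInsertMsg(segmentID, collectionID, partitionID UniqueID, channel string,
  timestamps []uint64, rowIDs []int64, rows []*commonpb.Blob) *InsertMsg {
  return &InsertMsg{
    BaseMsg: BaseMsg{
      BeginTimestamp: timestamps[0],                 // assumes timestamps is non-empty and sorted
      EndTimestamp:   timestamps[len(timestamps)-1],
    },
    InsertRequest: InsertRequest{
      Base:         &commonpb.MsgBase{MsgType: commonpb.MsgType_Insert},
      SegmentID:    segmentID,
      CollectionID: collectionID,
      PartitionID:  partitionID,
      ChannelID:    channel,
      Timestamps:   timestamps,
      RowIDs:       rowIDs,
      RowData:      rows,
    },
  }
}
```
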
#### 9.4 Data Node Interface

```go
type DataNode interface {
  Component

  // WatchDmChannels watches the channels about data manipulation.
  WatchDmChannels(ctx context.Context, req *datapb.WatchDmChannelsRequest) (*commonpb.Status, error)
  // FlushSegments notifies DataNode to flush the segments provided in the request. The flush tasks run asynchronously to this RPC; DataNode flushes the segments in the background.
  FlushSegments(ctx context.Context, req *datapb.FlushSegmentsRequest) (*commonpb.Status, error)
  // GetMetrics gets the metrics about DataNode.
  GetMetrics(ctx context.Context, req *milvuspb.GetMetricsRequest) (*milvuspb.GetMetricsResponse, error)
  // Compaction will add a compaction task according to the request plan
  Compaction(ctx context.Context, req *datapb.CompactionPlan) (*commonpb.Status, error)
}
```

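A short, illustrative sketch of the coordinator-side call that hands a set of vchannels to a DataNode; the helper name and the failure handling are assumptions of this example.

```go
func watchChannels(ctx context.Context, node DataNode, channelInfos []*datapb.VchannelInfo) error {
  // Tell the DataNode to start consuming the vchannels it has been assigned.
  status, err := node.WatchDmChannels(ctx, &datapb.WatchDmChannelsRequest{Vchannels: channelInfos})
  if err != nil {
    return err
  }
  if status.GetErrorCode() != commonpb.ErrorCode_Success {
    // The caller would typically reassign these channels to another DataNode.
    return errors.New(status.GetReason())
  }
  return nil
}
```
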
- _WatchDmChannels_

```go
type WatchDmChannelsRequest struct {
  Base *commonpb.MsgBase
  Vchannels []*VchannelInfo
}
```

- _FlushSegments_

```go
type FlushSegmentsRequest struct {
  Base *commonpb.MsgBase
  DbID UniqueID
  CollectionID UniqueID
  SegmentIDs []int64
}
```

#### 9.5 SegmentStatistics Update Channel

- _SegmentStatisticsMsg_

```go
type SegmentStatisticsUpdates struct {
  SegmentID UniqueID
  MemorySize int64
  NumRows int64
  CreateTime uint64
  EndTime uint64
  StartPosition *msgpb.MsgPosition
  EndPosition *msgpb.MsgPosition
}

type SegmentStatistics struct {
  Base *commonpb.MsgBase
  SegStats []*SegmentStatisticsUpdates
}

type SegmentStatisticsMsg struct {
  BaseMsg
  SegmentStatistics
}
```

#### 9.6 DataNode Time Tick Channel

- _DataNode Tt Msg_

```go
type DataNodeTtMsg struct {
  Base *commonpb.MsgBase
  ChannelName string
  Timestamp uint64
}
```
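
To close the loop, a DataNode-side sketch of producing these time ticks is shown below; the ticker interval and the `produce` and `latestConsumedTs` helpers are assumptions of this example.

```go
// Periodically report, for one vchannel, the timestamp up to which all data
// has been consumed, so that downstream components can advance their tSafe.
func reportTimeTick(ctx context.Context, produce func(*DataNodeTtMsg) error,
  vchannel string, latestConsumedTs func() uint64) {
  ticker := time.NewTicker(200 * time.Millisecond) // interval chosen arbitrarily
  defer ticker.Stop()
  for {
    select {
    case <-ctx.Done():
      return
    case <-ticker.C:
      _ = produce(&DataNodeTtMsg{
        ChannelName: vchannel,
        Timestamp:   latestConsumedTs(),
      })
    }
  }
}
```
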
# Milvus Developer Guides

by Rentong Guo, Sep 15, 2020

## Acknowledgement

TODO: a formal acknowledgement.

Main content: Rentong Guo, Qingxiang Chen (appendix b)

Figures: Xuan Yang, Zhenshan Cao

Design suggestions: Zhenshan Cao, Xi Ge, Yefu Chen, Guilin Gou, Yihao Dai, Jiquan Long, Xiaomeng Yi, Peng Xu, Hai Jin, Xiangzhou Guo