稀疏向量

稀疏向量是信息检索和自然语言处理中重要的数据表示方法。虽然密集向量因其出色的语义理解能力而受到欢迎，但在需要精确匹配关键词或短语的应用中，稀疏向量往往能提供更准确的结果。

概述

稀疏向量是高维向量的一种特殊表示，其中大部分元素为零，只有少数几个维度具有非零值。这一特性使稀疏向量在处理大规模、高维但稀疏的数据时特别有效。常见应用包括：

文本分析： 将文档表示为词袋向量，其中每个维度对应一个词，只有在文档中出现的词才有非零值。
推荐系统： 用户-物品交互矩阵，其中每个维度代表用户对特定物品的评分，大多数用户只与少数物品互动。
图像处理： 局部特征表示，只关注图像中的关键点，产生高维稀疏向量。

如下图所示，密集向量通常表示为连续数组，其中每个位置都有值（例如 [0.3, 0.8, 0.2, 0.3, 0.1]）。相比之下，稀疏向量只存储非零元素及其索引，通常表示为键值对（例如 [{2: 0.2}, ..., {9997: 0.5}, {9999: 0.7}]）。这种表示显著减少了存储空间并提高了计算效率，特别是在处理极高维数据（例如 10,000 维）时。

图：稀疏向量（Sparse Vector）的表示方式

稀疏向量可以使用各种方法生成，如文本处理中的 TF-IDF（词频-逆文档频率）和 BM25。此外，Milvus 提供了便利的方法来帮助生成和处理稀疏向量。有关详细信息，请参阅 Embeddings。

对于文本数据，Milvus 还提供全文搜索功能，允许您直接对原始文本数据执行向量搜索，而无需使用外部 embedding 模型来生成稀疏向量。有关更多信息，请参阅全文搜索。

向量化后，数据可以存储在 Milvus 中进行管理和向量检索。下图说明了基本流程。

图：在 Milvus 中使用稀疏向量（Use Sparse Vector）的基本流程

除了稀疏向量，Milvus 还支持密集向量和二进制向量。密集向量非常适合捕获深层语义关系，而二进制向量在快速相似性比较和内容去重等场景中表现出色。有关更多信息，请参阅密集向量和二进制向量。

使用稀疏向量

Milvus 支持使用以下任意一种格式来表示稀疏向量：

稀疏矩阵（使用 scipy.sparse 类）

from scipy.sparse import csr_matrix

# Create a sparse matrix
row = [0, 0, 1, 2, 2, 2]
col = [0, 2, 2, 0, 1, 2]
data = [1, 2, 3, 4, 5, 6]
sparse_matrix = csr_matrix((data, (row, col)), shape=(3, 3))

# Represent sparse vector using the sparse matrix
sparse_vector = sparse_matrix.getrow(0)

字典列表（格式为 {dimension_index: value, ...}）

Python Java

# Represent sparse vector using a dictionary
sparse_vector = [{1: 0.5, 100: 0.3, 500: 0.8, 1024: 0.2, 5000: 0.6}]

SortedMap<Long, Float> sparseVector = new TreeMap<>();
sparseVector.put(1L, 0.5f);
sparseVector.put(100L, 0.3f);
sparseVector.put(500L, 0.8f);
sparseVector.put(1024L, 0.2f);
sparseVector.put(5000L, 0.6f);

元组可迭代对象的列表（每个稀疏向量的格式为 [(dimension_index, value), ...]）

# Represent sparse vector using a list of tuples
sparse_vector = [[(1, 0.5), (100, 0.3), (500, 0.8), (1024, 0.2), (5000, 0.6)]]

添加向量 field

要在 Milvus 中使用稀疏向量，需要在创建 collection 时定义一个用于存储稀疏向量的 field。此过程包括：

将 datatype 设置为支持的稀疏向量数据类型 SPARSE_FLOAT_VECTOR。
无需指定维度。

Python Java NodeJS Go cURL

from pymilvus import MilvusClient, DataType

client = MilvusClient(uri="http://localhost:19530")

schema = client.create_schema(
    auto_id=True,
    enable_dynamic_fields=True,
)

schema.add_field(field_name="pk", datatype=DataType.VARCHAR, is_primary=True, max_length=100)
schema.add_field(field_name="sparse_vector", datatype=DataType.SPARSE_FLOAT_VECTOR)

import io.milvus.v2.client.ConnectConfig;
import io.milvus.v2.client.MilvusClientV2;

import io.milvus.v2.common.DataType;
import io.milvus.v2.service.collection.request.AddFieldReq;
import io.milvus.v2.service.collection.request.CreateCollectionReq;

MilvusClientV2 client = new MilvusClientV2(ConnectConfig.builder()
        .uri("http://localhost:19530")
        .build());
        
CreateCollectionReq.CollectionSchema schema = client.createSchema();
schema.setEnableDynamicField(true);
schema.addField(AddFieldReq.builder()
        .fieldName("pk")
        .dataType(DataType.VarChar)
        .isPrimaryKey(true)
        .autoID(true)
        .maxLength(100)
        .build());

schema.addField(AddFieldReq.builder()
        .fieldName("sparse_vector")
        .dataType(DataType.SparseFloatVector)
        .build());

import { DataType } from "@zilliz/milvus2-sdk-node";

const schema = [
  {
    name: "metadata",
    data_type: DataType.JSON,
  },
  {
    name: "pk",
    data_type: DataType.Int64,
    is_primary_key: true,
  },
  {
    name: "sparse_vector",
    data_type: DataType.SparseFloatVector,
  }
];

import (
    "context"
    "fmt"

    "github.com/milvus-io/milvus/client/v2/column"
    "github.com/milvus-io/milvus/client/v2/entity"
    "github.com/milvus-io/milvus/client/v2/index"
    "github.com/milvus-io/milvus/client/v2/milvusclient"
)

ctx, cancel := context.WithCancel(context.Background())
defer cancel()

milvusAddr := "localhost:19530"
client, err := milvusclient.New(ctx, &milvusclient.ClientConfig{
    Address: milvusAddr,
})
if err != nil {
    fmt.Println(err.Error())
    // handle error
}
defer client.Close(ctx)

schema := entity.NewSchema()
schema.WithField(entity.NewField().
    WithName("pk").
    WithDataType(entity.FieldTypeVarChar).
    WithIsAutoID(true).
    WithIsPrimaryKey(true).
    WithMaxLength(100),
).WithField(entity.NewField().
    WithName("sparse_vector").
    WithDataType(entity.FieldTypeSparseVector),
)

export primaryField='{
    "fieldName": "pk",
    "dataType": "VarChar",
    "isPrimary": true,
    "elementTypeParams": {
        "max_length": 100
    }
}'

export vectorField='{
    "fieldName": "sparse_vector",
    "dataType": "SparseFloatVector"
}'

export schema="{
    \"autoID\": true,
    \"fields\": [
        $primaryField,
        $vectorField
    ]
}"

在此示例中，添加了一个名为 sparse_vector 的向量 field，用于存储稀疏向量。该 field 的数据类型为 SPARSE_FLOAT_VECTOR。

为向量 field 设置 index 参数

为了加速基于稀疏向量的搜索，必须为向量 field 创建 index。

Python Java NodeJS Go cURL

# Prepare index parameters
index_params = client.prepare_index_params()

index_params.add_index(
    field_name="sparse_vector",
    index_name="sparse_vector_index",
    index_type="SPARSE_INVERTED_INDEX",
    metric_type="IP"
)

import io.milvus.v2.common.IndexParam;
import java.util.*;

List<IndexParam> indexes = new ArrayList<>();

indexes.add(IndexParam.builder()
        .fieldName("sparse_vector")
        .indexType(IndexParam.IndexType.SPARSE_INVERTED_INDEX)
        .metricType(IndexParam.MetricType.IP)
        .build());

import { MetricType, IndexType } from "@zilliz/milvus2-sdk-node";

const indexParams = {
    index_name: 'sparse_vector_index',
    field_name: 'sparse_vector',
    metric_type: MetricType.IP,
    index_type: IndexType.SPARSE_INVERTED_INDEX
};

idx := index.NewSparseInvertedIndex(index.MetricType(entity.IP))
indexOption := milvusclient.NewCreateIndexOption("my_collection", "sparse_vector", idx)

export indexParams='[
        {
            "fieldName": "sparse_vector",
            "metricType": "IP",
            "indexName": "sparse_vector_index",
            "indexType": "SPARSE_INVERTED_INDEX"
        }
    ]'

在上面的示例中，使用 SPARSE_INVERTED_INDEX index 类型为 sparse_vector field 创建了一个名为 sparse_vector_index 的 index。目前 metric_type 只支持 IP（内积）。

Milvus 还提供其他度量类型，如 BM25，这是一种专门为全文搜索设计的度量。有关更多信息，请参阅度量类型。

创建 collection

完成稀疏向量和 index 参数设置后，您可以创建包含稀疏向量的 collection。下面的示例使用 create_collection 方法创建一个名为 my_collection 的 collection。

Python Java NodeJS Go cURL

client.create_collection(
    collection_name="my_collection",
    schema=schema,
    index_params=index_params
)

import io.milvus.v2.client.ConnectConfig;
import io.milvus.v2.client.MilvusClientV2;

MilvusClientV2 client = new MilvusClientV2(ConnectConfig.builder()
        .uri("http://localhost:19530")
        .build());

CreateCollectionReq requestCreate = CreateCollectionReq.builder()
        .collectionName("my_collection")
        .collectionSchema(schema)
        .indexParams(indexes)
        .build();
client.createCollection(requestCreate);

import { MilvusClient } from "@zilliz/milvus2-sdk-node";

const client = new MilvusClient({
    address: 'http://localhost:19530'
});

await client.createCollection({
    collection_name: 'my_collection',
    schema: schema,
    index_params: indexParams
});

err = client.CreateCollection(ctx,
    milvusclient.NewCreateCollectionOption("my_collection", schema).
        WithIndexOptions(indexOption))
if err != nil {
    fmt.Println(err.Error())
    // handle error
}

curl --request POST \
--url "${CLUSTER_ENDPOINT}/v2/vectordb/collections/create" \
--header "Authorization: Bearer ${TOKEN}" \
--header "Content-Type: application/json" \
-d "{
    \"collectionName\": \"my_collection\",
    \"schema\": $schema,
    \"indexParams\": $indexParams
}"

插入数据

创建 collection 后，插入包含稀疏向量的数据。

Python Java NodeJS Go cURL

sparse_vectors = [
    {"sparse_vector": {1: 0.5, 100: 0.3, 500: 0.8}},
    {"sparse_vector": {10: 0.1, 200: 0.7, 1000: 0.9}},
]

client.insert(
    collection_name="my_collection",
    data=sparse_vectors
)

import com.google.gson.Gson;
import com.google.gson.JsonObject;
import io.milvus.v2.service.vector.request.InsertReq;
import io.milvus.v2.service.vector.response.InsertResp;

List<JsonObject> rows = new ArrayList<>();
Gson gson = new Gson();
{
    JsonObject row = new JsonObject();
    SortedMap<Long, Float> sparse = new TreeMap<>();
    sparse.put(1L, 0.5f);
    sparse.put(100L, 0.3f);
    sparse.put(500L, 0.8f);
    row.add("sparse_vector", gson.toJsonTree(sparse));
    rows.add(row);
}
{
    JsonObject row = new JsonObject();
    SortedMap<Long, Float> sparse = new TreeMap<>();
    sparse.put(10L, 0.1f);
    sparse.put(200L, 0.7f);
    sparse.put(1000L, 0.9f);
    row.add("sparse_vector", gson.toJsonTree(sparse));
    rows.add(row);
}

InsertResp insertR = client.insert(InsertReq.builder()
        .collectionName("my_collection")
        .data(rows)
        .build());

const data = [
  { sparse_vector: { "1": 0.5, "100": 0.3, "500": 0.8 } },
  { sparse_vector: { "10": 0.1, "200": 0.7, "1000": 0.9 } },
];
client.insert({
  collection_name: "my_collection",
  data: data,
});

v := make([]entity.SparseEmbedding, 0, 2)
sparseVector1, _ := entity.NewSliceSparseEmbedding([]uint32{1, 100, 500}, []float32{0.5, 0.3, 0.8})
v = append(v, sparseVector1)
sparseVector2, _ := entity.NewSliceSparseEmbedding([]uint32{10, 200, 1000}, []float32{0.1, 0.7, 0.9})
v = append(v, sparseVector2)
column := column.NewColumnSparseVectors("sparse_vector", v)

_, err = client.Insert(ctx, milvusclient.NewColumnBasedInsertOption("my_collection").
    WithColumns(column))
if err != nil {
    fmt.Println(err.Error())
    // handle err
}

curl --request POST \
--url "${CLUSTER_ENDPOINT}/v2/vectordb/entities/insert" \
--header "Authorization: Bearer ${TOKEN}" \
--header "Content-Type: application/json" \
-d '{
    "data": [
        {"sparse_vector": {"1": 0.5, "100": 0.3, "500": 0.8}},
        {"sparse_vector": {"10": 0.1, "200": 0.7, "1000": 0.9}}        
    ],
    "collectionName": "my_collection"
}'

## {"code":0,"cost":0,"data":{"insertCount":2,"insertIds":["453577185629572534","453577185629572535"]}}

执行相似性搜索

要使用稀疏向量执行相似性搜索，请准备查询向量和搜索参数。

# Prepare search parameters
search_params = {
    "params": {"drop_ratio_search": 0.2},  # A tunable drop ratio parameter with a valid range between 0 and 1
}

# Prepare the query vector
query_vector = [{1: 0.2, 50: 0.4, 1000: 0.7}]

在此示例中，drop_ratio_search 是专门针对稀疏向量的可选参数，用于控制搜索过程中忽略查询向量中较小值的比例。例如，使用 {"drop_ratio_search": 0.2} 时，查询向量中最小的 20% 的值将在搜索过程中被忽略。

然后，使用 search 方法执行相似性搜索：

Python Java NodeJS Go cURL

res = client.search(
    collection_name="my_collection",
    data=query_vector,
    limit=3,
    output_fields=["pk"],
    search_params=search_params,
)

print(res)

# Output
# data: ["[{'id': '453718927992172266', 'distance': 0.6299999952316284, 'entity': {'pk': '453718927992172266'}}, {'id': '453718927992172265', 'distance': 0.10000000149011612, 'entity': {'pk': '453718927992172265'}}]"]

import io.milvus.v2.service.vector.request.SearchReq;
import io.milvus.v2.service.vector.request.data.SparseFloatVec;
import io.milvus.v2.service.vector.response.SearchResp;

Map<String,Object> searchParams = new HashMap<>();
searchParams.put("drop_ratio_search", 0.2);

SortedMap<Long, Float> sparse = new TreeMap<>();
sparse.put(1L, 0.2f);
sparse.put(50L, 0.4f);
sparse.put(1000L, 0.7f);

SparseFloatVec queryVector = new SparseFloatVec(sparse);

SearchResp searchR = client.search(SearchReq.builder()
        .collectionName("my_collection")
        .data(Collections.singletonList(queryVector))
        .annsField("sparse_vector")
        .searchParams(searchParams)
        .topK(3)
        .outputFields(Collections.singletonList("pk"))
        .build());
        
System.out.println(searchR.getSearchResults());

// Output
//
// [[SearchResp.SearchResult(entity={pk=457270974427187729}, score=0.63, id=457270974427187729), SearchResp.SearchResult(entity={pk=457270974427187728}, score=0.1, id=457270974427187728)]]

await client.search({
    collection_name: 'my_collection',
    data: {1: 0.2, 50: 0.4, 1000: 0.7},
    limit: 3,
    output_fields: ['pk'],
    params: {
        drop_ratio_search: 0.2
    }
});

queryVector, _ := entity.NewSliceSparseEmbedding([]uint32{1, 50, 1000}, []float32{0.2, 0.4, 0.7})

annSearchParams := index.NewCustomAnnParam()
annSearchParams.WithExtraParam("drop_ratio_search", 0.2)
resultSets, err := client.Search(ctx, milvusclient.NewSearchOption(
    "my_collection", // collectionName
    3,                      // limit
    []entity.Vector{entity.SparseEmbedding(queryVector)},
).WithANNSField("sparse_vector").
    WithOutputFields("pk").
    WithAnnParam(annSearchParams))
if err != nil {
    fmt.Println(err.Error())
    // handle err
}

for _, resultSet := range resultSets {
    fmt.Println("IDs: ", resultSet.IDs.FieldData().GetScalars())
    fmt.Println("Scores: ", resultSet.Scores)
    fmt.Println("Pks: ", resultSet.GetColumn("pk").FieldData().GetScalars())
}

// Results:
//   IDs:  string_data:{data:"457270974427187705"  data:"457270974427187704"}
//   Scores:  [0.63 0.1]
//   Pks:  string_data:{data:"457270974427187705"  data:"457270974427187704"}

curl --request POST \
--url "${CLUSTER_ENDPOINT}/v2/vectordb/entities/search" \
--header "Authorization: Bearer ${TOKEN}" \
--header "Content-Type: application/json" \
-d '{
    "collectionName": "my_collection",
    "data": [
        {"1": 0.2, "50": 0.4, "1000": 0.7}
    ],
    "annsField": "sparse_vector",
    "limit": 3,
    "searchParams":{
        "params":{"drop_ratio_search": 0.2}
    },
    "outputFields": ["pk"]
}'

## {"code":0,"cost":0,"data":[{"distance":0.63,"id":"453577185629572535","pk":"453577185629572535"},{"distance":0.1,"id":"453577185629572534","pk":"453577185629572534"}]}

有关相似性搜索参数的更多信息，请参阅基础 ANN 搜索。

概述

使用稀疏向量

添加向量 field

为向量 field 设置 index 参数

创建 collection

插入数据

执行相似性搜索

概述

使用稀疏向量

添加向量 field

为向量 field 设置 index 参数

创建 collection

插入数据

执行相似性搜索