Elasticsearch¶
从基础到高级,涵盖安装、索引管理、CRUD、查询DSL、聚合、分词、集群、性能优化、Python客户端等核心知识点。以 Elasticsearch 8.x 为基准。
目录¶
- 一、基础篇
安装与启动
核心概念
REST API 基础
索引管理
- 二、Mapping 篇
字段类型
动态 Mapping
自定义 Mapping
索引模板
- 三、文档 CRUD 篇
新增文档
查询文档
更新文档
删除文档
批量操作
- 四、查询 DSL 篇
全文查询
精确查询
范围查询
复合查询(bool)
嵌套查询
地理位置查询
- 五、分词篇
内置分析器
中文分词(IK)
自定义分析器
分析 API
- 六、聚合篇
Bucket 聚合
Metric 聚合
Pipeline 聚合
嵌套聚合
- 七、搜索进阶篇
相关性评分
高亮显示
分页与深度分页
排序
字段折叠
搜索建议
- 八、索引优化篇
写入优化
查询优化
索引生命周期(ILM)
冷热分层
- 九、集群篇
集群架构
节点类型
分片与副本
集群监控
- 十、Python 客户端篇
安装与连接
索引操作
文档 CRUD
搜索
聚合
批量操作
一、基础篇¶
1.1 安装与启动¶
# Docker 启动(推荐开发环境)
docker run -d \
--name elasticsearch \
-p 9200:9200 \
-p 9300:9300 \
-e "discovery.type=single-node" \
-e "xpack.security.enabled=false" \
-e "ES_JAVA_OPTS=-Xms1g -Xmx1g" \
elasticsearch:8.12.0
# Docker Compose(ES + Kibana)
# 见部署篇
# 验证
curl http://localhost:9200
curl http://localhost:9200/_cluster/health?pretty
# Ubuntu 安装
wget -qO - https://artifacts.elastic.co/GPG-KEY-elasticsearch | sudo gpg --dearmor -o /usr/share/keyrings/elasticsearch-keyring.gpg
echo "deb [signed-by=/usr/share/keyrings/elasticsearch-keyring.gpg] https://artifacts.elastic.co/packages/8.x/apt stable main" | sudo tee /etc/apt/sources.list.d/elastic-8.x.list
sudo apt update && sudo apt install elasticsearch
sudo systemctl start elasticsearch
1.2 核心概念¶
概念 |
类比关系型数据库 |
说明 |
|---|---|---|
Index(索引) |
Database(数据库) |
文档的集合 |
Document(文档) |
Row(行) |
一条数据,JSON 格式 |
Field(字段) |
Column(列) |
文档的属性 |
Mapping(映射) |
Schema(表结构) |
字段类型定义 |
Shard(分片) |
- |
索引的水平拆分 |
Replica(副本) |
- |
分片的备份 |
Node(节点) |
- |
一个 ES 实例 |
Cluster(集群) |
- |
多个节点的集合 |
关键概念说明
倒排索引:将词条映射到文档ID,实现全文搜索的核心数据结构
分片(Shard):索引数据的水平分割,默认1个主分片,创建后不可修改
副本(Replica):主分片的备份,提高可用性和读性能,可动态修改数量
段(Segment):Lucene 的基本存储单位,不可变,定期合并
1.3 REST API 基础¶
# ES 使用 RESTful API,格式:
# METHOD /index/_action
# Content-Type: application/json
# 常用端点
GET / # 集群信息
GET /_cluster/health # 集群健康
GET /_cat/indices?v # 查看所有索引(表格格式)
GET /_cat/nodes?v # 查看所有节点
GET /_cat/shards?v # 查看分片分布
GET /_cat/aliases?v # 查看别名
# 索引操作
PUT /my_index # 创建索引
GET /my_index # 查看索引信息
DELETE /my_index # 删除索引
HEAD /my_index # 检查索引是否存在(200/404)
# 文档操作
POST /my_index/_doc # 新增(自动生成ID)
PUT /my_index/_doc/1 # 新增/替换(指定ID)
GET /my_index/_doc/1 # 获取文档
DELETE /my_index/_doc/1 # 删除文档
POST /my_index/_update/1 # 更新文档
POST /my_index/_bulk # 批量操作
POST /my_index/_search # 搜索
1.4 索引管理¶
# 创建索引(指定分片、副本、Mapping)
PUT /articles
{
"settings": {
"number_of_shards": 3,
"number_of_replicas": 1,
"refresh_interval": "1s",
"analysis": {
"analyzer": {
"my_analyzer": {
"type": "custom",
"tokenizer": "ik_max_word"
}
}
}
},
"mappings": {
"properties": {
"title": { "type": "text", "analyzer": "ik_max_word" },
"content": { "type": "text", "analyzer": "ik_max_word" },
"author": { "type": "keyword" },
"tags": { "type": "keyword" },
"views": { "type": "integer" },
"publish_at": { "type": "date", "format": "yyyy-MM-dd HH:mm:ss||epoch_millis" }
}
}
}
# 修改索引设置(不能修改分片数)
PUT /articles/_settings
{
"number_of_replicas": 2,
"refresh_interval": "5s"
}
# 关闭/打开索引(关闭后不可读写,但占用资源极少)
POST /articles/_close
POST /articles/_open
# 索引别名(解耦索引名与应用,支持零停机重建索引)
POST /_aliases
{
"actions": [
{ "add": { "index": "articles_v2", "alias": "articles" } },
{ "remove": { "index": "articles_v1", "alias": "articles" } }
]
}
# 为别名添加过滤器(虚拟视图)
POST /_aliases
{
"actions": [{
"add": {
"index": "articles",
"alias": "published_articles",
"filter": { "term": { "status": "published" } }
}
}]
}
# Reindex(重建索引,用于修改 Mapping)
POST /_reindex
{
"source": { "index": "articles_v1" },
"dest": { "index": "articles_v2" }
}
# 按条件 Reindex
POST /_reindex
{
"source": {
"index": "articles_v1",
"query": { "term": { "status": "published" } }
},
"dest": { "index": "articles_v2" }
}
二、Mapping 篇¶
2.1 字段类型¶
文本类型
类型 |
说明 |
|---|---|
text |
全文搜索,会分词,不支持排序/聚合 |
keyword |
精确匹配,不分词,支持排序/聚合/过滤 |
match_only_text |
只支持全文搜索,节省存储(8.0+) |
数值类型
类型 |
说明 |
|---|---|
byte / short / integer / long |
整数 |
float / double / half_float |
浮点数 |
scaled_float |
缩放浮点(如价格用 scaling_factor=100) |
unsigned_long |
无符号长整型 |
其他类型
类型 |
说明 |
|---|---|
boolean |
布尔 |
date |
日期 |
ip |
IP 地址 |
geo_point |
地理坐标(经纬度) |
geo_shape |
地理形状 |
object |
嵌套对象(扁平化存储) |
nested |
嵌套对象(独立索引,支持独立查询) |
flattened |
扁平化对象(key 为 keyword) |
dense_vector |
稠密向量(向量搜索) |
2.2 动态 Mapping¶
# ES 自动推断字段类型规则
# true/false → boolean
# 123 → long
# 1.5 → float
# "2024-01-01" → date(匹配日期格式)
# "hello" → text + keyword(自动multi-field)
# 查看自动生成的 Mapping
GET /my_index/_mapping
# 动态 Mapping 控制
PUT /my_index
{
"mappings": {
"dynamic": "strict" # true=自动创建(默认), false=忽略新字段, strict=新字段报错
}
}
# 动态模板(批量定义字段规则)
PUT /my_index
{
"mappings": {
"dynamic_templates": [
{
"strings_as_keyword": {
"match_mapping_type": "string",
"mapping": { "type": "keyword" } # 所有字符串字段默认 keyword
}
},
{
"long_as_integer": {
"match_mapping_type": "long",
"mapping": { "type": "integer" }
}
},
{
"price_fields": {
"match": "*_price", # 字段名匹配
"mapping": {
"type": "scaled_float",
"scaling_factor": 100
}
}
}
]
}
}
2.3 自定义 Mapping¶
PUT /products
{
"mappings": {
"properties": {
"id": { "type": "long" },
"name": {
"type": "text",
"analyzer": "ik_max_word",
"search_analyzer": "ik_smart",
"fields": {
"keyword": { "type": "keyword", "ignore_above": 256 }
# name.keyword 可以精确匹配和聚合
}
},
"description": {
"type": "text",
"analyzer": "ik_max_word",
"index_options": "positions" # offsets/positions/freqs/docs
},
"price": {
"type": "scaled_float",
"scaling_factor": 100
},
"category": { "type": "keyword" },
"tags": { "type": "keyword" },
"stock": { "type": "integer" },
"is_active": { "type": "boolean" },
"created_at": {
"type": "date",
"format": "yyyy-MM-dd HH:mm:ss||yyyy-MM-dd||epoch_millis"
},
"location": { "type": "geo_point" },
"images": {
"type": "object",
"properties": {
"url": { "type": "keyword" },
"width": { "type": "integer" },
"height": { "type": "integer" }
}
},
"specs": {
"type": "nested", # 用 nested 才能独立查询数组中的对象
"properties": {
"name": { "type": "keyword" },
"value": { "type": "keyword" }
}
},
"suggest": {
"type": "completion" # 搜索建议字段
}
}
}
}
2.4 索引模板¶
# 索引模板(新建匹配的索引时自动应用)
PUT /_index_template/logs_template
{
"index_patterns": ["logs-*", "events-*"],
"priority": 100,
"template": {
"settings": {
"number_of_shards": 2,
"number_of_replicas": 1,
"refresh_interval": "5s"
},
"mappings": {
"properties": {
"@timestamp": { "type": "date" },
"level": { "type": "keyword" },
"message": { "type": "text" },
"service": { "type": "keyword" },
"host": { "type": "keyword" }
}
},
"aliases": {
"all_logs": {}
}
}
}
# 组件模板(可复用的 Mapping 片段)
PUT /_component_template/timestamp_mapping
{
"template": {
"mappings": {
"properties": {
"created_at": { "type": "date" },
"updated_at": { "type": "date" }
}
}
}
}
# 在索引模板中引用组件模板
PUT /_index_template/my_template
{
"index_patterns": ["my_*"],
"composed_of": ["timestamp_mapping"]
}
三、文档 CRUD 篇¶
3.1 新增文档¶
# 自动生成 ID(POST)
POST /articles/_doc
{
"title": "Elasticsearch 入门",
"author": "Alice",
"tags": ["搜索", "大数据"],
"views": 100,
"publish_at": "2024-01-01 10:00:00"
}
# 指定 ID(PUT/POST)
PUT /articles/_doc/1
{
"title": "Elasticsearch 入门",
"author": "Alice"
}
# 仅创建,ID 已存在则报错(op_type=create)
PUT /articles/_doc/1?op_type=create
PUT /articles/_create/1
{
"title": "新文章"
}
3.2 查询文档¶
# 根据 ID 获取
GET /articles/_doc/1
# 只返回 _source
GET /articles/_source/1
# 指定返回字段
GET /articles/_doc/1?_source=title,author
# 批量获取(mget)
GET /_mget
{
"docs": [
{ "_index": "articles", "_id": "1" },
{ "_index": "articles", "_id": "2", "_source": ["title"] }
]
}
# 同一索引批量获取
GET /articles/_mget
{
"ids": ["1", "2", "3"]
}
# 检查文档是否存在
HEAD /articles/_doc/1
3.3 更新文档¶
# 部分更新(update,保留原有字段)
POST /articles/_update/1
{
"doc": {
"views": 200,
"title": "Elasticsearch 入门(更新版)"
}
}
# 不存在则创建(upsert)
POST /articles/_update/1
{
"doc": { "views": 200 },
"upsert": {
"title": "默认标题",
"views": 200,
"author": "unknown"
}
}
# 脚本更新(Painless 脚本)
POST /articles/_update/1
{
"script": {
"source": "ctx._source.views += params.increment",
"lang": "painless",
"params": { "increment": 1 }
}
}
# 条件更新(按查询更新)
POST /articles/_update_by_query
{
"query": { "term": { "author": "Alice" } },
"script": {
"source": "ctx._source.verified = true",
"lang": "painless"
}
}
# 完全替换(PUT,整个文档替换)
PUT /articles/_doc/1
{
"title": "全新内容(原字段全部丢失)",
"author": "Bob"
}
3.4 删除文档¶
# 按 ID 删除
DELETE /articles/_doc/1
# 按查询删除
POST /articles/_delete_by_query
{
"query": {
"range": {
"publish_at": { "lt": "2020-01-01" }
}
}
}
# 异步删除(大量数据时)
POST /articles/_delete_by_query?wait_for_completion=false
{
"query": { "match_all": {} }
}
3.5 批量操作(Bulk)¶
# bulk API(每两行为一组:操作行 + 数据行)
POST /_bulk
{ "index": { "_index": "articles", "_id": "1" } }
{ "title": "文章1", "author": "Alice" }
{ "index": { "_index": "articles", "_id": "2" } }
{ "title": "文章2", "author": "Bob" }
{ "update": { "_index": "articles", "_id": "1" } }
{ "doc": { "views": 100 } }
{ "delete": { "_index": "articles", "_id": "3" } }
# 同一索引的 bulk
POST /articles/_bulk
{ "index": { "_id": "1" } }
{ "title": "文章1", "author": "Alice" }
{ "create": { "_id": "2" } }
{ "title": "文章2" }
{ "update": { "_id": "1" } }
{ "doc": { "views": 100 } }
{ "delete": { "_id": "3" } }
建议每批 5~15MB,5000~10000 条,过大会占用大量内存。
四、查询 DSL 篇¶
4.1 全文查询¶
# match(标准全文查询,会分词)
GET /articles/_search
{
"query": {
"match": {
"title": "Elasticsearch 搜索"
# 默认 OR,匹配任意词
}
}
}
# match(AND 模式,全部词必须匹配)
{
"query": {
"match": {
"title": {
"query": "Elasticsearch 搜索",
"operator": "and"
}
}
}
}
# match_phrase(短语匹配,词序一致,位置相邻)
{
"query": {
"match_phrase": {
"title": {
"query": "全文搜索引擎",
"slop": 1 # 允许词间距离
}
}
}
}
# match_phrase_prefix(前缀短语匹配,搜索补全)
{
"query": {
"match_phrase_prefix": {
"title": "Elastic"
}
}
}
# multi_match(多字段匹配)
{
"query": {
"multi_match": {
"query": "Elasticsearch 教程",
"fields": ["title^3", "content", "tags"], # ^3 表示 title 权重×3
"type": "best_fields"
# best_fields: 取最高分字段的分数(默认)
# most_fields: 所有字段分数相加
# cross_fields: 跨字段匹配(适合姓名等)
# phrase: 短语匹配
}
}
}
# query_string(支持 Lucene 语法)
{
"query": {
"query_string": {
"query": "title:(Elasticsearch AND 入门) AND author:Alice",
"default_field": "content"
}
}
}
# simple_query_string(用户输入,容错性强)
{
"query": {
"simple_query_string": {
"query": "Elasticsearch +入门 -过时",
"fields": ["title", "content"],
"default_operator": "AND"
}
}
}
4.2 精确查询¶
# term(精确匹配,不分词,用于 keyword/数字/布尔)
{
"query": {
"term": {
"author": { "value": "Alice" }
}
}
}
# terms(IN 查询)
{
"query": {
"terms": {
"tags": ["搜索", "大数据", "Python"]
}
}
}
# ids(按 ID 批量查询)
{
"query": {
"ids": { "values": ["1", "2", "3"] }
}
}
# exists(字段存在)
{
"query": {
"exists": { "field": "tags" }
}
}
# prefix(前缀匹配,keyword 字段)
{
"query": {
"prefix": {
"title.keyword": { "value": "Elastic" }
}
}
}
# wildcard(通配符,* 任意多字符,? 单个字符)
{
"query": {
"wildcard": {
"title.keyword": { "value": "Elastic*" }
}
}
}
# regexp(正则匹配,性能差慎用)
{
"query": {
"regexp": {
"email": { "value": ".*@gmail\\.com" }
}
}
}
# fuzzy(模糊匹配,处理拼写错误)
{
"query": {
"fuzzy": {
"title": {
"value": "Elasticsearh", # 拼写错误
"fuzziness": "AUTO", # 自动(0/1/2)
"prefix_length": 2 # 前N位不模糊
}
}
}
}
4.3 范围查询¶
# range(范围查询)
{
"query": {
"range": {
"views": {
"gte": 100,
"lte": 10000
}
}
}
}
# 日期范围
{
"query": {
"range": {
"publish_at": {
"gte": "2024-01-01",
"lt": "2025-01-01",
"format": "yyyy-MM-dd",
"time_zone": "+08:00"
}
}
}
}
# 相对日期
{
"query": {
"range": {
"publish_at": {
"gte": "now-7d/d", # 7天前,取整到天
"lte": "now/d" # 今天
}
}
}
}
4.4 复合查询(bool)¶
# bool 查询(最常用的复合查询)
{
"query": {
"bool": {
"must": [ # 必须匹配(影响相关性分数)
{ "match": { "title": "Elasticsearch" } },
{ "term": { "status": "published" } }
],
"must_not": [ # 必须不匹配
{ "term": { "author": "spam_user" } }
],
"should": [ # 可选匹配(匹配则分数更高)
{ "term": { "tags": "推荐" } },
{ "range": { "views": { "gte": 1000 } } }
],
"minimum_should_match": 1, # should 至少匹配1个
"filter": [ # 过滤(不影响分数,有缓存)
{ "term": { "is_active": true } },
{ "range": { "views": { "gte": 10 } } }
]
}
}
}
must / filter / should / must_not 对比
子句 |
影响相关性分数 |
是否必须匹配 |
是否缓存 |
|---|---|---|---|
must |
✅ |
✅ |
❌ |
filter |
❌ |
✅ |
✅(推荐用于过滤条件) |
should |
✅ |
❌ |
❌ |
must_not |
❌ |
必须不匹配 |
✅ |
4.5 嵌套查询(nested)¶
# 查询 nested 类型的字段(必须用 nested query)
{
"query": {
"nested": {
"path": "specs",
"query": {
"bool": {
"must": [
{ "term": { "specs.name": "颜色" } },
{ "term": { "specs.value": "红色" } }
]
}
},
"score_mode": "avg" # avg / max / sum / none
}
}
}
4.6 地理位置查询¶
# 圆形范围查询
{
"query": {
"geo_distance": {
"distance": "5km",
"location": {
"lat": 39.90,
"lon": 116.40
}
}
}
}
# 矩形范围查询
{
"query": {
"geo_bounding_box": {
"location": {
"top_left": { "lat": 40.0, "lon": 116.0 },
"bottom_right": { "lat": 39.5, "lon": 117.0 }
}
}
}
}
# 距离排序
{
"sort": [{
"_geo_distance": {
"location": { "lat": 39.90, "lon": 116.40 },
"order": "asc",
"unit": "km"
}
}]
}
五、分词篇¶
5.1 内置分析器¶
分析器 |
说明 |
|---|---|
standard |
默认,按词边界分词,小写,适合英文 |
simple |
按非字母字符分词,小写 |
whitespace |
按空格分词,不小写 |
stop |
simple + 停用词过滤 |
keyword |
不分词,整体作为一个词条 |
pattern |
正则分词 |
language |
语言特定(english/french 等) |
fingerprint |
去重排序后合并 |
# 测试分析器
GET /_analyze
{
"analyzer": "standard",
"text": "Elasticsearch 全文搜索引擎"
}
# 测试指定索引的分析器
GET /articles/_analyze
{
"field": "title",
"text": "Elasticsearch 全文搜索"
}
5.2 中文分词(IK)¶
# Install the IK analysis plugin (the plugin version must match the ES version)
bin/elasticsearch-plugin install https://github.com/medcl/elasticsearch-analysis-ik/releases/download/v8.12.0/elasticsearch-analysis-ik-8.12.0.zip
# Or install inside the Docker container. IK is a third-party plugin, so it cannot
# be installed by name (only official plugins can); pass the release URL instead
docker exec -it elasticsearch ./bin/elasticsearch-plugin install https://github.com/medcl/elasticsearch-analysis-ik/releases/download/v8.12.0/elasticsearch-analysis-ik-8.12.0.zip
# Restart ES for the plugin to take effect
# IK 两种模式
# ik_max_word:细粒度分词(索引时用,最多词条)
# ik_smart:粗粒度分词(搜索时用,语义更准确)
GET /_analyze
{
"analyzer": "ik_max_word",
"text": "中华人民共和国国歌"
}
# → [中华人民共和国, 中华人民, 中华, 华人, 人民共和国, 人民, 共和国, 共和, 国歌]
GET /_analyze
{
"analyzer": "ik_smart",
"text": "中华人民共和国国歌"
}
# → [中华人民共和国, 国歌]
# 自定义词典(热更新)
# elasticsearch/config/analysis-ik/IKAnalyzer.cfg.xml
# <entry key="remote_ext_dict">http://yourserver/dict.txt</entry>
5.3 自定义分析器¶
PUT /my_index
{
"settings": {
"analysis": {
"char_filter": {
"html_strip": { "type": "html_strip" }, # 去除 HTML 标签
"replace_and": {
"type": "mapping",
"mappings": ["& => and", "| => or"]
}
},
"tokenizer": {
"comma_tokenizer": {
"type": "pattern",
"pattern": "," # 按逗号分词
}
},
"filter": {
"my_stop": {
"type": "stop",
"stopwords": ["的", "了", "是", "在", "我", "有", "和"] # 停用词
},
"my_synonym": {
"type": "synonym",
"synonyms": ["手机,mobile => 手机", "电脑,PC,计算机"] # 同义词
},
"edge_ngram_filter": {
"type": "edge_ngram",
"min_gram": 1,
"max_gram": 20 # 前缀搜索补全
}
},
"analyzer": {
"my_cn_analyzer": {
"type": "custom",
"char_filter": ["html_strip"],
"tokenizer": "ik_max_word",
"filter": ["lowercase", "my_stop", "my_synonym"]
},
"autocomplete_analyzer": {
"type": "custom",
"tokenizer": "standard",
"filter": ["lowercase", "edge_ngram_filter"]
}
},
"normalizer": { # keyword 字段的 normalizer(类似 analyzer 但不分词)
"lowercase_normalizer": {
"type": "custom",
"filter": ["lowercase", "asciifolding"]
}
}
}
}
}
六、聚合篇¶
6.1 Bucket 聚合(分组)¶
# terms(分组计数,类似 GROUP BY)
GET /articles/_search
{
"size": 0, # 不返回文档,只返回聚合结果
"aggs": {
"by_author": {
"terms": {
"field": "author",
"size": 10, # 返回前10个桶
"order": { "_count": "desc" }
}
}
}
}
# date_histogram(按时间分组)
{
"aggs": {
"articles_over_time": {
"date_histogram": {
"field": "publish_at",
"calendar_interval": "month", # month/week/day/hour/minute
"format": "yyyy-MM",
"min_doc_count": 0, # 空桶也返回
"extended_bounds": {
"min": "2024-01-01",
"max": "2024-12-31"
}
}
}
}
}
# histogram(数值直方图)
{
"aggs": {
"price_ranges": {
"histogram": {
"field": "price",
"interval": 100,
"min_doc_count": 0
}
}
}
}
# range(自定义范围分组)
{
"aggs": {
"price_range": {
"range": {
"field": "price",
"ranges": [
{ "to": 100 },
{ "from": 100, "to": 500 },
{ "from": 500, "key": "high_end" }
]
}
}
}
}
# filter(过滤聚合)
{
"aggs": {
"recent_articles": {
"filter": {
"range": { "publish_at": { "gte": "now-30d" } }
},
"aggs": {
"by_author": {
"terms": { "field": "author" }
}
}
}
}
}
6.2 Metric 聚合(统计)¶
{
"aggs": {
"avg_views": { "avg": { "field": "views" } },
"max_views": { "max": { "field": "views" } },
"min_views": { "min": { "field": "views" } },
"sum_views": { "sum": { "field": "views" } },
"total_docs": { "value_count": { "field": "views" } },
# stats:一次性获取 count/min/max/avg/sum
"views_stats": { "stats": { "field": "views" } },
# extended_stats:含方差/标准差
"views_ext": { "extended_stats": { "field": "views" } },
# percentiles:百分位数
"views_percentiles": {
"percentiles": {
"field": "views",
"percents": [50, 75, 90, 95, 99]
}
},
# cardinality:基数(近似去重计数)
"unique_authors": {
"cardinality": {
"field": "author",
"precision_threshold": 100 # 精度,越大越准但内存越多
}
},
# top_hits:每个分组的 top N 文档
"top_articles": {
"top_hits": {
"size": 3,
"_source": ["title", "author"],
"sort": [{ "views": { "order": "desc" } }]
}
}
}
}
6.3 嵌套聚合¶
# 先分组,再统计(最常用模式)
{
"size": 0,
"aggs": {
"by_category": {
"terms": { "field": "category", "size": 10 },
"aggs": {
"avg_price": { "avg": { "field": "price" } },
"max_price": { "max": { "field": "price" } },
"top_products": {
"top_hits": {
"size": 3,
"_source": ["name", "price"],
"sort": [{ "price": { "order": "desc" } }]
}
}
}
}
}
}
# 先过滤再聚合(query + aggs)
{
"query": {
"bool": {
"filter": [
{ "term": { "is_active": true } },
{ "range": { "publish_at": { "gte": "2024-01-01" } } }
]
}
},
"size": 0,
"aggs": {
"by_author": {
"terms": { "field": "author", "size": 5 }
}
}
}
6.4 Pipeline 聚合¶
# Pipeline aggregations: aggregate over the output of other aggregations
{
  "size": 0,
  "aggs": {
    "monthly_sales": {
      "date_histogram": {
        "field": "order_date",
        "calendar_interval": "month"
      },
      "aggs": {
        "total_amount": { "sum": { "field": "amount" } },
        # Month-over-month change: derivative yields the difference between
        # consecutive buckets (an absolute delta, not a percentage rate)
        "sales_growth": {
          "derivative": {
            "buckets_path": "total_amount"
          }
        },
        # Moving average over a 3-bucket window. moving_avg was removed in 8.0;
        # moving_fn replaces it. It is a parent pipeline aggregation, so it must be
        # nested inside the date_histogram and reference the sibling metric directly
        "avg_sales": {
          "moving_fn": {
            "buckets_path": "total_amount",
            "window": 3,
            "script": "MovingFunctions.unweightedAvg(values)"
          }
        }
      }
    },
    # Largest monthly total across all months (sibling pipeline aggregation)
    "best_month": {
      "max_bucket": {
        "buckets_path": "monthly_sales>total_amount"
      }
    }
  }
}
七、搜索进阶篇¶
7.1 相关性评分¶
# 自定义评分(function_score)
{
"query": {
"function_score": {
"query": { "match": { "title": "Elasticsearch" } },
"functions": [
{
# 按字段值加权
"field_value_factor": {
"field": "views",
"factor": 0.1,
"modifier": "log1p", # log1p(views * 0.1)
"missing": 1
}
},
{
# 时间衰减(越新分越高)
"gauss": {
"publish_at": {
"origin": "now",
"scale": "30d",
"offset": "7d",
"decay": 0.5
}
}
},
{
# 固定加分
"filter": { "term": { "is_featured": true } },
"weight": 5
}
],
"score_mode": "sum", # 多个 function 得分如何合并:sum/avg/max/min/multiply
"boost_mode": "sum" # function 得分与 query 得分如何合并
}
}
}
# 固定分数(constant_score,不计算相关性)
{
"query": {
"constant_score": {
"filter": { "term": { "status": "published" } },
"boost": 1.0
}
}
}
7.2 高亮显示¶
{
"query": { "match": { "content": "Elasticsearch 搜索" } },
"highlight": {
"pre_tags": ["<em>"],
"post_tags": ["</em>"],
"fields": {
"title": { "number_of_fragments": 0 }, # 0=返回完整字段
"content": {
"fragment_size": 150, # 片段大小(字符)
"number_of_fragments": 3, # 返回片段数
"order": "score" # 按相关性排序
}
},
"require_field_match": false # false=所有字段都高亮(即使不是查询字段)
}
}
7.3 分页与深度分页¶
# 普通分页(from + size,最多 10000 条)
{
"from": 0,
"size": 20,
"query": { "match_all": {} }
}
# 深度分页方案1:search_after(游标分页,推荐)
# 第一页
{
"size": 20,
"sort": [
{ "publish_at": "desc" },
{ "_id": "asc" } # 必须包含唯一字段保证稳定性
],
"query": { "match_all": {} }
}
# 取最后一条记录的 sort 值,作为下一页的 search_after
{
"size": 20,
"sort": [{ "publish_at": "desc" }, { "_id": "asc" }],
"search_after": ["2024-01-15T10:00:00", "abc123"]
}
# 深度分页方案2:scroll(导出大量数据,不适合实时搜索)
# 初始化 scroll
POST /articles/_search?scroll=1m # 保持1分钟
{
"size": 1000,
"query": { "match_all": {} },
"sort": ["_doc"] # 按磁盘顺序,最快
}
# 继续滚动(使用返回的 _scroll_id)
POST /_search/scroll
{
"scroll": "1m",
"scroll_id": "DXF1ZXJ5QW5kRmV0Y2..."
}
# 清除 scroll
DELETE /_search/scroll
{
"scroll_id": "DXF1ZXJ5QW5kRmV0Y2..."
}
# 修改最大 from+size 限制(不推荐)
PUT /articles/_settings
{
"max_result_window": 50000
}
7.4 搜索建议(Suggest)¶
# term suggest(拼写纠错)
{
"suggest": {
"title_suggest": {
"text": "Elasticsearh", # 输入(含拼写错误)
"term": {
"field": "title",
"suggest_mode": "popular", # missing/popular/always
"max_edits": 2,
"sort": "frequency"
}
}
}
}
# phrase suggest(短语纠错)
{
"suggest": {
"phrase_suggest": {
"text": "Elasticsearch 全文搜索教程",
"phrase": {
"field": "title",
"gram_size": 3,
"highlight": {
"pre_tag": "<em>",
"post_tag": "</em>"
}
}
}
}
}
# completion suggest(自动补全,最快,需 completion 类型字段)
{
"suggest": {
"title_autocomplete": {
"prefix": "Ela",
"completion": {
"field": "suggest",
"size": 10,
"skip_duplicates": true,
"fuzzy": {
"fuzziness": 1
}
}
}
}
}
八、索引优化篇¶
8.1 写入优化¶
# 批量写入(bulk)
# 建议单批 5~15MB,5000~10000 条
# 临时禁用副本(写入期间)
PUT /my_index/_settings
{ "number_of_replicas": 0 }
# 写入完成后恢复
PUT /my_index/_settings
{ "number_of_replicas": 1 }
# 延长刷新间隔(减少 Segment 生成频率)
PUT /my_index/_settings
{ "refresh_interval": "30s" }
# 初始化大量数据时
PUT /my_index/_settings
{
"refresh_interval": "-1", # 禁用自动刷新
"number_of_replicas": 0
}
# 导入完成后
POST /my_index/_refresh
PUT /my_index/_settings
{
"refresh_interval": "1s",
"number_of_replicas": 1
}
# translog 配置(异步写,性能好但可能丢失)
PUT /my_index/_settings
{
"translog": {
"durability": "async",
"sync_interval": "30s",
"flush_threshold_size": "512mb"
}
}
8.2 查询优化¶
# 1. filter 代替 query(可缓存,无评分计算)
# 有相关性排序需求 → must/should
# 只做过滤 → filter/must_not
# 2. 避免 wildcard/regexp(性能差)
# 改用 ngram tokenizer 或 prefix 查询
# 3. 避免深度 from 翻页,改用 search_after
# 4. 只返回需要的字段
{
"_source": ["title", "author", "publish_at"]
}
# 5. 使用 routing 路由到特定分片
PUT /articles/_doc/1?routing=alice
GET /articles/_search?routing=alice
{
"query": { "term": { "author": "alice" } }
}
# 6. 强制合并(只对只读索引使用,合并小 Segment)
POST /articles/_forcemerge?max_num_segments=1
# 7. 查看查询计划
GET /articles/_search
{
"explain": true,
"query": { "match": { "title": "Elasticsearch" } }
}
# 8. Profile API(分析查询性能)
GET /articles/_search
{
"profile": true,
"query": { "match": { "title": "Elasticsearch" } }
}
8.3 索引生命周期(ILM)¶
# 定义 ILM 策略(适合日志、时序数据)
PUT /_ilm/policy/logs_policy
{
"policy": {
"phases": {
"hot": {
"min_age": "0ms",
"actions": {
"rollover": {
"max_size": "50gb",
"max_age": "7d",
"max_docs": 1000000
},
"set_priority": { "priority": 100 }
}
},
"warm": {
"min_age": "7d",
"actions": {
"shrink": { "number_of_shards": 1 },
"forcemerge": { "max_num_segments": 1 },
"set_priority": { "priority": 50 }
}
},
"cold": {
  "min_age": "30d",
  "actions": {
    # The freeze action is deprecated and does nothing on 8.x;
    # mark the index read-only in the cold phase instead
    "readonly": {},
    "set_priority": { "priority": 0 }
  }
},
"delete": {
"min_age": "90d",
"actions": {
"delete": {}
}
}
}
}
}
# 在索引模板中应用 ILM
PUT /_index_template/logs_template
{
"index_patterns": ["logs-*"],
"template": {
"settings": {
"index.lifecycle.name": "logs_policy",
"index.lifecycle.rollover_alias": "logs"
}
}
}
九、集群篇¶
9.1 节点类型¶
节点角色 |
配置 |
说明 |
|---|---|---|
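Master |
node.roles: [master] |
集群管理(选举、节点加入、分片分配) |
Data |
node.roles: [data] |
存储数据、执行查询 |
Data Hot |
node.roles: [data_hot] |
热数据节点(高性能 SSD) |
Data Warm |
node.roles: [data_warm] |
温数据节点(普通磁盘) |
Data Cold |
node.roles: [data_cold] |
冷数据节点(对象存储) |
Coordinating |
node.roles: [] |
协调节点(路由请求,不存数据) |
Ingest |
node.roles: [ingest] |
数据预处理 |
ML |
node.roles: [ml] |
机器学习 |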
9.2 分片策略¶
# 分片数量建议
# - 单分片大小控制在 10~50GB
# - 单节点分片数不超过 20个/GB 堆内存
# - 每节点最大分片数 ≈ 堆内存 GB 数 × 20(如 30GB 堆约 600 个分片)
# 查看分片分布
GET /_cat/shards/my_index?v
# 分片分配控制
PUT /my_index/_settings
{
"index.routing.allocation.require.box_type": "hot", # 只在 hot 节点
"index.routing.allocation.exclude._name": "node-1", # 排除 node-1
"index.number_of_replicas": 1
}
# 手动迁移分片
POST /_cluster/reroute
{
"commands": [{
"move": {
"index": "my_index",
"shard": 0,
"from_node": "node-1",
"to_node": "node-2"
}
}]
}
9.3 集群监控¶
# 集群健康(green/yellow/red)
GET /_cluster/health?pretty
GET /_cluster/health/my_index
# 集群统计
GET /_cluster/stats?pretty
# 节点统计
GET /_nodes/stats?pretty
GET /_nodes/stats/jvm,os,process
# 索引统计
GET /my_index/_stats?pretty
GET /my_index/_stats/indexing,search
# 慢查询日志
PUT /my_index/_settings
{
"index.search.slowlog.threshold.query.warn": "10s",
"index.search.slowlog.threshold.query.info": "5s",
"index.search.slowlog.threshold.fetch.warn": "1s",
"index.indexing.slowlog.threshold.index.warn": "10s"
}
# 热点线程(查看 CPU 热点)
GET /_nodes/hot_threads
# 任务管理
GET /_tasks
GET /_tasks?actions=*search&detailed=true
POST /_tasks/<task_id>/_cancel
十、Python 客户端篇¶
10.1 安装与连接¶
pip install elasticsearch
from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk, scan
# Connect (ES 8.x)
es = Elasticsearch(
    hosts=['http://localhost:9200'],
    # With security enabled, put the scheme in the URL
    # (hosts=['https://localhost:9200']) and pass credentials via basic_auth.
    # The 8.x client deprecates http_auth and no longer takes a separate
    # scheme argument.
    # basic_auth=('elastic', 'password'),
    # verify_certs=False,
    request_timeout=30,
    max_retries=3,
    retry_on_timeout=True,
)
# 验证连接
print(es.ping())
print(es.info())
10.2 索引操作¶
# 创建索引
es.indices.create(
index='articles',
body={
'settings': {
'number_of_shards': 3,
'number_of_replicas': 1,
'analysis': {
'analyzer': {
'ik_analyzer': {'type': 'custom', 'tokenizer': 'ik_max_word'}
}
}
},
'mappings': {
'properties': {
'title': {'type': 'text', 'analyzer': 'ik_max_word'},
'author': {'type': 'keyword'},
'content': {'type': 'text', 'analyzer': 'ik_max_word'},
'tags': {'type': 'keyword'},
'views': {'type': 'integer'},
'publish_at': {'type': 'date', 'format': 'yyyy-MM-dd HH:mm:ss||epoch_millis'},
}
}
}
)
# 检查索引是否存在
es.indices.exists(index='articles')
# Delete the index, tolerating 400/404 responses. The per-request `ignore=`
# keyword is deprecated in the 8.x client; use .options(ignore_status=...) instead
es.options(ignore_status=[400, 404]).indices.delete(index='articles')
# 更新 Mapping
es.indices.put_mapping(index='articles', body={
'properties': {
'new_field': {'type': 'keyword'}
}
})
# 刷新索引(使写入立即可查)
es.indices.refresh(index='articles')
10.3 文档 CRUD¶
from datetime import datetime
# 新增文档
es.index(
index='articles',
id=1,
document={
'title': 'Elasticsearch Python 教程',
'author': 'Alice',
'content': '本文介绍如何使用 Python 操作 Elasticsearch',
'tags': ['Python', 'Elasticsearch'],
'views': 0,
'publish_at': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
}
)
# 获取文档
doc = es.get(index='articles', id=1)
print(doc['_source'])
# 检查文档是否存在
es.exists(index='articles', id=1)
# 更新文档(部分更新)
es.update(
index='articles',
id=1,
doc={'views': 100, 'title': '更新后的标题'}
)
# 脚本更新
es.update(
index='articles',
id=1,
script={
'source': 'ctx._source.views += params.increment',
'lang': 'painless',
'params': {'increment': 1}
}
)
# 删除文档
es.delete(index='articles', id=1)
# 按查询删除
es.delete_by_query(
index='articles',
body={
'query': {'term': {'author': 'spam_user'}}
}
)
10.4 搜索¶
# 基础搜索
result = es.search(
index='articles',
body={
'query': {
'bool': {
'must': [{'match': {'title': 'Elasticsearch'}}],
'filter': [{'term': {'author': 'Alice'}}]
}
},
'highlight': {
'fields': {'title': {}, 'content': {'fragment_size': 150}}
},
'sort': [{'views': 'desc'}, {'_score': 'desc'}],
'from': 0,
'size': 20,
'_source': ['title', 'author', 'views', 'publish_at']
}
)
# Process the search response: total hit count plus one line per hit
total = result['hits']['total']['value']
hits = result['hits']['hits']
for hit in hits:
    doc = hit['_source']
    score = hit['_score']
    # 'highlight' is read with a default because a hit may not carry it
    highlight = hit.get('highlight', {})
    print(f"[{score:.2f}] {doc['title']} - {highlight.get('title', [])}")
print(f"总数: {total}")
# 使用 search_after 翻页
def paginate_with_search_after(es, index, query, size=20):
    """Yield pages of documents from `index` using search_after pagination.

    Args:
        es: Client object exposing ``search(index=..., body=...)``.
        index: Name of the index to search.
        query: Query clause placed under the ``query`` key of the request body.
        size: Number of hits requested per page.

    Yields:
        One list per page, containing the ``_source`` of each hit.
        Iteration stops when a request returns no hits.
    """
    search_after = None
    while True:
        body = {
            'query': query,
            'sort': [{'publish_at': 'desc'}, {'_id': 'asc'}],
            'size': size,
        }
        # The first request has no cursor; later ones resume after the last hit
        if search_after:
            body['search_after'] = search_after
        result = es.search(index=index, body=body)
        hits = result['hits']['hits']
        if not hits:
            break
        yield [h['_source'] for h in hits]
        # The sort values of the last hit become the cursor for the next page
        search_after = hits[-1]['sort']
10.5 聚合¶
result = es.search(
index='articles',
body={
'size': 0,
'aggs': {
'by_author': {
'terms': {'field': 'author', 'size': 10},
'aggs': {
'avg_views': {'avg': {'field': 'views'}},
'top_articles': {
'top_hits': {
'size': 3,
'_source': ['title', 'views'],
'sort': [{'views': 'desc'}]
}
}
}
},
'monthly': {
'date_histogram': {
'field': 'publish_at',
'calendar_interval': 'month',
'format': 'yyyy-MM'
}
}
}
}
)
# Process the aggregation response: one line per author bucket
for bucket in result['aggregations']['by_author']['buckets']:
    author = bucket['key']
    count = bucket['doc_count']
    # Value of the nested avg_views sub-aggregation for this bucket
    avg_views = bucket['avg_views']['value']
    print(f'{author}: {count} 篇文章,平均阅读 {avg_views:.0f}')
10.6 批量操作¶
from elasticsearch.helpers import bulk, streaming_bulk, parallel_bulk
# 方式1:bulk(推荐,一次性)
def generate_actions(data_list):
    """Yield one bulk action dict per item in `data_list`.

    Each item must provide the keys 'id', 'title', 'author' and 'content'.
    Every action targets the 'articles' index, uses the item's 'id' as
    '_id', and carries only title/author/content in '_source'.
    """
    for item in data_list:
        yield {
            '_index': 'articles',
            '_id': item['id'],
            '_source': {
                'title': item['title'],
                'author': item['author'],
                'content': item['content'],
            }
        }
success, failed = bulk(
es,
generate_actions(data_list),
chunk_size=500,
request_timeout=60
)
print(f'成功: {success}, 失败: {len(failed)}')
# Option 2: streaming_bulk (results are consumed one at a time)
for ok, result in streaming_bulk(
    es,
    generate_actions(data_list),
    chunk_size=500,
    raise_on_error=False
):
    # With raise_on_error=False, failures are reported here instead of raising
    if not ok:
        print(f'写入失败: {result}')
# 方式3:parallel_bulk(并行,最快)
from elasticsearch.helpers import parallel_bulk
from collections import deque
deque(
parallel_bulk(es, generate_actions(data_list), thread_count=4),
maxlen=0
)
# Bulk scan (reading a large number of documents)
from elasticsearch.helpers import scan
for doc in scan(
    es,
    index='articles',
    query={'query': {'match_all': {}}},
    scroll='5m',
    size=500
):
    # NOTE(review): `process` is not defined in this document; supply your own handler
    process(doc['_source'])
十一、运维与监控篇¶
11.1 常用运维命令¶
# 集群健康检查
GET /_cluster/health?wait_for_status=green&timeout=30s
# 查看未分配分片原因
GET /_cluster/allocation/explain?pretty
# 手动触发分片分配
POST /_cluster/reroute?retry_failed=true
# 清除缓存
POST /my_index/_cache/clear
POST /_cache/clear
# Flush(将内存中的数据和 translog 落盘持久化,ES 会自动执行,手动调用需慎用,影响性能)
POST /my_index/_flush
# 强制合并(减少 Segment,提高查询速度)
POST /my_index/_forcemerge?max_num_segments=1
# 查看索引磁盘占用
GET /_cat/indices?v&s=store.size:desc
# 快照(备份)
PUT /_snapshot/my_backup
{
"type": "fs",
"settings": { "location": "/backup/elasticsearch" }
}
POST /_snapshot/my_backup/snapshot_1?wait_for_completion=true
{
"indices": "articles,products",
"ignore_unavailable": true
}
# 恢复快照
POST /_snapshot/my_backup/snapshot_1/_restore
{
"indices": "articles",
"rename_pattern": "(.+)",
"rename_replacement": "restored_$1"
}
11.2 JVM 调优¶
# jvm.options
# 堆内存设置(不超过系统内存50%,最大不超过32GB)
-Xms4g
-Xmx4g
# GC 配置(ES 8.x 默认 G1GC)
-XX:+UseG1GC
-XX:G1HeapRegionSize=4m
-XX:InitiatingHeapOccupancyPercent=30
# elasticsearch.yml
cluster.name: my-cluster
node.name: node-1
path.data: /var/data/elasticsearch
path.logs: /var/log/elasticsearch
network.host: 0.0.0.0
http.port: 9200
transport.port: 9300
# 集群发现
discovery.seed_hosts: ["node-1", "node-2", "node-3"]
cluster.initial_master_nodes: ["node-1", "node-2", "node-3"]
# 内存锁定(防止 Swap)
bootstrap.memory_lock: true
十二、部署篇¶
12.1 Docker Compose(ES + Kibana)¶
# docker-compose.yml
# Single-node Elasticsearch plus Kibana; Kibana starts only after the
# Elasticsearch healthcheck passes.
version: '3.8'
services:
  elasticsearch:
    image: elasticsearch:8.12.0
    environment:
      - cluster.name=my-cluster
      - node.name=es-node-1
      - discovery.type=single-node
      - bootstrap.memory_lock=true
      - xpack.security.enabled=false
      - ES_JAVA_OPTS=-Xms2g -Xmx2g
    # Unlimited memlock accompanies bootstrap.memory_lock=true above
    ulimits:
      memlock:
        soft: -1
        hard: -1
    volumes:
      - es_data:/usr/share/elasticsearch/data
    ports:
      - "9200:9200"
      - "9300:9300"
    healthcheck:
      test: ["CMD", "curl", "-f", "http://localhost:9200/_cluster/health"]
      interval: 30s
      timeout: 10s
      retries: 5
  kibana:
    image: kibana:8.12.0
    environment:
      - ELASTICSEARCH_HOSTS=http://elasticsearch:9200
    ports:
      - "5601:5601"
    depends_on:
      elasticsearch:
        condition: service_healthy
volumes:
  es_data: {}
12.2 三节点集群¶
# docker-compose-cluster.yml
# Three Elasticsearch nodes with master and data roles, plus Kibana.
# Only es01 publishes port 9200 to the host.
version: '3.8'
services:
  es01:
    image: elasticsearch:8.12.0
    environment:
      - node.name=es01
      - cluster.name=my-cluster
      - cluster.initial_master_nodes=es01,es02,es03
      - discovery.seed_hosts=es02,es03
      - node.roles=master,data
      - ES_JAVA_OPTS=-Xms2g -Xmx2g
      - xpack.security.enabled=false
    volumes:
      - es01_data:/usr/share/elasticsearch/data
    ports:
      - "9200:9200"
  es02:
    image: elasticsearch:8.12.0
    environment:
      - node.name=es02
      - cluster.name=my-cluster
      - cluster.initial_master_nodes=es01,es02,es03
      - discovery.seed_hosts=es01,es03
      - node.roles=master,data
      - ES_JAVA_OPTS=-Xms2g -Xmx2g
      - xpack.security.enabled=false
    volumes:
      - es02_data:/usr/share/elasticsearch/data
  es03:
    image: elasticsearch:8.12.0
    environment:
      - node.name=es03
      - cluster.name=my-cluster
      - cluster.initial_master_nodes=es01,es02,es03
      - discovery.seed_hosts=es01,es02
      - node.roles=master,data
      - ES_JAVA_OPTS=-Xms2g -Xmx2g
      - xpack.security.enabled=false
    volumes:
      - es03_data:/usr/share/elasticsearch/data
  kibana:
    image: kibana:8.12.0
    environment:
      - ELASTICSEARCH_HOSTS=http://es01:9200
    ports:
      - "5601:5601"
    depends_on:
      - es01
volumes:
  es01_data: {}
  es02_data: {}
  es03_data: {}
常用 DSL 速查¶
# 查询所有
{ "query": { "match_all": {} } }
# 按关键词搜索
{ "query": { "match": { "title": "关键词" } } }
# 精确匹配
{ "query": { "term": { "author": "Alice" } } }
# 多条件 AND
{ "query": { "bool": { "must": [ {...}, {...} ] } } }
# 过滤(无评分)
{ "query": { "bool": { "filter": [ {...} ] } } }
# 范围
{ "query": { "range": { "views": { "gte": 100 } } } }
# 排序 + 分页
{ "sort": [{ "views": "desc" }], "from": 0, "size": 20 }
# 指定返回字段
{ "_source": ["title", "author"] }
# 聚合
{ "size": 0, "aggs": { "名称": { "terms": { "field": "category" } } } }