本文最后更新于 1123 天前,其中的信息可能已经有所发展或是发生改变。
数据准备
详情参考《tmdb_5000_movies.csv》
分词器 (analyze)
分词过程:字符过滤器 (character filter) → 分词器 (tokenizer) → 词元过滤器 (token filter)
标准分词器 (standard)
具体实现:去除特殊符号和标点 → 空格分词 → 变小写
//text默认使用标准分词器
PUT /movie/_doc/1
{
"name": "Eating an apple a day & keeps the doctor away"
}
英文分词器 (english)
具体实现:去除特殊符号和停用词(the 等)→ 空格分词 → 词干转化(去除复数,现在进行时等)
PUT /movie
{
"settings": {
"number_of_shards": 1,
"number_of_replicas": 1
},
"mappings": {
"properties": {
"name": {"type": "text", "analyzer": "english", "search_analyzer": "standard"}
}
}
}
中文分词器
具体实现:去除特殊符号和停用词(stopword)等 → 词库词典匹配 → 词干转化(不排除中文夹带英文)
PUT /movie
{
"settings": {
"number_of_shards": 1,
"number_of_replicas": 1
},
"mappings": {
"properties": {
"name": {"type": "text", "analyzer": "ik_max_word"}
}
}
}
安装 IK 分词器
bin/elasticsearch-plugin install https://github.com/medcl/elasticsearch-analysis-ik/releases/download/v7.13.4/elasticsearch-analysis-ik-7.13.4.zip #建议版本要和elasticsearch保持一致
#离线安装只需将下载好的zip文件解压到plugins目录下的独立子目录(如plugins/ik)即可,解压后需重启elasticsearch
IK 分词法 (贪心算法)
| 分词法 | 说明 |
|---|---|
| ik_smart | 智能分词法,尽可能少分词 |
| ik_max_word | 最大化分词,尽可能多分词 |
IK 词库
cd config/analysis-ik
# stopword.dic 英文stopword
# extra_stopword.dic 中文stopword
# main.dic 主词库(汉语词典,成语等)
# extra_main.dic 额外的词库(学校名,流行语等)
分析关键字
//指定分词器
GET _analyze?pretty
{
"analyzer": "ik_smart",
"text": "中华人民共和国国歌"
}
//指定字段
GET /movie/_analyze
{
"field": "name",
"text": "Eating an apple a day & keeps the doctor away"
}
match & term
//match会对查询关键字进行分词,再去字段中进行匹配
GET /movie/_search
{
"query": {
"match": {
"title": "steven zissou"
}
}
}
//term不会分词
GET /movie/_search
{
"query": {
"term": {
"title": "steven zissou"
}
}
}
短语查询 (match_phrase)
//与match类似,短语查询也会对查询关键字进行分词,再去字段中进行匹配。但短语查询只保留包含所有搜索词条的文档,且词条位置还要邻接。
GET /movie/_search
{
"query": {
"match_phrase": {
"title": "basketball with cartoom aliens"
}
}
}
or & and (operator)
//match默认使用or
GET /movie/_search
{
"query": {
"match": {
"title": "basketball with cartoom aliens"
}
}
}
//改用and
GET /movie/_search
{
"query": {
"match": {
"title": {
"query": "basketball with cartoom aliens",
"operator": "and"
}
}
}
}
最小词匹配项 (minimum_should_match)
GET /movie/_search
{
"query": {
"match": {
"title": {
"query": "basketball with cartoom aliens",
"operator": "or", //minimum_should_match只在or下生效;使用and时所有词条都必须匹配
"minimum_should_match": 2 //分词后,最起码匹配2个
}
}
}
}
多字段查询 (multi_match)
GET /movie/_search
{
"query": {
"multi_match": {
"query": "basketball with cartoom aliens",
"fields": ["title", "overview"]
}
}
}
得分算法 (TF-IDF)
TF:关键词在字段中出现的次数
IDF:逆文档频率,对包含关键词的文档数取倒数(包含该词的文档越多,得分越低)
tfnorm:词频归一化
分析得分情况 (explain)
GET /movie/_search
{
"explain": true,
"query": {
"match": {"title": "steve"}
}
}
优化多字段查询
//max of 以各字段最大得分为文档得分
GET /movie/_search
{
"query": {
"multi_match": {
"query": "basketball with cartoom aliens",
"fields": ["title^10", "overview"], //title字段得分乘以10,权重更大
"tie_breaker": 0.3 //取值在0-1之间,文档得分情况:(各字段最大得分) + (其它字段得分总和) * 0.3
}
}
}
得分方式 (type)
GET /movie/_search
{
"query": {
"multi_match": {
"query": "basketball with cartoom aliens",
"fields": ["title", "overview"],
"type": "best_fields"
}
}
}
| 类型 | 说明 |
|---|---|
| best_fields | 默认,取字段最高分作为文档得分 |
| most_fields | 将所有字段得分相加作为文档得分 |
| cross_fields | 将每个词在所有字段的最高分相加作为文档得分,以词为单位计算得分 |
分析得分方式
GET /movie/_validate/query?explain
{
"query": {
"multi_match": {
"query": "basketball with cartoom aliens",
"fields": ["title", "overview"],
"type": "best_fields"
}
}
}
条件查询 (bool)
//sum of 以各字段得分总和为文档得分
GET /movie/_search
{
"query": {
"bool": {
//must 都为true
//must_not 都为false
//should 只需一个为true
"should": [
{"match": {"title": "basketball with cartoom aliens"}},
{"match": {"overview": "basketball with cartoom aliens"}}
]
}
}
}
字符串查询 (query_string)
GET /movie/_search
{
"query": {
"query_string": {
"query": "steve AND jobs", //AND OR NOT
"fields": ["title"]
}
}
}
查询过滤 (filter)
单条件过滤
GET /movie/_search
{
"query": {
"bool": {
"filter": {
"term": {"title": "steve"}
}
}
}
}
多条件过滤
GET /movie/_search
{
"query": {
"bool": {
"filter": [
{"match": {"title": "steve"}},
{"term": {"cast.name": "gaspard"}},
{"range": {"release_date": {"lte": "2015/01/01"}}},
{"range": {"popularity": {"gte": 25}}}
]
}
}
}
带得分的过滤
GET /movie/_search
{
"query": {
"bool": {
"should": [
{"match": {"title": "life"}}
],
"filter": [
{"match": {"title": "steve"}},
{"term": {"cast.name": "gaspard"}},
{"range": {"release_date": {"lte": "2015/01/01"}}},
{"range": {"popularity": {"gte": 25}}}
]
}
}
}
自定义得分算法 (function_score)
GET /movie/_search
{
"query": {
"function_score": {
"query": {
"multi_match": {
"query": "steve job",
"fields": ["title", "overview"],
"operator": "or",
"type": "most_fields"
}
},
"functions": [
{
"field_value_factor": {
"field": "popularity", //指定字段
"modifier": "log2p",
"factor": 10
}
}
],
//multiply:默认相乘,sum:相加,replace:忽略
"score_mode": "multiply", //functions得分相乘
"boost_mode": "sum" //functions计算的得分和query查询得分相加
}
}
}
自定义字段 (script_fields)
GET /shop/_search
{
"query": {
"match": {"name": "炸鸡"}
},
"_source": "*", //列出所有字段
"script_fields": {
"distance": { //自定义字段
"script": {
"source": "haversin(lat,lon,doc['location'].lat,doc['location'].lon)", //haversin是elasticsearch内置的函数,用来计算经纬度距离(单位:公里)
"lang": "expression", //表达式
"params": {"lat": 39.90, "lon": 116.38} //入参
}
}
}
}
地理位置排序 (geo_distance)
GET /shop/_search
{
"query": {
"match": {"name": "炸鸡"}
},
"_source": "*", //列出所有字段
"sort": [
{
"_geo_distance": {
"location": { //指定目标经纬度字段
"lat": 39.90, //当前纬度
"lon": 116.38 //当前经度
},
"order": "asc",
"unit": "km", //单位
"distance_type": "arc" //弧形
}
}
]
}