0


玩转 StarRocks on ES 全文检索

部署ES

sr@cs02:~/app/elasticsearch-7.16.2$grep -v ^# config/elasticsearch.yml
node.name: node-2
network.host: 172.26.194.185
cluster.initial_master_nodes: ["node-2"]
  • 系统参数

vm.max_map_count = 655360
sr@cs02:~/app/elasticsearch-7.16.2$sudo vim /etc/sysctl.conf
sr@cs02:~/app/elasticsearch-7.16.2$sudo sysctl -p
vm.swappiness = 0
kernel.sysrq = 1
net.ipv4.neigh.default.gc_stale_time = 120
net.ipv4.conf.all.rp_filter = 0
net.ipv4.conf.default.rp_filter = 0
net.ipv4.conf.default.arp_announce = 2
net.ipv4.conf.lo.arp_announce = 2
net.ipv4.conf.all.arp_announce = 2
net.ipv4.tcp_max_tw_buckets = 5000
net.ipv4.tcp_syncookies = 1
net.ipv4.tcp_max_syn_backlog = 1024
net.ipv4.tcp_synack_retries = 2
vm.max_map_count = 655360
  • 启动

sr@cs02:~/app/elasticsearch-7.16.2$bin/elasticsearch -d
  • Verify

sr@cs02:~/app/elasticsearch-7.16.2$sudo netstat -lnpt |grep 9[2-3]00
tcp6       0      0 172.26.194.185:9200     :::*                    LISTEN      10442/java
tcp6       0      0 172.26.194.185:9300     :::*                    LISTEN      10442/java

SR 外表

  • 测试2:全文检索

插件

wget https://github.com/medcl/elasticsearch-analysis-ik/releases/download/v7.16.2/elasticsearch-analysis-ik-7.16.2.zip
-- 解压到 es/plugin
-- 重启es
  • ES

curl -sH "Content-Type: application/json" -XPUT "cs02:9200/wa"  -d'
{
  "mappings": {

      "properties": {
        "allcol":{
          "analyzer": "ik_max_word",
          "search_analyzer": "ik_smart",
          "type": "text"
        },
        "name": {
          "type": "text",
          "analyzer": "ik_max_word",
          "search_analyzer": "ik_smart",          
           "copy_to" : [          
           "allcol"          
           ]
        },
        "site": {
          "type": "text",
          "analyzer": "ik_max_word",
          "search_analyzer": "ik_smart",          
           "copy_to" : [          
           "allcol"          
           ]
        },
        "comment": {
          "type": "text",
          "analyzer": "ik_max_word",
          "search_analyzer": "ik_smart",          
           "copy_to" : [          
           "allcol"          
           ]
        }
    }
  },
  "settings": {
    "index": {
      "number_of_replicas": "0",
      "number_of_shards": "1"
    }
  }
}
'
  • Post
curl -XPOST "http://cs02:9200/_bulk" -H 'Content-Type: application/json' -d'
{"index":{"_index":"wa"}}
{ "name": "老谢", "site": "www.starrocks.com","comment": "StarRocks极速MPP数据库"}
{"index":{"_index":"wa"}}
{ "name": "Simon", "site": "www.baidu.com","comment": "Simon accessed www.baidu.com"}
{"index":{"_index":"wa"}}
{ "name": "张三", "site": "www.google.com","comment": "今天天气不错"}
{"index":{"_index":"wa"}}
{ "name": "李四", "site": "docs.starrocks.com","comment": "李四比较懒,啥也没说"}
'
  • Search
sr@cs01:~$curl -s -XGET cs02:9200/wa/_search?pretty
{
  "took" : 818,
  "timed_out" : false,
  "_shards" : {
    "total" : 1,
    "successful" : 1,
    "skipped" : 0,
    "failed" : 0
  },
  "hits" : {
    "total" : {
      "value" : 4,
      "relation" : "eq"
    },
    "max_score" : 1.0,
    "hits" : [
      {
        "_index" : "wa",
        "_type" : "_doc",
        "_id" : "w27USX4B8p-MWLQe7Rj7",
        "_score" : 1.0,
        "_source" : {
          "name" : "老谢",
          "site" : "www.starrocks.com",
          "comment" : "StarRocks极速MPP数据库"
        }
      },
      {
        "_index" : "wa",
        "_type" : "_doc",
        "_id" : "xG7USX4B8p-MWLQe7Rj7",
        "_score" : 1.0,
        "_source" : {
          "name" : "Simon",
          "site" : "www.baidu.com",
          "comment" : "Simon accessed www.baidu.com"
        }
      },
      {
        "_index" : "wa",
        "_type" : "_doc",
        "_id" : "xW7USX4B8p-MWLQe7Rj7",
        "_score" : 1.0,
        "_source" : {
          "name" : "张三",
          "site" : "www.google.com",
          "comment" : "今天天气不错"
        }
      },
      {
        "_index" : "wa",
        "_type" : "_doc",
        "_id" : "xm7USX4B8p-MWLQe7Rj7",
        "_score" : 1.0,
        "_source" : {
          "name" : "李四",
          "site" : "docs.starrocks.com",
          "comment" : "李四比较懒,啥也没说"
        }
      }
    ]
  }
}

curl -sH "Content-Type: application/json" -XGET cs02:9200/wa/_search?pretty -d '
{
      "query":{
          "term":{
          "allcol":
              "比较" }
          }
 }'

sr@cs01:~/soft$curl -sH "Content-Type: application/json" -XGET cs02:9200/wa/_search?pretty -d '
> {
>       "query":{
>           "term":{
>           "allcol":
>               "比较" }
>           }
>  }'
{
  "took" : 3,
  "timed_out" : false,
  "_shards" : {
    "total" : 1,
    "successful" : 1,
    "skipped" : 0,
    "failed" : 0
  },
  "hits" : {
    "total" : {
      "value" : 1,
      "relation" : "eq"
    },
    "max_score" : 1.1429836,
    "hits" : [
      {
        "_index" : "wa",
        "_type" : "_doc",
        "_id" : "ImUCSn4BHW9CsLnJXeFp",
        "_score" : 1.1429836,
        "_source" : {
          "name" : "李四",
          "site" : "docs.starrocks.com",
          "comment" : "李四比较懒,啥也没说"
        }
      }
    ]
  }
}
  • SR

USE simon;

-- 全文检索
CREATE EXTERNAL TABLE `soe_t2` (
 `allcol` varchar(65530) NULL COMMENT "全文检索字段",
 `name` varchar(65530) NULL COMMENT "",
 `site` varchar(65530) NULL COMMENT "",
 `comment` varchar(65530) NULL COMMENT ""
) ENGINE=ELASTICSEARCH
COMMENT "ELASTICSEARCH"
PROPERTIES (
"hosts" = "cs02:9200",
"index" = "wa",
"type" = "_doc",
"transport" = "http",
"enable_docvalue_scan" = "true",
"max_docvalue_fields" = "20",
"enable_keyword_sniff" = "true"
);
mysql> select * from soe_t2;
+--------+--------+--------------------+--------------------------------+
| allcol | name   | site               | comment                        |
+--------+--------+--------------------+--------------------------------+
| NULL   | 老谢   | www.starrocks.com  | StarRocks极速MPP数据库            |
| NULL   | Simon  | www.baidu.com      | Simon accessed www.baidu.com   |
| NULL   | 张三   | www.google.com     | 今天天气不错                   |
| NULL   | 李四   | docs.starrocks.com | 李四比较懒,啥也没说           |
+--------+--------+--------------------+--------------------------------+
4 rows in set (0.01 sec)
  • NOTE: 如果用standard分词器,英文的可以查任意字段,但是中文不行
-- www未按预期分词
mysql> select * from soe_t2 where allcol = 'www';
Empty set (0.01 sec)

-- 标准分词器,中文不大行
mysql> select * from soe_t2 where allcol = '张三';
Empty set (0.01 sec)

mysql> select * from soe_t2 where allcol = 'simon';
+--------+-------+---------------+------------------------------+
| allcol | name  | site          | comment                      |
+--------+-------+---------------+------------------------------+
| NULL   | Simon | www.baidu.com | Simon accessed www.baidu.com |
+--------+-------+---------------+------------------------------+
1 row in set (0.33 sec)

mysql> select * from soe_t2 where allcol = 'baidu';
Empty set (0.01 sec)

mysql> select * from soe_t2 where allcol = 'accessed';
+--------+-------+---------------+------------------------------+
| allcol | name  | site          | comment                      |
+--------+-------+---------------+------------------------------+
| NULL   | Simon | www.baidu.com | Simon accessed www.baidu.com |
+--------+-------+---------------+------------------------------+
1 row in set (0.01 sec)

-- docs.starrocks.com, 没有按预期的分词出 docs
mysql> select * from soe_t2 where allcol = 'docs';
Empty set (0.01 sec)

mysql> select * from soe_t2 where allcol = 'docs.starrocks.com';
+--------+--------+--------------------+--------------------------------+
| allcol | name   | site               | comment                        |
+--------+--------+--------------------+--------------------------------+
| NULL   | 李四   | docs.starrocks.com | 李四比较懒,啥也没说           |
+--------+--------+--------------------+--------------------------------+
1 row in set (0.01 sec)
  • 使用IK分词器,SR on ES利用copy_to allcol字段,实现全文检索效果!
mysql> select * from soe_t2 where allcol = 'docs';
+--------+--------+--------------------+--------------------------------+
| allcol | name   | site               | comment                        |
+--------+--------+--------------------+--------------------------------+
| NULL   | 李四   | docs.starrocks.com | 李四比较懒,啥也没说           |
+--------+--------+--------------------+--------------------------------+
1 row in set (0.01 sec)

mysql> select * from soe_t2 where allcol = 'baidu';
+--------+-------+---------------+------------------------------+
| allcol | name  | site          | comment                      |
+--------+-------+---------------+------------------------------+
| NULL   | Simon | www.baidu.com | Simon accessed www.baidu.com |
+--------+-------+---------------+------------------------------+
1 row in set (0.02 sec)

mysql> select * from soe_t2 where allcol = '张三';
+--------+--------+----------------+--------------------+
| allcol | name   | site           | comment            |
+--------+--------+----------------+--------------------+
| NULL   | 张三   | www.google.com | 今天天气不错       |
+--------+--------+----------------+--------------------+
1 row in set (0.01 sec)

mysql> select * from soe_t2 where allcol = 'www';
+--------+--------+-------------------+------------------------------+
| allcol | name   | site              | comment                      |
+--------+--------+-------------------+------------------------------+
| NULL   | 老谢   | www.starrocks.com | StarRocks极速MPP数据库          |
| NULL   | Simon  | www.baidu.com     | Simon accessed www.baidu.com |
| NULL   | 张三   | www.google.com    | 今天天气不错                 |
+--------+--------+-------------------+------------------------------+
3 rows in set (0.75 sec)

上一篇:

玩转StarRocks on ES-1-分词


本文转载自: https://blog.csdn.net/u013836748/article/details/122634509
版权归原作者 八寅老爷 所有, 如有侵权,请联系我们删除。

“玩转 StarRocks on ES 全文检索”的评论:

还没有评论