一、测试环境

名称
cpu12th Gen Intel® Core™ i7-12700H
操作系统CentOS Linux release 7.9.2009 (Core)
内存3G
逻辑核数2
Gbase-8a数据库版本9.5.3.27
Gbase-8a-fulltext版本9.5.3.27.6

二、全文检索-安装步骤

1、解压

[gbase@czg0 pkg]$ tar -xvf GBase8a_MPP_Cluster-NoLicense-

[gbase@czg0 pkg]$ cd gcinstall_fulltext/

[gbase@czg0 gcinstall_fulltext]$ ll
总用量 22800
-rw-r--r-- 1 gbase gbase      416 2月  15 2022 BUILDINFO
-rw-r--r-- 1 gbase gbase     2345 2月  15 2022 CGConfigChecker.py
-rwxr-xr-x 1 gbase gbase     3759 215 2022 chkLicense.py
-rwxr-xr-x 1 gbase gbase     5214 215 2022 CorosyncConf.py
-rw-r--r-- 1 gbase gbase      466 2月  15 2022 demo.options
-rw-r--r-- 1 gbase gbase      170 2月  15 2022 dependRpms
-rw-r--r-- 1 gbase gbase      684 2月  15 2022 example.xml
-rwxr-xr-x 1 gbase gbase      464 215 2022 extendCfg.xml
-rw-r--r-- 1 gbase gbase      781 2月  15 2022 FileCheck.py
-rw-r--r-- 1 gbase gbase     2541 2月  15 2022 fulltext.py
-rw-r--r-- 1 gbase gbase 17811691 2月  15 2022 fulltext.tar.bz2
-rw-r--r-- 1 gbase gbase  4818440 2月  15 2022 gbase_data_timezone.sql
-rwxr-xr-x 1 gbase gbase     4264 215 2022 gccopy.py
-rwxr-xr-x 1 gbase gbase     4462 215 2022 gcexec.py
-rwxr-xr-x 1 gbase gbase     8131 215 2022 gcinstall_fulltext.py
-rw-r--r-- 1 gbase gbase      294 2月  15 2022 gcwareGroup.json
-rwxr-xr-x 1 gbase gbase     5186 215 2022 getesn.py
-rwxr-xr-x 1 gbase gbase     3689 215 2022 GetOSType.py
-rw-r--r-- 1 gbase gbase   230902 2月  15 2022 InstallFuns.py
drwxr-xr-x 2 gbase gbase       49 215 2022 installLicense
-rw-r--r-- 1 gbase gbase   135410 2月  15 2022 InstallTar.py
-rw-r--r-- 1 gbase gbase     1114 2月  15 2022 license.txt
-rwxr-xr-x 1 gbase gbase    75990 215 2022 pexpect.py
-rwxr-xr-x 1 gbase gbase    24258 215 2022 replaceStop.py
-rw-r--r-- 1 gbase gbase     6415 2月  15 2022 RestoreLocal.py
-rw-r--r-- 1 gbase gbase     8777 2月  15 2022 rmt.py
-rw-r--r-- 1 gbase gbase      299 2月  15 2022 rootPwd.json
-rwxr-xr-x 1 gbase gbase    29423 215 2022 SetSysEnv.py
-rw-r--r-- 1 gbase gbase     2615 2月  15 2022 SSHThread.py
-rwxr-xr-x 1 gbase gbase     6937 215 2022 unInstall_fulltext.py
-rw-r--r-- 1 gbase gbase    77247 2月  15 2022 UpdateSysTable.json

2、配置demo.options

[gbase@czg0 gcinstall_fulltext]$ cat demo.options 
installPrefix= /opt
coordinateHost = 192.168.142.10,192.168.142.11,192.168.142.12
coordinateHostNodeID = 234,235,237
dataHost = 192.168.142.10,192.168.142.11,192.168.142.12
#existCoordinateHost =
#existDataHost =
#existGcwareHost=
gcwareHost = 192.168.142.10,192.168.142.11,192.168.142.12
gcwareHostNodeID = 234,235,237
dbaUser = gbase
dbaGroup = gbase
dbaPwd = 'gbase'
rootPwd = 'qwer1234'
#dbRootPwd = ''
#rootPwdFile = rootPwd.json
#characterSet = utf8
#sshPort = 22

3、停服务

每个节点执行

[gbase@czg2 ~]$ gcluster_services all stop
Stopping gcrecover :                                       [  OK  ]
Stopping gcluster :                                        [  OK  ]
Stopping gbase :                                           [  OK  ]
Stopping syncserver :                                      [  OK  ]

[gbase@czg0 gcinstall_fulltext]$ gcware_services all stop
Stopping GCWareMonit success!
Stopping gcware :                                          [  OK  ]

4、安装

[gbase@czg0 gcinstall_fulltext]$ ./gcinstall_fulltext.py --dbaUserPwd=gbase
CoordinateHost:
192.168.142.10    192.168.142.11    192.168.142.12
DataHost:
192.168.142.11    192.168.142.10    192.168.142.12
Are you sure to install fulltext on these gcluster nodes. ([Y,y]/[N,n])? y
192.168.142.12  Install fulltext successfully.
192.168.142.11  Install fulltext successfully.
192.168.142.10  Install fulltext successfully.

5、启服务

[gbase@czg0 gcinstall_fulltext]$ gcluster_services all start
Starting gbase :                                           [  OK  ]
Starting syncserver :                                      [  OK  ]
Starting gcluster :                                        [  OK  ]
Starting gcrecover :                                       [  OK  ]

[gbase@czg0 gcinstall_fulltext]$ gcware_services all start
Starting gcware :                                          [  OK  ]
Starting GCWareMonit success!

三、测试数据生成

gbase> CREATE TABLE TB_LIKE (A INT,B VARCHAR(100)); 
Query OK, 0 rows affected, 1 warning (Elapsed: 00:00:00.19)

gbase> drop procedure if exists "GenerateTestData";
Query OK, 0 rows affected, 1 warning (Elapsed: 00:00:00.21)

gbase> DELIMITER //
gbase> CREATE PROCEDURE "GenerateTestData"(num int)
    -> begin
    ->     declare tempval int;
    ->     
    ->     set tempval = 1;
    ->     set autocommit = off;
    ->     label: loop
    ->         insert into TB_LIKE  values(tempval,tempval||'太阳光'||tempval);
    ->         if tempval >= num then 
    ->             leave label;
    ->         else
    ->             set tempval = tempval + 1;
    ->         end if;
    ->     end loop label;
    ->     commit;
    -> end;//
Query OK, 0 rows affected, 1 warning (Elapsed: 00:00:00.02)

gbase>  call "GenerateTestData"(1000000);//                                                                  
Query OK, 0 rows affected (Elapsed: 00:00:43.49)

gbase> select count(*) from TB_LIKE;//
+----------+
| count(*) |
+----------+
|  1000000 |
+----------+
1 row in set (Elapsed: 00:00:00.11)

gbase> select * from TB_LIKE limit 10;//       
+------+---------------+
| A    | B             |
+------+---------------+
|    1 | 1太阳光1      |
|    2 | 2太阳光2      |
|    3 | 3太阳光3      |
|    4 | 4太阳光4      |
|    5 | 5太阳光5      |
|    6 | 6太阳光6      |
|    7 | 7太阳光7      |
|    8 | 8太阳光8      |
|    9 | 9太阳光9      |
|   10 | 10太阳光10    |
+------+---------------+
10 rows in set (Elapsed: 00:00:00.01)

gbase> insert into tb_like values(1,'我是小太太啦啦啦阳'); 
Query OK, 1 row affected (Elapsed: 00:00:00.00)

gbase> commit;                                            
Query OK, 0 rows affected (Elapsed: 00:00:00.17)

gbase> create fulltext index ft_index on tb_like(b);  
Query OK, 0 rows affected, 1 warning (Elapsed: 00:00:00.15)
Records: 0  Duplicates: 0  Warnings: 0

gbase> update index ft_index on tb_like;              
Query OK, 1000001 rows affected, 1 warning (Elapsed: 00:00:17.72)

四、实验

参考链接:GBase 8a全文索引提高模糊查询性能的使用样例

1、双引号精确匹配

gbase> select count(*) from tb_like where contains(b,'太阳');      
+----------+
| count(*) |
+----------+
|  1000001 |
+----------+
1 row in set (Elapsed: 00:00:00.86)

没有加双引号,是分开匹配的,只要包含这两个字就可以,所以把’我是小太太啦啦啦阳’也匹配出来。

gbase> select count(*) from tb_like where contains(b,'"太阳"');
+----------+
| count(*) |
+----------+
|  1000000 |
+----------+
1 row in set (Elapsed: 00:00:00.21)

我们加上双引号,太阳是一个组合,不能分开匹配,所以过滤掉了’我是小太太啦啦啦阳’。

2、等同于左右百分号的like的查询

(1)测试中文
gbase> select count(*) from tb_like where contains(b,'啦');    
+----------+
| count(*) |
+----------+
|        1 |
+----------+
1 row in set (Elapsed: 00:00:00.01)

gbase> select count(*) from tb_like where contains(b,'啦阳');  
+----------+
| count(*) |
+----------+
|        1 |
+----------+
1 row in set (Elapsed: 00:00:00.04)
(2)测试英文*
gbase> insert into tb_like values(1,'My Name Is Sun');         
Query OK, 1 row affected (Elapsed: 00:00:00.00)

gbase> update index ft_index on tb_like;    
Query OK, 1 row affected, 1 warning (Elapsed: 00:00:01.20)

加入测试数据

gbase> select  * from tb_like where contains(b,'"Is Sun"');
+------+----------------+
| A    | B              |
+------+----------------+
|    1 | My Name Is Sun |
+------+----------------+
1 row in set (Elapsed: 00:00:00.01)

gbase> select  * from tb_like where contains(b,'"IS Sun"');
+------+----------------+
| A    | B              |
+------+----------------+
|    1 | My Name Is Sun |
+------+----------------+
1 row in set (Elapsed: 00:00:00.02)

第二个IS是大写,发现也能匹配上,可能内部是全部转换成大写或小写,再去匹配的。

gbase> select * from czg.tb_like where b like '%s%';   
+------+----------------+
| A    | B              |
+------+----------------+
|    1 | My Name Is Sun |
+------+----------------+
1 row in set (Elapsed: 00:00:00.10)

gbase> select * from czg.tb_like where contains(b,'s');
Empty set (Elapsed: 00:00:00.03)

gbase> select * from czg.tb_like where contains(b,'Is');
+------+----------------+
| A    | B              |
+------+----------------+
|    1 | My Name Is Sun |
+------+----------------+
1 row in set (Elapsed: 00:00:00.03)

英文字母一个,全文检索不出来。
英文字母两个以上,测试正常。

(3)测试数字**
gbase> select  * from tb_like where contains(b,'1');   
+------+-------------+
| A    | B           |
+------+-------------+
|    1 | 1太阳光1    |
+------+-------------+
1 row in set (Elapsed: 00:00:00.01)

gbase> select  count(*) from tb_like where contains(b,'1');
+----------+
| count(*) |
+----------+
|        1 |
+----------+
1 row in set (Elapsed: 00:00:00.02)

gbase> select  count(*) from tb_like where b like'%1%';    
+----------+
| count(*) |
+----------+
|   468560 |
+----------+
1 row in set (Elapsed: 00:00:00.04)

一个数字不对,没有全部匹配出来

gbase> select  count(*) from tb_like where b like'%1111%';
+----------+
| count(*) |
+----------+
|      280 |
+----------+
1 row in set (Elapsed: 00:00:00.06)

gbase> select  count(*) from tb_like where contains(b,'1111');
+----------+
| count(*) |
+----------+
|        1 |
+----------+
1 row in set (Elapsed: 00:00:00.02)

四个数字不对,没有全部匹配出来

3、开头标记^,等同右百分号的like查询

gbase> select * from czg.tb_like where b like 'My%';    
+------+----------------+
| A    | B              |
+------+----------------+
|    1 | My Name Is Sun |
+------+----------------+
1 row in set (Elapsed: 00:00:00.04)

gbase> select * from czg.tb_like where contains(b,'^My');
+------+----------------+
| A    | B              |
+------+----------------+
|    1 | My Name Is Sun |
+------+----------------+
1 row in set (Elapsed: 00:00:00.04)

4、结尾标记$,等同左百分号的like查询

gbase> select * from czg.tb_like where b like '%Sun';    
+------+----------------+
| A    | B              |
+------+----------------+
|    1 | My Name Is Sun |
+------+----------------+
1 row in set (Elapsed: 00:00:00.07)

gbase> select * from czg.tb_like where contains(b,'Sun$');
+------+----------------+
| A    | B              |
+------+----------------+
|    1 | My Name Is Sun |
+------+----------------+
1 row in set (Elapsed: 00:00:00.03)

5、无双引号的空格属于隐式AND

gbase> select * from czg.tb_like where contains(b,'My Sun');
+------+----------------+
| A    | B              |
+------+----------------+
|    1 | My Name Is Sun |
+------+----------------+
1 row in set (Elapsed: 00:00:00.03)

gbase> select * from czg.tb_like where contains(b,'"My Sun"');
Empty set (Elapsed: 00:00:00.03)

6、“与”AND标记&, 多个like and

gbase> select * from czg.tb_like where contains(b,'"My"&"Sun"&"Name"');
+------+----------------+
| A    | B              |
+------+----------------+
|    1 | My Name Is Sun |
+------+----------------+
1 row in set (Elapsed: 00:00:00.03)

gbase> select * from czg.tb_like where contains(b,'"My"&"Sun"&"Name"&"太阳"');
Empty set (Elapsed: 00:00:00.18)

7、“或”OR标记|,多个like OR

gbase> insert into tb_like values(1,'我是小月亮');         
Query OK, 1 row affected (Elapsed: 00:00:00.00)

gbase> commit;
Query OK, 0 rows affected (Elapsed: 00:00:00.07)

gbase> update index ft_index on tb_like;           
Query OK, 1 row affected, 1 warning (Elapsed: 00:00:01.13)

gbase> commit;
Query OK, 0 rows affected (Elapsed: 00:00:00.01)

测试数据生成。

gbase> select * from tb_like where contains(b,'"小太太"|"小月亮"')
    -> ;
+------+-----------------------------+
| A    | B                           |
+------+-----------------------------+
|    1 | 我是小太太啦啦啦阳          |
|    1 | 我是小月亮                  |
+------+-----------------------------+
2 rows in set (Elapsed: 00:00:00.60)

8、”非“NOT标记-, not like

包含”太“或”小月亮“, 但不包含”太阳“的。
默认优先级 或 > 非 > 与。

gbase> select * from tb_like where contains(b,'"太"|"小月亮"-"太阳"');
+------+-----------------------------+
| A    | B                           |
+------+-----------------------------+
|    1 | 我是小太太啦啦啦阳          |
|    1 | 我是小月亮                  |
+------+-----------------------------+
2 rows in set (Elapsed: 00:00:00.17)

9、使用小括号分组来改变匹配顺序

包含“太太”或包含“小“但不能包含“太阳”,把括号来改变执行顺序。

gbase> select * from tb_like where contains(b,'"太太"|("小"-"太阳")');
+------+-----------------------------+
| A    | B                           |
+------+-----------------------------+
|    1 | 我是小太太啦啦啦阳          |
|    1 | 我是小月亮                  |
+------+-----------------------------+
2 rows in set (Elapsed: 00:00:00.18)

10、阀值匹配符‘/’

三个字中满足两个,返回结果。
注意:是单个字,而不是词,如果写词,也会被拆分开分字。

gbase> select * from tb_like where contains(b,'"小 阳 亮"/2',1);          
+------+-----------------------------+
| A    | B                           |
+------+-----------------------------+
|    1 | 我是小太太啦啦啦阳          |
|    1 | 我是小月亮                  |
+------+-----------------------------+
2 rows in set (Elapsed: 00:00:00.08)

三个字中满足三个,返回结果。

gbase> select * from tb_like where contains(b,'"小 阳 亮"/3',1);
Empty set (Elapsed: 00:00:00.04)
Logo

开放原子开发者工作坊旨在鼓励更多人参与开源活动,与志同道合的开发者们相互交流开发经验、分享开发心得、获取前沿技术趋势。工作坊有多种形式的开发者活动,如meetup、训练营等,主打技术交流,干货满满,真诚地邀请各位开发者共同参与!

更多推荐