ik分词器采用MySQL热更新

2023/8/18 23:22:56

本文主要是介绍ik分词器采用MySQL热更新,对大家解决编程问题具有一定的参考价值,需要的程序猿们随着小编来一起学习吧!

ik分词器采用MySQL热更新

​ 官方所给的IK分词器只支持远程文本文件热更新,不支持采用MySQL热更新,没关系,这难不倒伟大的博主,给哈哈哈。今天就来和大家讲一下如何采用MySQL做热更新IK分词器的词库。

一、建立数据库表

CREATE TABLE `es_extra_main`
(
    `id`          int(11) NOT NULL AUTO_INCREMENT COMMENT '主键',
    `word`        varchar(255) CHARACTER SET utf8mb4 NOT NULL COMMENT '词',
    `is_deleted`  tinyint(1) NOT NULL DEFAULT '0' COMMENT '是否已删除',
    `update_time` timestamp(6)                       NOT NULL DEFAULT CURRENT_TIMESTAMP(6) ON UPDATE CURRENT_TIMESTAMP (6) COMMENT '更新时间',
    PRIMARY KEY (`id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;


CREATE TABLE `es_extra_stopword`
(
    `id`          int(11) NOT NULL AUTO_INCREMENT COMMENT '主键',
    `word`        varchar(255) CHARACTER SET utf8mb4 NOT NULL COMMENT '词',
    `is_deleted`  tinyint(1) NOT NULL DEFAULT '0' COMMENT '是否已删除',
    `update_time` timestamp(6)                       NOT NULL DEFAULT CURRENT_TIMESTAMP(6) ON UPDATE CURRENT_TIMESTAMP (6) COMMENT '更新时间',
    PRIMARY KEY (`id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;

二、修改IK分词器插件源码

2. 1修改pom文件

<!--mysql驱动-->
<dependency>
	<groupId>mysql</groupId>
	<artifactId>mysql-connector-java</artifactId>
	<version>8.0.29</version>
</dependency>

2.2 新增DatabaseMonitor类

这里新增一个关于MySQL的类,源码中有关于远程文本文件的热更新源码,我们这边仿照源码来写一就可以啦。

package org.wltea.analyzer.dic;

import org.apache.logging.log4j.Logger;
import org.elasticsearch.SpecialPermission;
import org.wltea.analyzer.help.ESPluginLoggerFactory;

import java.security.AccessController;
import java.security.PrivilegedAction;
import java.sql.*;
import java.time.LocalDate;
import java.time.LocalDateTime;
import java.time.LocalTime;

public class DatabaseMonitor implements Runnable {

    private static final Logger logger = ESPluginLoggerFactory.getLogger(DatabaseMonitor.class.getName());
    public static final String PATH_JDBC_PROPERTIES = "jdbc.properties";

    private static final String JDBC_URL = "jdbc.url";
    private static final String JDBC_USERNAME = "jdbc.username";
    private static final String JDBC_PASSWORD = "jdbc.password";
    private static final String JDBC_DRIVER = "jdbc.driver";
    private static final String SQL_UPDATE_MAIN_DIC = "jdbc.update.main.dic.sql";
    private static final String SQL_UPDATE_STOPWORD = "jdbc.update.stopword.sql";

    /**
     * 更新间隔
     */
    public final static String JDBC_UPDATE_INTERVAL = "jdbc.update.interval";

    private static final Timestamp DEFAULT_LAST_UPDATE = Timestamp.valueOf(LocalDateTime.of(LocalDate.of(2020, 1, 1), LocalTime.MIN));

    private static Timestamp lastUpdateTimeOfMainDic = null;

    private static Timestamp lastUpdateTimeOfStopword = null;

    public String getUrl() {
        return Dictionary.getSingleton().getProperty(JDBC_URL);
    }

    public String getUsername() {
        return Dictionary.getSingleton().getProperty(JDBC_USERNAME);
    }

    public String getPassword() {
        return Dictionary.getSingleton().getProperty(JDBC_PASSWORD);
    }

    public String getDriver() {
        return Dictionary.getSingleton().getProperty(JDBC_DRIVER);
    }

    public String getUpdateMainDicSql() {
        return Dictionary.getSingleton().getProperty(SQL_UPDATE_MAIN_DIC);
    }

    public String getUpdateStopwordSql() {
        return Dictionary.getSingleton().getProperty(SQL_UPDATE_STOPWORD);
    }

    /**
     * 加载MySQL驱动
     */
    public DatabaseMonitor() {
        SpecialPermission.check();
        AccessController.doPrivileged((PrivilegedAction<Void>) () -> {
            try {
                Class.forName(getDriver());
            } catch (ClassNotFoundException e) {
                logger.error("mysql jdbc driver not found", e);
            }
            return null;
        });


    }

    @Override
    public void run() {
        SpecialPermission.check();
        AccessController.doPrivileged((PrivilegedAction<Void>) () -> {
            Connection conn = getConnection();

            // 更新主词典
            updateMainDic(conn);
            // 更新停用词
            updateStopword(conn);
            closeConnection(conn);

            return null;
        });

    }

    public Connection getConnection() {
        Connection connection = null;
        try {
            connection = DriverManager.getConnection(getUrl(), getUsername(), getPassword());
        } catch (SQLException e) {
            logger.error("failed to get connection", e);
        }
        return connection;
    }

    public void closeConnection(Connection conn) {
        if (conn != null) {
            try {
                conn.close();
            } catch (SQLException e) {
                logger.error("failed to close Connection", e);
            }
        }
    }

    public void closeRsAndPs(ResultSet rs, PreparedStatement ps) {
        if (rs != null) {
            try {
                rs.close();
            } catch (SQLException e) {
                logger.error("failed to close ResultSet", e);
            }
        }

        if (ps != null) {
            try {
                ps.close();
            } catch (SQLException e) {
                logger.error("failed to close PreparedStatement", e);
            }
        }

    }

    /**
     * 主词典
     */
    public synchronized void updateMainDic(Connection conn) {

        logger.info("start update main dic");
        int numberOfAddWords = 0;
        int numberOfDisableWords = 0;
        PreparedStatement ps = null;
        ResultSet rs = null;

        try {
            String sql = getUpdateMainDicSql();

            Timestamp param = lastUpdateTimeOfMainDic == null ? DEFAULT_LAST_UPDATE : lastUpdateTimeOfMainDic;

            logger.info("param: " + param);

            ps = conn.prepareStatement(sql);
            ps.setTimestamp(1, param);

            rs = ps.executeQuery();

            while (rs.next()) {
                String word = rs.getString("word");
                word = word.trim();

                if (word.isEmpty()) {
                    continue;
                }

                lastUpdateTimeOfMainDic = rs.getTimestamp("update_time");

                if (rs.getBoolean("is_deleted")) {
                    logger.info("[main dic] disable word: {}", word);
                    // 删除
                    Dictionary.disableWord(word);
                    numberOfDisableWords++;
                } else {
                    logger.info("[main dic] add word: {}", word);
                    // 添加
                    Dictionary.addWord(word);
                    numberOfAddWords++;
                }
            }

            logger.info("end update main dic -> addWord: {}, disableWord: {}", numberOfAddWords, numberOfDisableWords);

        } catch (SQLException e) {
            logger.error("failed to update main_dic", e);
            // 关闭 ResultSet、PreparedStatement
            closeRsAndPs(rs, ps);
        }
    }

    /**
     * 停用词
     */
    public synchronized void updateStopword(Connection conn) {

        logger.info("start update stopword");

        int numberOfAddWords = 0;
        int numberOfDisableWords = 0;
        PreparedStatement ps = null;
        ResultSet rs = null;
        try {
            String sql = getUpdateStopwordSql();

            Timestamp param = lastUpdateTimeOfStopword == null ? DEFAULT_LAST_UPDATE : lastUpdateTimeOfStopword;

            logger.info("param: " + param);

            ps = conn.prepareStatement(sql);
            ps.setTimestamp(1, param);

            rs = ps.executeQuery();

            while (rs.next()) {
                String word = rs.getString("word");
                word = word.trim();


                if (word.isEmpty()) {
                    continue;
                }

                lastUpdateTimeOfStopword = rs.getTimestamp("update_time");

                if (rs.getBoolean("is_deleted")) {
                    logger.info("[stopword] disable word: {}", word);

                    // 删除
                    Dictionary.disableStopword(word);
                    numberOfDisableWords++;
                } else {
                    logger.info("[stopword] add word: {}", word);
                    // 添加
                    Dictionary.addStopword(word);
                    numberOfAddWords++;
                }
            }

            logger.info("end update stopword -> addWord: {}, disableWord: {}", numberOfAddWords, numberOfDisableWords);

        } catch (SQLException e) {
            logger.error("failed to update main_dic", e);
        } finally {
            // 关闭 ResultSet、PreparedStatement
            closeRsAndPs(rs, ps);
        }
    }
}



2.3 修改代码

初始化方法中新增加载JDBC的方法和将getProperty改为public

1691978293978

并且在Dictionary类后面新增下面的方法

	/**
	 * 加载新词条
	 */
	public static void addWord(String word) {
		singleton._MainDict.fillSegment(word.trim().toLowerCase().toCharArray());
	}

	/**
	 * 移除(屏蔽)词条
	 */
	public static void disableWord(String word) {
		singleton._MainDict.disableSegment(word.trim().toLowerCase().toCharArray());
	}

	/**
	 * 加载新停用词
	 */
	public static void addStopword(String word) {
		singleton._StopWords.fillSegment(word.trim().toLowerCase().toCharArray());
	}

	/**
	 * 移除(屏蔽)停用词
	 */
	public static void disableStopword(String word) {
		singleton._StopWords.disableSegment(word.trim().toLowerCase().toCharArray());
	}

	/**
	 * 加载 jdbc.properties
	 */
	public void loadJdbcProperties() {
		Path file = PathUtils.get(getDictRoot(), DatabaseMonitor.PATH_JDBC_PROPERTIES);
		try {
			props.load(new FileInputStream(file.toFile()));
			logger.info("====================================properties====================================");
			for (Map.Entry<Object, Object> entry : props.entrySet()) {
				logger.info("{}: {}", entry.getKey(), entry.getValue());
			}
			logger.info("====================================properties====================================");
		} catch (IOException e) {
			logger.error("failed to read file: " + DatabaseMonitor.PATH_JDBC_PROPERTIES, e);
		}
	}



三、修改插件的权限

1691978533490

grant {
  // needed because of the hot reload functionality
  permission java.net.SocketPermission "*", "connect,resolve";
  permission java.lang.RuntimePermission "setContextClassLoader";
};

四、打包

4.1 加入依赖

将MySQL的jar包依赖加入进来,否则打包会缺少jar包保持错。

1691978585532

<include>mysql:mysql-connector-java</include>

4.2 package

打包成zip文件,然后加压成文件夹

1691978891388

五、安装

将解压的文件夹放到ES的plugins目录下,然后配置一下config目录下的数据库配置信息,最后再重启一下ES即可完成安装。

1691982824263

六、测试验证

在数据库表中中新增下面自己的想要的关键词,然后去Kibana中做测试验证,可以发现已经可以啦。

关键词

1691983160572

停止词

image-20230814142403389

POST _analyze
{
  "text": ["俺是熊二呗"], 
  "analyzer": "ik_max_word"
}

运行结果

1691983228927

运行结果



这篇关于ik分词器采用MySQL热更新的文章就介绍到这儿,希望我们推荐的文章对大家有所帮助,也希望大家多多支持为之网!


扫一扫关注最新编程教程