diff --git a/common/liboceanbase_jni_common/CMakeLists.txt b/common/liboceanbase_jni_common/CMakeLists.txt new file mode 100644 index 0000000..00b5701 --- /dev/null +++ b/common/liboceanbase_jni_common/CMakeLists.txt @@ -0,0 +1,34 @@ +cmake_minimum_required(VERSION 3.10) +project(oceanbase_jni_common LANGUAGES CXX) + +# Find JNI +FIND_PACKAGE(JNI REQUIRED COMPONENTS JVM) + +# Add library +ADD_LIBRARY(${PROJECT_NAME} SHARED jni_manager.cpp) + +# Include directories +TARGET_INCLUDE_DIRECTORIES(${PROJECT_NAME} PUBLIC + $ + $ + ${JNI_INCLUDE_DIRS} +) + +# Link JNI +TARGET_LINK_LIBRARIES(${PROJECT_NAME} PRIVATE ${JNI_LIBRARIES}) + +# Set C++ standard +SET_TARGET_PROPERTIES(${PROJECT_NAME} PROPERTIES + CXX_STANDARD 11 + CXX_STANDARD_REQUIRED ON + CXX_VISIBILITY_PRESET default +) + +# Install +install(FILES jni_manager.h DESTINATION include) +install(TARGETS ${PROJECT_NAME} + LIBRARY DESTINATION lib + ARCHIVE DESTINATION lib + RUNTIME DESTINATION bin +) + diff --git a/common/liboceanbase_jni_common/README.en-US.md b/common/liboceanbase_jni_common/README.en-US.md new file mode 100644 index 0000000..7009921 --- /dev/null +++ b/common/liboceanbase_jni_common/README.en-US.md @@ -0,0 +1,105 @@ +# OceanBase JNI Common Management Library + +A common JNI management library for OceanBase multi-language tokenization plugins, providing unified JVM management and thread-safe JNI environment. 
+ +## Features + +- ✅ Singleton JVM Management: Multiple plugins share the same JVM instance +- ✅ Thread Safety: Automatic JNI thread attach/detach handling +- ✅ Reference Counting: Smart JNI environment lifecycle management +- ✅ RAII Pattern: Automatic resource management, prevents memory leaks + +## Core Components + +### GlobalJVMManager +```cpp +// Process-wide JVM manager (all members are static; the class cannot be instantiated) +JavaVM* jvm = GlobalJVMManager::get_or_create_jvm(JNIConfigUtils::get_unified_classpath()); +``` + +### GlobalThreadManager +```cpp +// Thread-safe, reference-counted JNI environment management +JNIEnv* env = GlobalThreadManager::acquire_jni_env_for_plugin(jvm, "my_plugin"); +GlobalThreadManager::release_jni_env_for_plugin(jvm, "my_plugin"); +``` + +### ScopedJNIEnvironment +```cpp +// RAII automatic JNI environment management +{ + ScopedJNIEnvironment scoped_env("my_plugin"); + JNIEnv* env = scoped_env.get(); + // Automatically cleaned up when scope ends +} +``` + +## Use Cases + +### Multi-Plugin Coexistence +- **Japanese Parser Plugin** (`libjapanese_ftparser.so`) +- **Korean Parser Plugin** (`libkorean_ftparser.so`) +- **Thai Parser Plugin** (`libthai_ftparser.so`) + +All plugins share the same JVM instance, avoiding conflicts. + +### Thread Safety +Automatically handles JNI calls in multi-threaded environments: +```cpp +// Thread A +ScopedJNIEnvironment env_a("my_plugin"); // Auto attach +// Use env_a.get() + +// Thread B +ScopedJNIEnvironment env_b("my_plugin"); // Auto attach +// Use env_b.get() +// Threads don't interfere with each other +``` + +## Build + +```bash +cd common/liboceanbase_jni_common +mkdir build && cd build +cmake .. +make +``` + +Generates `liboceanbase_jni_common.so` shared library. 
+ +## Deployment + +```bash +# Copy to Observer plugin directory +cp liboceanbase_jni_common.so /path/to/observer/plugin_dir/ + +# Ensure all tokenization plugins can find this library +export LD_LIBRARY_PATH=/path/to/observer/plugin_dir:$LD_LIBRARY_PATH +``` + +## Technical Advantages + +### Core Problems Solved +1. **JVM Conflicts**: Conflicts caused by multiple plugins trying to create JVM +2. **Memory Leaks**: Improper JNI environment release +3. **Thread Safety**: JNI call issues in multi-threaded environments +4. **Resource Management**: Complex JNI lifecycle management + +### Design Patterns +- **Singleton Pattern**: Ensures globally unique JVM +- **RAII Pattern**: Automatic resource management +- **Reference Counting**: Smart pointer-style environment management +- **Thread Local Storage**: Per-thread independent JNI environment + +## Dependencies + +``` +liboceanbase_jni_common.so +├── libjapanese_ftparser.so +├── libkorean_ftparser.so +└── libthai_ftparser.so +``` + +**Providing stable and reliable JNI infrastructure for OceanBase multi-language fulltext search** diff --git a/common/liboceanbase_jni_common/README.zh-CN.md b/common/liboceanbase_jni_common/README.zh-CN.md new file mode 100644 index 0000000..02c0490 --- /dev/null +++ b/common/liboceanbase_jni_common/README.zh-CN.md @@ -0,0 +1,105 @@ +# OceanBase JNI 公共管理库 + +OceanBase 多语言分词插件的公共 JNI 管理库,提供统一的 JVM 管理和线程安全的 JNI 环境。 + +## 功能特性 + +- ✅ 单例 JVM 管理:多个插件共享同一个 JVM 实例 +- ✅ 线程安全:自动处理 JNI 线程附加/分离 +- ✅ 引用计数:智能管理 JNI 环境的生命周期 +- ✅ RAII 模式:自动资源管理,防止内存泄漏 + +## 核心组件 + +### GlobalJVMManager +```cpp +// 全局单例 JVM 管理器 +GlobalJVMManager& jvm_manager = GlobalJVMManager::getInstance(); +JavaVM* jvm = jvm_manager.getJVM(); +``` + +### GlobalThreadManager +```cpp +// 线程安全的 JNI 环境管理 +GlobalThreadManager& thread_manager = GlobalThreadManager::getInstance(); +JNIEnv* env = thread_manager.attachCurrentThread(); +thread_manager.detachCurrentThread(); +``` + +### ScopedJNIEnvironment +```cpp +// RAII 自动管理 JNI 环境 +{ + 
ScopedJNIEnvironment scoped_env; + JNIEnv* env = scoped_env.getEnv(); + // 自动在作用域结束时清理 +} +``` + +## 使用场景 + +### 多插件共存 +- **日语分词插件** (`libjapanese_ftparser.so`) +- **韩语分词插件** (`libkorean_ftparser.so`) +- **泰语分词插件** (`libthai_ftparser.so`) + +所有插件共享同一个 JVM 实例,避免冲突。 + +### 线程安全 +自动处理多线程环境下的 JNI 调用: +```cpp +// 线程 A +ScopedJNIEnvironment env_a; // 自动附加 +// 使用 env_a.getEnv() + +// 线程 B +ScopedJNIEnvironment env_b; // 自动附加 +// 使用 env_b.getEnv() +// 两个线程互不干扰 +``` + +## 编译 + +```bash +cd common/liboceanbase_jni_common +mkdir build && cd build +cmake .. +make +``` + +生成 `liboceanbase_jni_common.so` 共享库。 + +## 部署 + +```bash +# 复制到 Observer 插件目录 +cp liboceanbase_jni_common.so /path/to/observer/plugin_dir/ + +# 确保所有分词插件都能找到此库 +export LD_LIBRARY_PATH=/path/to/observer/plugin_dir:$LD_LIBRARY_PATH +``` + +## 技术优势 + +### 解决的核心问题 +1. **JVM 冲突**:多个插件试图创建 JVM 导致的冲突 +2. **内存泄漏**:JNI 环境未正确释放 +3. **线程安全**:多线程环境下的 JNI 调用问题 +4. **资源管理**:复杂的 JNI 生命周期管理 + +### 设计模式 +- **单例模式**:确保 JVM 全局唯一 +- **RAII 模式**:自动资源管理 +- **引用计数**:智能指针式的环境管理 +- **线程局部存储**:每线程独立的 JNI 环境 + +## 依赖关系 + +``` +liboceanbase_jni_common.so +├── libjapanese_ftparser.so +├── libkorean_ftparser.so +└── libthai_ftparser.so +``` + +**为 OceanBase 多语言全文检索提供稳定可靠的 JNI 基础设施** diff --git a/common/liboceanbase_jni_common/jni_manager.cpp b/common/liboceanbase_jni_common/jni_manager.cpp new file mode 100644 index 0000000..2abdc9f --- /dev/null +++ b/common/liboceanbase_jni_common/jni_manager.cpp @@ -0,0 +1,519 @@ +/** + * Copyright (c) 2023 OceanBase + * OceanBase JNI Common Library - Implementation + */ + +#include "jni_manager.h" +#include +#include +#include +#include +#include + +// Use OceanBase plugin logging framework +#include "oceanbase/ob_plugin_log.h" + +// Define ERROR level logging since it's not provided by OceanBase +#define OBP_LOG_ERROR(fmt, ...) 
\ + do { \ + printf("[ERROR][JNI_COMMON] " fmt "\n", ##__VA_ARGS__); \ + fflush(stdout); \ + } while(0) + +namespace oceanbase { +namespace jni { + +// JNIConfigUtils implementation +std::string JNIConfigUtils::build_dynamic_classpath(const std::string& base_dir) { + std::string lib_dir = base_dir + "/lib"; + std::vector jar_files; + + // Open directory + DIR* dir = opendir(lib_dir.c_str()); + if (dir == nullptr) { + // Directory doesn't exist, return fallback classpath + return base_dir + "/lib/lucene-core-8.11.2.jar:" + + base_dir + "/lib/lucene-analyzers-common-8.11.2.jar:" + + base_dir + "/lib/lucene-analyzers-kuromoji-8.11.2.jar:" + + base_dir + "/lib/lucene-analyzers-nori-8.11.2.jar:" + + ":" + base_dir; + } + + // Read directory entries + struct dirent* entry; + while ((entry = readdir(dir)) != nullptr) { + std::string filename = entry->d_name; + + // Check if it's a .jar file + if (filename.length() > 4 && + filename.substr(filename.length() - 4) == ".jar") { + jar_files.push_back(lib_dir + "/" + filename); + } + } + closedir(dir); + + // Sort jar files for consistent ordering + std::sort(jar_files.begin(), jar_files.end()); + + // Build classpath + std::string classpath; + for (const auto& jar : jar_files) { + if (!classpath.empty()) { + classpath += ":"; + } + classpath += jar; + } + + // Add base directory for .class files + if (!classpath.empty()) { + classpath += ":"; + } + classpath += base_dir; + + return classpath; +} + +std::string JNIConfigUtils::get_unified_classpath() { + // Check global environment variable first + const char* env_classpath = std::getenv("OCEANBASE_JNI_CLASSPATH"); + if (env_classpath && strlen(env_classpath) > 0) { + return std::string(env_classpath); + } + + // Use dynamic classpath building + return build_dynamic_classpath("./java"); +} + +size_t JNIConfigUtils::get_unified_max_heap_mb() { + const char* env_max_heap = std::getenv("OCEANBASE_JNI_MAX_HEAP"); + if (env_max_heap && strlen(env_max_heap) > 0) { + return 
static_cast(std::atoi(env_max_heap)); + } + return 512; // Unified default: 512MB +} + +size_t JNIConfigUtils::get_unified_init_heap_mb() { + const char* env_init_heap = std::getenv("OCEANBASE_JNI_INIT_HEAP"); + if (env_init_heap && strlen(env_init_heap) > 0) { + return static_cast(std::atoi(env_init_heap)); + } + return 128; // Unified default: 128MB +} + +// GlobalJVMManager static members +std::mutex GlobalJVMManager::global_mutex_; +JavaVM* GlobalJVMManager::shared_jvm_ = nullptr; +std::atomic GlobalJVMManager::plugin_count_{0}; +bool GlobalJVMManager::jvm_created_by_us_ = false; +std::unordered_set GlobalJVMManager::registered_plugins_; +std::string GlobalJVMManager::first_instance_classpath_; +size_t GlobalJVMManager::first_instance_max_heap_mb_ = 0; +size_t GlobalJVMManager::first_instance_init_heap_mb_ = 0; +bool GlobalJVMManager::config_recorded_ = false; + +// GlobalThreadManager static members +std::mutex GlobalThreadManager::thread_mutex_; +std::unordered_map GlobalThreadManager::global_thread_ref_count_; +std::unordered_set GlobalThreadManager::attached_threads_; +// TODO: thread_plugin_map_ is currently not utilized, kept for future debugging/monitoring needs +// std::unordered_map> GlobalThreadManager::thread_plugin_map_; + +JavaVM* GlobalJVMManager::get_or_create_jvm(const std::string& classpath, + size_t max_heap_mb, + size_t init_heap_mb) { + std::lock_guard lock(global_mutex_); + + // Validate configuration consistency (with warnings if mismatch) + validate_config_consistency(classpath, max_heap_mb, init_heap_mb); + + // If JVM already exists, return it + if (shared_jvm_) { + OBP_LOG_INFO("Using existing global JVM instance"); + return shared_jvm_; + } + + // Try to get existing JVM from the process + jsize jvm_count = 0; + jint result = JNI_GetCreatedJavaVMs(&shared_jvm_, 1, &jvm_count); + + if (result == JNI_OK && jvm_count > 0) { + // Found existing JVM + OBP_LOG_INFO("Found existing JVM in process, reusing it"); + jvm_created_by_us_ = false; 
+ return shared_jvm_; + } + + // Create new JVM + OBP_LOG_INFO("Creating new JVM with classpath: %s", classpath.c_str()); + + JavaVMInitArgs vm_args; + JavaVMOption options[5]; + + // Create persistent copies of option strings to avoid dangling pointers + static std::string classpath_option = "-Djava.class.path=" + classpath; + static std::string max_heap_option = "-Xmx" + std::to_string(max_heap_mb) + "m"; + static std::string init_heap_option = "-Xms" + std::to_string(init_heap_mb) + "m"; + + options[0].optionString = const_cast(classpath_option.c_str()); + options[1].optionString = const_cast(max_heap_option.c_str()); + options[2].optionString = const_cast(init_heap_option.c_str()); + options[3].optionString = const_cast("-XX:+UseG1GC"); + options[4].optionString = const_cast("-Dfile.encoding=UTF-8"); + + vm_args.version = JNI_VERSION_1_8; + vm_args.nOptions = 5; + vm_args.options = options; + vm_args.ignoreUnrecognized = JNI_FALSE; + + JNIEnv* env = nullptr; + result = JNI_CreateJavaVM(&shared_jvm_, (void**)&env, &vm_args); + + if (result == JNI_OK) { + jvm_created_by_us_ = true; + OBP_LOG_INFO("JVM created successfully"); + return shared_jvm_; + } else { + OBP_LOG_ERROR("Failed to create JVM, error code: %d", result); + shared_jvm_ = nullptr; + return nullptr; + } +} + +void GlobalJVMManager::register_plugin(const std::string& plugin_name) { + std::lock_guard lock(global_mutex_); + + if (registered_plugins_.insert(plugin_name).second) { + int count = ++plugin_count_; + OBP_LOG_INFO("Plugin '%s' registered, total count: %d", plugin_name.c_str(), count); + } else { + OBP_LOG_WARN("Plugin '%s' already registered", plugin_name.c_str()); + } +} + +void GlobalJVMManager::unregister_plugin(const std::string& plugin_name) { + std::lock_guard lock(global_mutex_); + + if (registered_plugins_.erase(plugin_name) > 0) { + int count = --plugin_count_; + OBP_LOG_INFO("Plugin '%s' unregistered, remaining count: %d", plugin_name.c_str(), count); + + if (count == 0) { + 
OBP_LOG_INFO("Last plugin unregistered, keeping JVM alive for stability"); + } + } else { + OBP_LOG_WARN("Plugin '%s' was not registered", plugin_name.c_str()); + } +} + +// Number of currently registered plugins (atomic read, no lock taken). +int GlobalJVMManager::get_plugin_count() { + return plugin_count_.load(); +} + +// Destroys the JVM only when this library created it; a JVM that was found +// already running in the process is left untouched. +void GlobalJVMManager::force_shutdown_jvm() { + std::lock_guard<std::mutex> lock(global_mutex_); + if (shared_jvm_ && jvm_created_by_us_) { + OBP_LOG_WARN("Force shutting down JVM"); + shared_jvm_->DestroyJavaVM(); + shared_jvm_ = nullptr; + jvm_created_by_us_ = false; + // Forget the recorded configuration as well, otherwise later calls to + // get_or_create_jvm would be compared against the settings of a JVM that + // no longer exists. + config_recorded_ = false; + } +} + +// Returns the cached JVM pointer, or nullptr if none has been obtained yet. +JavaVM* GlobalJVMManager::get_jvm() { + std::lock_guard<std::mutex> lock(global_mutex_); + return shared_jvm_; +} + +// Records the first configuration seen and compares every later one against it. +// Differences are logged as warnings; returns false if any field differs. +// Takes no lock itself: the only visible caller, get_or_create_jvm, already holds global_mutex_. +bool GlobalJVMManager::validate_config_consistency(const std::string& classpath, + size_t max_heap_mb, + size_t init_heap_mb) { + if (!config_recorded_) { + // First instance - record the configuration + first_instance_classpath_ = classpath; + first_instance_max_heap_mb_ = max_heap_mb; + first_instance_init_heap_mb_ = init_heap_mb; + config_recorded_ = true; + OBP_LOG_INFO("JVM configuration recorded: classpath=%s, max_heap=%zuMB, init_heap=%zuMB", + classpath.c_str(), max_heap_mb, init_heap_mb); + return true; + } + + // Subsequent instances - validate consistency + bool is_consistent = true; + + if (classpath != first_instance_classpath_) { + OBP_LOG_WARN("JVM classpath mismatch detected:"); + OBP_LOG_WARN(" First instance: %s", first_instance_classpath_.c_str()); + OBP_LOG_WARN(" Current instance: %s", classpath.c_str()); + is_consistent = false; + } + + if (max_heap_mb != first_instance_max_heap_mb_) { + OBP_LOG_WARN("JVM max heap size mismatch: first=%zuMB, current=%zuMB", + first_instance_max_heap_mb_, max_heap_mb); + is_consistent = false; + } + + if (init_heap_mb != first_instance_init_heap_mb_) { + OBP_LOG_WARN("JVM init heap size mismatch: first=%zuMB, current=%zuMB", + first_instance_init_heap_mb_, init_heap_mb); + is_consistent = false; + } + + return is_consistent; +} + +JNIEnv* GlobalThreadManager::acquire_jni_env_for_plugin(JavaVM* jvm, 
const std::string& plugin_name) { + if (!jvm) { + OBP_LOG_ERROR("JVM is null"); + return nullptr; + } + + std::lock_guard lock(thread_mutex_); + std::thread::id current_thread_id = std::this_thread::get_id(); + + JNIEnv* env = nullptr; + jint result = jvm->GetEnv((void**)&env, JNI_VERSION_1_8); + + if (result == JNI_OK) { + // Thread already attached, increase global reference count + global_thread_ref_count_[current_thread_id]++; + OBP_LOG_INFO("[%s] Thread %p already attached, global ref count: %d", + plugin_name.c_str(), ¤t_thread_id, global_thread_ref_count_[current_thread_id]); + return env; + } else if (result == JNI_EDETACHED) { + // Need to attach thread + result = jvm->AttachCurrentThread((void**)&env, nullptr); + if (result == JNI_OK) { + attached_threads_.insert(current_thread_id); + global_thread_ref_count_[current_thread_id] = 1; + OBP_LOG_INFO("[%s] Thread %p attached to JVM, global ref count: 1", + plugin_name.c_str(), ¤t_thread_id); + return env; + } else { + OBP_LOG_ERROR("[%s] Failed to attach thread %p to JVM, error: %d", + plugin_name.c_str(), ¤t_thread_id, result); + return nullptr; + } + } else { + OBP_LOG_ERROR("[%s] Unexpected JVM GetEnv result: %d", plugin_name.c_str(), result); + return nullptr; + } +} + +void GlobalThreadManager::release_jni_env_for_plugin(JavaVM* jvm, const std::string& plugin_name) { + if (!jvm) { + return; + } + + std::lock_guard lock(thread_mutex_); + std::thread::id current_thread_id = std::this_thread::get_id(); + + auto ref_it = global_thread_ref_count_.find(current_thread_id); + if (ref_it != global_thread_ref_count_.end()) { + ref_it->second--; + + OBP_LOG_INFO("[%s] Thread %p global ref count decreased to: %d", + plugin_name.c_str(), ¤t_thread_id, ref_it->second); + + if (ref_it->second <= 0) { + // Global reference count reached zero, detach thread + if (attached_threads_.count(current_thread_id) > 0) { + OBP_LOG_INFO("[%s] Thread %p detaching from JVM", plugin_name.c_str(), ¤t_thread_id); + 
jvm->DetachCurrentThread(); + attached_threads_.erase(current_thread_id); + } + global_thread_ref_count_.erase(ref_it); + } + } else { + OBP_LOG_WARN("[%s] Thread %p was not found in global reference count", + plugin_name.c_str(), ¤t_thread_id); + } +} + +int GlobalThreadManager::get_thread_ref_count(std::thread::id tid) { + std::lock_guard lock(thread_mutex_); + auto it = global_thread_ref_count_.find(tid); + return (it != global_thread_ref_count_.end()) ? it->second : 0; +} + +int GlobalThreadManager::get_attached_thread_count() { + std::lock_guard lock(thread_mutex_); + return static_cast(attached_threads_.size()); +} + +ScopedJNIEnvironment::ScopedJNIEnvironment(const std::string& plugin_name, + const std::string& classpath, + size_t max_heap_mb, + size_t init_heap_mb) + : env_(nullptr), plugin_name_(plugin_name), is_valid_(false) { + + OBP_LOG_INFO("[%s] ScopedJNIEnvironment constructor called", plugin_name.c_str()); + + JavaVM* jvm = nullptr; + + if (!classpath.empty()) { + // Use provided classpath (for backward compatibility) + OBP_LOG_INFO("[%s] Creating/getting JVM with provided classpath", plugin_name.c_str()); + jvm = GlobalJVMManager::get_or_create_jvm(classpath, max_heap_mb, init_heap_mb); + } else { + // Use unified configuration from JNIConfigUtils + OBP_LOG_INFO("[%s] Creating/getting JVM with unified configuration", plugin_name.c_str()); + jvm = GlobalJVMManager::get_or_create_jvm( + JNIConfigUtils::get_unified_classpath(), + JNIConfigUtils::get_unified_max_heap_mb(), + JNIConfigUtils::get_unified_init_heap_mb()); + } + + if (jvm) { + OBP_LOG_INFO("[%s] Acquiring JNI environment", plugin_name.c_str()); + env_ = GlobalThreadManager::acquire_jni_env_for_plugin(jvm, plugin_name); + is_valid_ = (env_ != nullptr); + OBP_LOG_INFO("[%s] ScopedJNIEnvironment %s", plugin_name.c_str(), is_valid_ ? 
"SUCCESS" : "FAILED"); + } else { + OBP_LOG_ERROR("[%s] JVM is null, cannot acquire JNI environment", plugin_name.c_str()); + } +} + +ScopedJNIEnvironment::~ScopedJNIEnvironment() { + OBP_LOG_INFO("[%s] ScopedJNIEnvironment destructor called", plugin_name_.c_str()); + if (env_) { + JavaVM* jvm = GlobalJVMManager::get_jvm(); + if (jvm) { + OBP_LOG_INFO("[%s] Releasing JNI environment", plugin_name_.c_str()); + GlobalThreadManager::release_jni_env_for_plugin(jvm, plugin_name_); + } + } + OBP_LOG_INFO("[%s] ScopedJNIEnvironment destructor completed", plugin_name_.c_str()); +} + +jstring JNIUtils::cpp_string_to_jstring(JNIEnv* env, const std::string& str) { + if (!env) { + return nullptr; + } + + jstring jstr = env->NewStringUTF(str.c_str()); + std::string error_msg; + if (check_and_handle_exception(env, error_msg)) { + OBP_LOG_ERROR("Failed to create Java string: %s", error_msg.c_str()); + return nullptr; + } + + return jstr; +} + +std::string JNIUtils::jstring_to_cpp_string(JNIEnv* env, jstring jstr) { + if (!env || !jstr) { + return std::string(); + } + + const char* chars = env->GetStringUTFChars(jstr, nullptr); + if (!chars) { + std::string error_msg; + check_and_handle_exception(env, error_msg); + return std::string(); + } + + std::string result(chars); + env->ReleaseStringUTFChars(jstr, chars); + + return result; +} + +int JNIUtils::jstring_array_to_cpp_vector(JNIEnv* env, jobjectArray jarray, + std::vector& result) { + if (!env || !jarray) { + return -1; + } + + jsize length = env->GetArrayLength(jarray); + std::string error_msg; + if (check_and_handle_exception(env, error_msg)) { + OBP_LOG_ERROR("Failed to get array length: %s", error_msg.c_str()); + return -1; + } + + result.clear(); + result.reserve(length); + + // Process array in batches to prevent local reference accumulation + const jsize BATCH_SIZE = 32; + + for (jsize start = 0; start < length; start += BATCH_SIZE) { + if (env->PushLocalFrame(BATCH_SIZE) < 0) { + OBP_LOG_ERROR("Failed to push JNI local 
reference frame"); + return -1; + } + + jsize end = (start + BATCH_SIZE < length) ? start + BATCH_SIZE : length; + + for (jsize i = start; i < end; i++) { + jstring jstr = static_cast<jstring>(env->GetObjectArrayElement(jarray, i)); + if (check_and_handle_exception(env, error_msg)) { + OBP_LOG_ERROR("Failed to get array element %d: %s", i, error_msg.c_str()); + env->PopLocalFrame(nullptr); + return -1; + } + + // Null elements are skipped; the element's local ref is released by PopLocalFrame + if (jstr) { + result.push_back(jstring_to_cpp_string(env, jstr)); + } + } + + env->PopLocalFrame(nullptr); + } + + return 0; +} + +// Returns false when env is null or no exception is pending. Otherwise clears the +// pending exception, stores its toString() text in error_message when that can be +// obtained, and returns true. No exception is left pending on return. +bool JNIUtils::check_and_handle_exception(JNIEnv* env, std::string& error_message) { + if (!env || !env->ExceptionCheck()) { + return false; + } + + // Get exception object, then clear it so further JNI calls are permitted + jthrowable exception = env->ExceptionOccurred(); + env->ExceptionClear(); + + if (exception) { + // Get exception class and toString method + jclass throwable_class = env->GetObjectClass(exception); + jmethodID to_string_method = env->GetMethodID(throwable_class, "toString", "()Ljava/lang/String;"); + + if (to_string_method) { + jstring exception_string = static_cast<jstring>(env->CallObjectMethod(exception, to_string_method)); + if (env->ExceptionCheck()) { + // toString() itself threw: drop that secondary exception; the returned + // reference must not be used in this case + env->ExceptionClear(); + } else if (exception_string) { + error_message = jstring_to_cpp_string(env, exception_string); + OBP_LOG_WARN("Java exception occurred: %s", error_message.c_str()); + env->DeleteLocalRef(exception_string); + } + } else if (env->ExceptionCheck()) { + // A failed GetMethodID leaves its own exception pending + env->ExceptionClear(); + } + + // Clean up local references + env->DeleteLocalRef(throwable_class); + env->DeleteLocalRef(exception); + } + + return true; +} + +// Returns the result of java.lang.Class#getName() for clazz, or an empty string when +// the name cannot be obtained. Any Java exception raised along the way is cleared. +std::string JNIUtils::get_class_name(JNIEnv* env, jclass clazz) { + if (!env || !clazz) { + return ""; + } + + std::string ignored_error; + jclass class_class = env->FindClass("java/lang/Class"); + if (!class_class) { + // FindClass failed; never pass a null class on to GetMethodID + check_and_handle_exception(env, ignored_error); + return ""; + } + + std::string class_name; + jmethodID get_name_method = env->GetMethodID(class_class, "getName", "()Ljava/lang/String;"); + + if (get_name_method) { + jstring class_name_jstr = static_cast<jstring>(env->CallObjectMethod(clazz, get_name_method)); + if (check_and_handle_exception(env, ignored_error)) { + // getName() threw; the returned reference must not be used + } else if (class_name_jstr) { + class_name = jstring_to_cpp_string(env, class_name_jstr); + env->DeleteLocalRef(class_name_jstr); + } + } else { + check_and_handle_exception(env, ignored_error); + } + + // Release the local reference obtained from FindClass + env->DeleteLocalRef(class_class); + return class_name; +} + +} // namespace jni +} // namespace oceanbase diff --git 
a/common/liboceanbase_jni_common/jni_manager.h b/common/liboceanbase_jni_common/jni_manager.h new file mode 100644 index 0000000..bb16c9f --- /dev/null +++ b/common/liboceanbase_jni_common/jni_manager.h @@ -0,0 +1,256 @@ +/** + * Copyright (c) 2023 OceanBase + * OceanBase JNI Common Library - Global JVM and Thread Management + */ + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace oceanbase { +namespace jni { + +/** + * JNI Configuration Utilities + * @brief Provides unified configuration management for all plugins + */ +class JNIConfigUtils { +public: + /** + * Get unified Java classpath + * @return Classpath string, checks OCEANBASE_JNI_CLASSPATH env var first + */ + static std::string get_unified_classpath(); + + /** + * Get unified JVM maximum heap size + * @return Max heap size in MB, checks OCEANBASE_JNI_MAX_HEAP env var first + */ + static size_t get_unified_max_heap_mb(); + + /** + * Get unified JVM initial heap size + * @return Initial heap size in MB, checks OCEANBASE_JNI_INIT_HEAP env var first + */ + static size_t get_unified_init_heap_mb(); + +private: + /** + * Build dynamic classpath by scanning directory + * @param base_dir Base directory (e.g., "./java") + * @return Dynamically built classpath string + */ + static std::string build_dynamic_classpath(const std::string& base_dir); +}; + +/** + * Global JVM Manager + * @brief Centralized JVM lifecycle management for all plugins with JNI environment. + * @details This class manages the global JVM instance and ensures + * proper creation, sharing, and cleanup across multiple plugin instances. 
+ */ +class GlobalJVMManager { +public: + /** + * Get or create the global JVM instance + * @param classpath Java classpath for JVM initialization + * @param max_heap_mb Maximum heap size in MB + * @param init_heap_mb Initial heap size in MB + * @return Pointer to the global JVM instance, or nullptr on failure + * @note When a JVM already exists (cached here or found in the process) it is + * returned as-is; the arguments are then only compared with the first + * recorded configuration and mismatches are logged as warnings. + */ + static JavaVM* get_or_create_jvm(const std::string& classpath, + size_t max_heap_mb = 512, + size_t init_heap_mb = 128); + + /** + * Register a plugin using the JVM + * @param plugin_name Name of the plugin (registering the same name twice only logs a warning) + */ + static void register_plugin(const std::string& plugin_name); + + /** + * Unregister a plugin + * @param plugin_name Name of the plugin + * @note The JVM is intentionally kept alive even after the last plugin unregisters. + */ + static void unregister_plugin(const std::string& plugin_name); + + /** + * Get the current plugin count + */ + static int get_plugin_count(); + + /** + * Force shutdown JVM (only for testing or emergency) + * @note Only destroys a JVM that was created by this library; otherwise it is a no-op. + */ + static void force_shutdown_jvm(); + + /** + * Get the global JVM instance (if exists) + * @return The cached JVM pointer, or nullptr when none has been obtained yet + */ + static JavaVM* get_jvm(); + +private: + static std::mutex global_mutex_; + static JavaVM* shared_jvm_; + static std::atomic<int> plugin_count_; + static bool jvm_created_by_us_; + static std::unordered_set<std::string> registered_plugins_; + + // Configuration consistency tracking + static std::string first_instance_classpath_; + static size_t first_instance_max_heap_mb_; + static size_t first_instance_init_heap_mb_; + static bool config_recorded_; + + // Helper method for configuration validation + static bool validate_config_consistency(const std::string& classpath, + size_t max_heap_mb, + size_t init_heap_mb); + + // Disable construction + GlobalJVMManager() = delete; + ~GlobalJVMManager() = delete; +}; + +/** + * Global Thread Manager + * @brief Manages JNI environment pointers for each thread globally + * @details Ensures proper thread attachment/detachment coordination + * across multiple plugins using reference counting. 
+ */ +class GlobalThreadManager { +public: + /** + * Acquire JNI environment for current thread for a plugin + * @param jvm The JVM instance to attach to + * @param plugin_name Name of the plugin requesting the environment + * @return JNI environment pointer for current thread, or nullptr on failure + */ + static JNIEnv* acquire_jni_env_for_plugin(JavaVM* jvm, const std::string& plugin_name); + + /** + * Release JNI environment for current thread for a plugin + * @param jvm The JVM instance to detach from + * @param plugin_name Name of the plugin releasing the environment + */ + static void release_jni_env_for_plugin(JavaVM* jvm, const std::string& plugin_name); + + /** + * Get the reference count for a specific thread + * @param tid Thread ID to query + * @return Reference count for the thread + */ + static int get_thread_ref_count(std::thread::id tid); + + /** + * Get the number of threads currently attached + */ + static int get_attached_thread_count(); + +private: + static std::mutex thread_mutex_; + static std::unordered_map global_thread_ref_count_; + static std::unordered_set attached_threads_; + // TODO: thread_plugin_map_ is currently not utilized, kept for future debugging/monitoring needs + // static std::unordered_map> thread_plugin_map_; + + // Disable construction + GlobalThreadManager() = delete; + ~GlobalThreadManager() = delete; +}; + +/** + * RAII-style JNI Environment Management + * @brief Automatic JNI environment acquisition and release + * @details This class provides RAII-style management of JNI environments + * with automatic cleanup and exception safety. 
+ */ +class ScopedJNIEnvironment { +private: + JNIEnv* env_; + std::string plugin_name_; + bool is_valid_; + +public: + /** + * Constructor - automatically acquires JNI environment + * @param plugin_name Name of the plugin + * @param classpath Java classpath (optional). When empty, the unified configuration + * from JNIConfigUtils is used for classpath and both heap sizes. + * @param max_heap_mb Maximum heap size in MB (only used when classpath is non-empty) + * @param init_heap_mb Initial heap size in MB (only used when classpath is non-empty) + */ + ScopedJNIEnvironment(const std::string& plugin_name, + const std::string& classpath = "", + size_t max_heap_mb = 512, + size_t init_heap_mb = 128); + + /** + * Destructor - automatically releases JNI environment + */ + ~ScopedJNIEnvironment(); + + /** + * Get the JNI environment pointer + * @return The acquired JNIEnv, or nullptr when acquisition failed + */ + JNIEnv* get() const { return env_; } + + /** + * Check if the environment is valid + */ + operator bool() const { return is_valid_; } + + /** + * Check if the environment is valid + */ + bool is_valid() const { return is_valid_; } + + // Disable copy and move + ScopedJNIEnvironment(const ScopedJNIEnvironment&) = delete; + ScopedJNIEnvironment& operator=(const ScopedJNIEnvironment&) = delete; + ScopedJNIEnvironment(ScopedJNIEnvironment&&) = delete; + ScopedJNIEnvironment& operator=(ScopedJNIEnvironment&&) = delete; +}; + +/** + * JNI Utility Functions + * @brief Common JNI utility functions + */ +class JNIUtils { +public: + /** + * Convert C++ string to Java string + * @return New local reference, or nullptr when env is null or a Java exception occurred + */ + static jstring cpp_string_to_jstring(JNIEnv* env, const std::string& str); + + /** + * Convert Java string to C++ string + * @return The converted string, or an empty string when env/jstr is null or the conversion fails + */ + static std::string jstring_to_cpp_string(JNIEnv* env, jstring jstr); + + /** + * Convert Java string array to C++ vector + * @param result Cleared and then filled with the non-null elements of jarray + * @return 0 on success, -1 on failure + */ + static int jstring_array_to_cpp_vector(JNIEnv* env, jobjectArray jarray, + std::vector<std::string>& result); + + /** + * Check and handle Java exceptions + * @param error_message Receives the exception's toString() text when it can be obtained + * @return true if an exception was pending (it is cleared), false otherwise + */ + static bool check_and_handle_exception(JNIEnv* env, std::string& error_message); + + /** + * Get class name from jclass + * @return Result of Class#getName(), or an empty string on failure + */ + static std::string get_class_name(JNIEnv* env, jclass clazz); +}; + +} // namespace jni +} 
// namespace oceanbase diff --git a/japanese_ftparser/CMakeLists.txt b/japanese_ftparser/CMakeLists.txt new file mode 100644 index 0000000..bb2ccf4 --- /dev/null +++ b/japanese_ftparser/CMakeLists.txt @@ -0,0 +1,65 @@
CMAKE_MINIMUM_REQUIRED(VERSION 3.22)

# Set plugin name
SET(PLUGIN_NAME japanese_ftparser)

# Source files
SET(SOURCES
    japanese_ftparser_main.cpp
    japanese_jni_bridge.cpp
)

# Project configuration
PROJECT(${PLUGIN_NAME}
    DESCRIPTION "Japanese ftparser plugin JNI management"
    HOMEPAGE_URL "https://open.oceanbase.com/"
    LANGUAGES CXX C ASM)

# Find required packages
FIND_PACKAGE(ObPlugin REQUIRED)
FIND_PACKAGE(JNI REQUIRED COMPONENTS JVM)

# Use OB_ADD_PLUGIN macro (provided by ObPlugin)
OB_ADD_PLUGIN(${PLUGIN_NAME}
    ${SOURCES}
)

# Add include directories for common JNI library
TARGET_INCLUDE_DIRECTORIES(${PLUGIN_NAME} PRIVATE
    ${CMAKE_CURRENT_SOURCE_DIR}/../common/liboceanbase_jni_common
    ${JNI_INCLUDE_DIRS}
)

# Link libraries
TARGET_LINK_LIBRARIES(${PLUGIN_NAME} PRIVATE
    ${JNI_LIBRARIES}
)

# Add link directory for common JNI library
TARGET_LINK_DIRECTORIES(${PLUGIN_NAME} PRIVATE
    ${CMAKE_CURRENT_SOURCE_DIR}/../common/liboceanbase_jni_common/build
)

# Link the common JNI library
TARGET_LINK_LIBRARIES(${PLUGIN_NAME} PRIVATE oceanbase_jni_common)

# Set C++ standard and RPATH
SET_TARGET_PROPERTIES(${PLUGIN_NAME} PROPERTIES
    CXX_STANDARD 11
    CXX_STANDARD_REQUIRED ON
    CXX_VISIBILITY_PRESET default
    # RPATH settings: use $ORIGIN to find libraries in the same directory as the plugin
    BUILD_RPATH "\$ORIGIN:${CMAKE_CURRENT_SOURCE_DIR}/../common/liboceanbase_jni_common/build"
    INSTALL_RPATH "\$ORIGIN"
    BUILD_WITH_INSTALL_RPATH FALSE
)

# Add custom target to ensure the common library is built first.
# The configure step (-S/-B) runs before the build step so that a fresh checkout works:
# `cmake --build` fails on a directory that has never been configured, and the plugin
# README only asks the user to configure/build the plugin itself. Re-running the
# configure step on an already configured tree is a cheap no-op.
ADD_CUSTOM_TARGET(build_common_jni_lib
    COMMAND ${CMAKE_COMMAND} -S ${CMAKE_CURRENT_SOURCE_DIR}/../common/liboceanbase_jni_common -B ${CMAKE_CURRENT_SOURCE_DIR}/../common/liboceanbase_jni_common/build
    COMMAND ${CMAKE_COMMAND} --build ${CMAKE_CURRENT_SOURCE_DIR}/../common/liboceanbase_jni_common/build --parallel
    WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/../common/liboceanbase_jni_common
    COMMENT "Building common JNI library"
)

# Make plugin depend on common library
ADD_DEPENDENCIES(${PLUGIN_NAME} build_common_jni_lib)
\ No newline at end of file diff --git a/japanese_ftparser/README.en-US.md b/japanese_ftparser/README.en-US.md new file mode 100644 index 0000000..f2baa4c --- /dev/null +++ b/japanese_ftparser/README.en-US.md @@ -0,0 +1,230 @@ +# OceanBase Japanese Fulltext Parser Plugin + +A Japanese fulltext parser plugin for OceanBase. It uses JNI bridge to call Java tokenization libraries (integrated with Apache Lucene JapaneseAnalyzer/Kuromoji). + +## Features + +- ✅ Compatible with OceanBase FTParser interface (`japanese_ftparser_main.cpp`) +- ✅ JNI integration for Java Japanese tokenization (Lucene JapaneseAnalyzer/Kuromoji) +- ✅ UTF-8 multi-byte Japanese text processing +- ✅ Extensible: Java tokenization implementation can be replaced + +## Build + +### Environment Setup + +1. Install basic build tools +```bash +yum install -y git cmake make glibc-devel glibc-headers gcc gcc-c++ +``` +This command installs the gcc development environment. + +> Skip this step if your environment already has these tools + +2. Install OceanBase Plugin Development Kit +```bash +yum install -y oceanbase-plugin-dev-kit +``` + +### Compilation + +```bash +# Choose your workspace directory +cd `your/workspace` +# Download source code +git clone https://github.com/oceanbase/oceanbase-plugins +# Build +cd oceanbase-plugins/japanese_ftparser +mkdir build +cd build +cmake .. +make +``` +You will see the libjapanese_ftparser.so file in the build directory. This is the dynamic library plugin. + +## Quick Start + +### Deployment and Installation + +**Recommended method**: Copy .class files and jar files to corresponding locations separately + +```bash +# 1. Copy plugin dynamic library +cp /path/to/yourplugindirpath/libjapanese_ftparser.so /path/to/observer/plugin_dir/ + +# 2. 
Create java directory structure (if not exists) +mkdir -p /path/to/observer/java/lib + +# 3. Copy Lucene dependency libraries +cp java/lib/lucene-core-8.11.2.jar /path/to/observer/java/lib/ +cp java/lib/lucene-analyzers-common-8.11.2.jar /path/to/observer/java/lib/ +cp java/lib/lucene-analyzers-kuromoji-8.11.2.jar /path/to/observer/java/lib/ + +# 4. Copy Japanese segmenter class file +cp java/JapaneseSegmenter.class /path/to/observer/java/ + +# 5. Install Java environment +yum install java-1.8.0-openjdk-devel -y + +# 6. Start Observer and load plugin +# Connect to database +obclient -h127.0.0.1 -P2881 -uroot@sys -pdifyai123456 # Example with dify database connection info + +# Set plugin loading in sys tenant +ALTER SYSTEM SET plugins_load='libjapanese_ftparser.so:on'; + +# Restart Observer to take effect +killall observer +cd /path/to/observer +./bin/observer # Start observer in the observer working directory + +# Verify installation (see below) +``` + +**Multi-plugin Coexistence Notes**: +- If other language parser plugins are already installed, only copy Japanese-specific jar files and .class files +- `lucene-core-8.11.2.jar` and `lucene-analyzers-common-8.11.2.jar` are shared by all plugins +- `lucene-analyzers-kuromoji-8.11.2.jar` is only needed by the Japanese parser +- When files already exist, cp command will ask for overwrite confirmation, you can choose to skip + + +> 📖 **Detailed Plugin Usage**: Refer to [OceanBase Plugin Development Kit User Manual](https://oceanbase.github.io/oceanbase-plugin-dev-kit/user-guide/) + +### Dependency Search Priority + +The plugin automatically searches for Java dependencies in the following priority order: + +1. **Environment Variable** (Highest Priority) + ```bash + export OCEANBASE_PARSER_CLASSPATH="/custom/path/lucene-core-8.11.2.jar:/custom/path/lucene-analyzers-common-8.11.2.jar:/custom/path/lucene-analyzers-kuromoji-8.11.2.jar:/custom/path" + ``` + +2. 
**Observer Working Directory** (Recommended) + ``` + ${OB_WORKDIR}/java/lib/lucene-core-8.11.2.jar + ${OB_WORKDIR}/java/lib/lucene-analyzers-common-8.11.2.jar + ${OB_WORKDIR}/java/lib/lucene-analyzers-kuromoji-8.11.2.jar + ${OB_WORKDIR}/java/JapaneseSegmenter.class + ``` + +3. **Plugin Relative Path** (Development Environment) + ``` + ./java/lib/lucene-*.jar + ``` + +**Recommendation**: Use method 2 (copy java directory), no need to configure OCEANBASE_PARSER_CLASSPATH for quick experience + +### Installation Verification + +```sql +-- Check if plugin is loaded successfully +SELECT * FROM oceanbase.GV$OB_PLUGINS WHERE NAME = 'japanese_ftparser'; + +-- Create test table (ensure shell character encoding is UTF-8) +CREATE TABLE t_japanese ( + c1 INT, + c2 VARCHAR(200), + c3 TEXT, + FULLTEXT INDEX (c2, c3) WITH PARSER japanese_ftparser +); + +-- Insert Japanese test data +INSERT INTO t_japanese (c1, c2, c3) VALUES +(1, 'こんにちは', 'こんにちは、私たちのウェブサイトへようこそ'), +(2, 'ありがとう', 'ご訪問いただきありがとうございます'), +(3, 'お問い合わせ', 'ご質問がございましたら、営業日にお気軽にお問い合わせください'), +(4, 'ありがとうございます', 'サービスをご利用いただきありがとうございます'), +(5, 'ようこそ', 'OceanBaseへようこそ'), +(6, 'こんにちは', 'こんにちは、またお会いできることを楽しみにしています'), +(7, 'いかがですか', '最近いかがお過ごしでしょうか'), +(8, '問題ありません', '何も問題ありません'), +(9, 'フォーム入力', '情報を完全に入力してください'), +(10, 'ありがとうございました', 'ありがとうございました、また将来お会いできることを願っています'), +(11, 'こんにちは', 'こんにちは、こんにちは'), +(12, '繰り返しテスト', 'パーサーが重複する単語を適切に処理するかテストします'), +(13, '何でもいい', 'あなたが必要とするものは何でも'), +(14, '誰も理解しない', '何が起こっているのか誰も理解していません'), +(15, '通常通り', 'すべて通常通りです'), +(16, '2025年は良い年', '2025年は開発にとって良い年です'), +(17, '2025年ありがとう', '2025年ありがとうございます'), +(18, 'OceanBaseデータベース', 'OceanBaseデータベースを選ぶ理由'); + +-- Test fulltext search functionality +SELECT TOKENIZE('ご質問がございましたら、営業日にお気軽にお問い合わせください','japanese_ftparser', '[{"output": "all"}]'); + +-- Test 1: Single word matching (expected to return c1 = 3) +SELECT * FROM t_japanese +WHERE MATCH(c2, c3) AGAINST('気軽' IN NATURAL LANGUAGE MODE); + +-- Test 2: Multiple word matching (expected to return c1 = 1, 5, 6, 
11) +SELECT * FROM t_japanese +WHERE MATCH(c2, c3) AGAINST('こんにちは ようこそ' IN NATURAL LANGUAGE MODE); + +-- Test 3: Stopword test (should return no results if "の" is a stopword) +SELECT * FROM t_japanese +WHERE MATCH(c2, c3) AGAINST('の' IN NATURAL LANGUAGE MODE); + +-- Test 4: Number + Japanese mixed (expected to return c1 = 16, 17) +SELECT * FROM t_japanese +WHERE MATCH(c2, c3) AGAINST('2025' IN NATURAL LANGUAGE MODE); + +-- Test 5: Repeated words (expected to return c1 = 12) +SELECT * FROM t_japanese +WHERE MATCH(c2, c3) AGAINST('テスト' IN NATURAL LANGUAGE MODE); + +-- Test 6: Fuzzy matching (expected to return c1 = 2, 4, 10, 17) +SELECT * FROM t_japanese +WHERE MATCH(c2, c3) AGAINST('ありがとう' IN NATURAL LANGUAGE MODE); + +-- Test 7: Verify key Japanese sentence search (expected to return c1 = 18) +SELECT * FROM t_japanese +WHERE MATCH(c2, c3) AGAINST('理由' IN NATURAL LANGUAGE MODE); +``` + +## Technical Specifications + +### ES Complete Solution + +The Japanese tokenizer adopts the Elasticsearch complete solution: + +``` +Configuration: CustomAnalyzer.builder() + .withTokenizer("japanese") // kuromoji_tokenizer + .addTokenFilter("japaneseBaseForm") // kuromoji_baseform + .addTokenFilter("japanesePartOfSpeechStop") // kuromoji_part_of_speech + .addTokenFilter("cjkWidth") // cjk_width + .addTokenFilter("lowercase") // lowercase + .addTokenFilter("stop") // ja_stop + +Features: +- BaseForm stemming: Unifies verb/adjective inflections to their base forms +- Stopword removal: Removes functional words like particles and pronouns +- Character width normalization: Unifies full-width and half-width characters +- Lowercase conversion: Unifies alphabet cases +``` + +### Alignment with Dify Configuration + +This plugin is fully aligned with Dify's Elasticsearch Japanese configuration: + +```json +{ + "analysis": { + "analyzer": { + "ja_analyzer": { + "type": "custom", + "tokenizer": "kuromoji_tokenizer", + "filter": [ + "kuromoji_baseform", // ✅ Implemented + 
"kuromoji_part_of_speech", // ✅ Implemented + "ja_stop", // ✅ Implemented + "kuromoji_number", // Future enhancement + "kuromoji_stemmer" // Future enhancement + ] + } + } + } +} +``` + +**A Japanese tokenization solution optimized for database fulltext search**. diff --git a/japanese_ftparser/README.ja-JP.md b/japanese_ftparser/README.ja-JP.md new file mode 100644 index 0000000..578be7b --- /dev/null +++ b/japanese_ftparser/README.ja-JP.md @@ -0,0 +1,230 @@ +# OceanBase 日本語全文解析プラグイン(Japanese Fulltext Parser Plugin) + +OceanBase向けの日本語全文解析プラグインです。JNIブリッジを使用してJava分かち書きライブラリ(Apache Lucene JapaneseAnalyzer/Kuromoji統合済み)を呼び出します。 + +## 機能特性 + +- ✅ OceanBase FTParser インターフェース対応(`japanese_ftparser_main.cpp`) +- ✅ JNI統合によるJava日本語分かち書き(Lucene JapaneseAnalyzer/Kuromoji) +- ✅ UTF-8マルチバイト日本語処理 +- ✅ 拡張可能:Java分かち書き実装の置き換え可能 + +## コンパイル + +### 環境準備 + +1. 基本コンパイル環境のインストール +```bash +yum install -y git cmake make glibc-devel glibc-headers gcc gcc-c++ +``` +このコマンドでgcc開発環境がインストールされます。 + +> 環境が既に整っている場合はこの手順をスキップできます + +2. OceanBase プラグイン開発キットのインストール +```bash +yum install -y oceanbase-plugin-dev-kit +``` + +### コンパイル + +```bash +# 作業ディレクトリを選択 +cd `your/workspace` +# ソースコードをダウンロード +git clone https://github.com/oceanbase/oceanbase-plugins +# コンパイル +cd oceanbase-plugins/japanese_ftparser +mkdir build +cd build +cmake .. +make +``` +buildディレクトリに`libjapanese_ftparser.so`ファイルが作成されます。これが動的ライブラリプラグインです。 + +## クイックスタート + +### デプロイとインストール + +**推奨方法**:.classファイルとjarファイルをそれぞれ適切な場所にコピー + +```bash +# 1. プラグイン動的ライブラリをコピー +cp /path/to/yourplugindirpath/libjapanese_ftparser.so /path/to/observer/plugin_dir/ + +# 2. javaディレクトリ構造を作成(存在しない場合) +mkdir -p /path/to/observer/java/lib + +# 3. Lucene依存ライブラリをコピー +cp java/lib/lucene-core-8.11.2.jar /path/to/observer/java/lib/ +cp java/lib/lucene-analyzers-common-8.11.2.jar /path/to/observer/java/lib/ +cp java/lib/lucene-analyzers-kuromoji-8.11.2.jar /path/to/observer/java/lib/ + +# 4. 
日本語分かち書きクラスファイルをコピー +cp java/JapaneseSegmenter.class /path/to/observer/java/ + +# 5. Java環境をインストール +yum install java-1.8.0-openjdk-devel -y + +# 6. Observer を起動してプラグインを読み込み +# データベースに接続 +obclient -h127.0.0.1 -P2881 -uroot@sys -pdifyai123456 # difyのデータベース接続情報の例 + +# sys テナントでプラグイン読み込みを設定 +ALTER SYSTEM SET plugins_load='libjapanese_ftparser.so:on'; + +# Observer を再起動して有効化 +killall observer +cd /path/to/observer +./bin/observer # observerワーキングディレクトリでobserverを起動 + +# インストール確認(下記参照) +``` + +**マルチプラグイン共存説明**: +- 他の言語分かち書きプラグインが既にインストールされている場合は、日本語専用のjarファイルと.classファイルのみをコピー +- `lucene-core-8.11.2.jar` と `lucene-analyzers-common-8.11.2.jar` は全プラグインで共有 +- `lucene-analyzers-kuromoji-8.11.2.jar` は日本語分かち書きのみで必要 +- ファイルが既に存在する場合、cpコマンドが上書きを確認するので、スキップを選択可能 + + +> 📖 **詳細なプラグイン使用説明**:[OceanBase Plugin Development Kit ユーザーマニュアル](https://oceanbase.github.io/oceanbase-plugin-dev-kit/user-guide/) を参照 + +### 依存関係の検索優先順位 + +プラグインは以下の優先順位でJava依存関係を自動検索します: + +1. **環境変数**(最高優先度) + ```bash + export OCEANBASE_PARSER_CLASSPATH="/custom/path/lucene-core-8.11.2.jar:/custom/path/lucene-analyzers-common-8.11.2.jar:/custom/path/lucene-analyzers-kuromoji-8.11.2.jar:/custom/path" + ``` + +2. **Observer ワーキングディレクトリ**(推奨) + ``` + ${OB_WORKDIR}/java/lib/lucene-core-8.11.2.jar + ${OB_WORKDIR}/java/lib/lucene-analyzers-common-8.11.2.jar + ${OB_WORKDIR}/java/lib/lucene-analyzers-kuromoji-8.11.2.jar + ${OB_WORKDIR}/java/JapaneseSegmenter.class + ``` + +3. 
**プラグイン相対パス**(開発環境) + ``` + ./java/lib/lucene-*.jar + ``` + +**推奨**: 方式2(javaディレクトリのコピー)を使用し、`OCEANBASE_PARSER_CLASSPATH`の設定不要でクイック体験が可能 + +### インストール確認 + +```sql +-- プラグインが正常に読み込まれているかチェック +SELECT * FROM oceanbase.GV$OB_PLUGINS WHERE NAME = 'japanese_ftparser'; + +-- テストテーブル作成(シェルの文字セットエンコーディングはUTF-8を使用) +CREATE TABLE t_japanese ( + c1 INT, + c2 VARCHAR(200), + c3 TEXT, + FULLTEXT INDEX (c2, c3) WITH PARSER japanese_ftparser +); + +-- 日本語テストデータの挿入 +INSERT INTO t_japanese (c1, c2, c3) VALUES +(1, 'こんにちは', 'こんにちは、私たちのウェブサイトへようこそ'), +(2, 'ありがとう', 'ご訪問いただきありがとうございます'), +(3, 'お問い合わせ', 'ご質問がございましたら、営業日にお気軽にお問い合わせください'), +(4, 'ありがとうございます', 'サービスをご利用いただきありがとうございます'), +(5, 'ようこそ', 'OceanBaseへようこそ'), +(6, 'こんにちは', 'こんにちは、またお会いできることを楽しみにしています'), +(7, 'いかがですか', '最近いかがお過ごしでしょうか'), +(8, '問題ありません', '何も問題ありません'), +(9, 'フォーム入力', '情報を完全に入力してください'), +(10, 'ありがとうございました', 'ありがとうございました、また将来お会いできることを願っています'), +(11, 'こんにちは', 'こんにちは、こんにちは'), +(12, '繰り返しテスト', 'パーサーが重複する単語を適切に処理するかテストします'), +(13, '何でもいい', 'あなたが必要とするものは何でも'), +(14, '誰も理解しない', '何が起こっているのか誰も理解していません'), +(15, '通常通り', 'すべて通常通りです'), +(16, '2025年は良い年', '2025年は開発にとって良い年です'), +(17, '2025年ありがとう', '2025年ありがとうございます'), +(18, 'OceanBaseデータベース', 'OceanBaseデータベースを選ぶ理由'); + +-- 全文検索機能のテスト +SELECT TOKENIZE('ご質問がございましたら、営業日にお気軽にお問い合わせください','japanese_ftparser', '[{"output": "all"}]'); + +-- テスト 1:単語マッチング(c1 = 3 が返される予定) +SELECT * FROM t_japanese +WHERE MATCH(c2, c3) AGAINST('気軽' IN NATURAL LANGUAGE MODE); + +-- テスト 2:複数語マッチング(c1 = 1, 5, 6, 11 が返される予定) +SELECT * FROM t_japanese +WHERE MATCH(c2, c3) AGAINST('こんにちは ようこそ' IN NATURAL LANGUAGE MODE); + +-- テスト 3:ストップワードテスト(「の」がストップワードの場合は結果なし) +SELECT * FROM t_japanese +WHERE MATCH(c2, c3) AGAINST('の' IN NATURAL LANGUAGE MODE); + +-- テスト 4:数字 + 日本語混合(c1 = 16, 17 が返される予定) +SELECT * FROM t_japanese +WHERE MATCH(c2, c3) AGAINST('2025' IN NATURAL LANGUAGE MODE); + +-- テスト 5:重複語句(c1 = 12 が返される予定) +SELECT * FROM t_japanese +WHERE MATCH(c2, c3) AGAINST('テスト' IN NATURAL LANGUAGE MODE); + +-- テスト 
6:あいまいマッチング(c1 = 2, 4, 10, 17 が返される予定) +SELECT * FROM t_japanese +WHERE MATCH(c2, c3) AGAINST('ありがとう' IN NATURAL LANGUAGE MODE); + +-- テスト 7:重要な日本語文検索確認(c1 = 18 が返される予定) +SELECT * FROM t_japanese +WHERE MATCH(c2, c3) AGAINST('理由' IN NATURAL LANGUAGE MODE); +``` + +## 技術仕様 + +### ES完全ソリューション + +日本語分かち書きはElasticsearch完全ソリューションを採用: + +``` +設定: CustomAnalyzer.builder() + .withTokenizer("japanese") // kuromoji_tokenizer + .addTokenFilter("japaneseBaseForm") // kuromoji_baseform + .addTokenFilter("japanesePartOfSpeechStop") // kuromoji_part_of_speech + .addTokenFilter("cjkWidth") // cjk_width + .addTokenFilter("lowercase") // lowercase + .addTokenFilter("stop") // ja_stop + +特徴: +- BaseForm語幹抽出: 動詞・形容詞の活用形を原形に統一 +- ストップワード除去: 助詞・代名詞等の機能語を除去 +- 文字幅正規化: 全角・半角の統一 +- 小文字変換: アルファベットの統一 +``` + +### Dify設定との整合 + +このプラグインはDifyのElasticsearch日本語設定と完全に整合: + +```json +{ + "analysis": { + "analyzer": { + "ja_analyzer": { + "type": "custom", + "tokenizer": "kuromoji_tokenizer", + "filter": [ + "kuromoji_baseform", // ✅ 実装済み + "kuromoji_part_of_speech", // ✅ 実装済み + "ja_stop", // ✅ 実装済み + "kuromoji_number", // 将来拡張予定 + "kuromoji_stemmer" // 将来拡張予定 + ] + } + } + } +} +``` + +**データベース全文検索に最適化された日本語分かち書きソリューション**です。 diff --git a/japanese_ftparser/README.zh-CN.md b/japanese_ftparser/README.zh-CN.md new file mode 100644 index 0000000..9802bc6 --- /dev/null +++ b/japanese_ftparser/README.zh-CN.md @@ -0,0 +1,180 @@ +# OceanBase 日语全文解析插件(Japanese Fulltext Parser Plugin) + +面向 OceanBase 的日语全文解析插件。核心使用 JNI 桥接 Java 分词库(已集成 Apache Lucene JapaneseAnalyzer/Kuromoji)。 + +## 功能特性 + +- ✅ 兼容 OceanBase FTParser 接口(`japanese_ftparser_main.cpp`) +- ✅ JNI 集成调用 Java 日语分词(Lucene JapaneseAnalyzer/Kuromoji) +- ✅ UTF-8 多字节日文处理 +- ✅ 可扩展:可替换 Java 分词实现 + +## 编译 + +### 环境准备 + +1. 安装编译基础 +```bash +yum install -y git cmake make glibc-devel glibc-headers gcc gcc-c++ +``` +该命令将会安装gcc开发环境。 + +> 如果你的环境已经具备可以跳过当前步骤 + +2. 
安装OceanBase 插件开发套件 +```bash +yum install -y oceanbase-plugin-dev-kit +``` + +### 编译 + +```bash +# 选择一个你自己的工作目录 +cd `your/workspace` +# 下载源码 +git clone https://github.com/oceanbase/oceanbase-plugins +# 编译 +cd oceanbase-plugins/japanese_ftparser +mkdir build +cd build +cmake .. +make +``` +你将会在build目录下看到libjapanese_ftparser.so文件,这个就是动态库插件。 + +## 快速开始 + +### 部署安装 + +**推荐方法**:分别复制.class文件和jar文件到对应位置 + +```bash +# 1. 复制插件动态库 +cp /path/to/yourplugindirpath/libjapanese_ftparser.so /path/to/observer/plugin_dir/ + +# 2. 创建java目录结构(如果不存在) +mkdir -p /path/to/observer/java/lib + +# 3. 复制Lucene依赖库 +cp java/lib/lucene-core-8.11.2.jar /path/to/observer/java/lib/ +cp java/lib/lucene-analyzers-common-8.11.2.jar /path/to/observer/java/lib/ +cp java/lib/lucene-analyzers-kuromoji-8.11.2.jar /path/to/observer/java/lib/ + +# 4. 复制日语分词器类文件 +cp java/JapaneseSegmenter.class /path/to/observer/java/ + +# 5. 安装Java环境 +yum install java-1.8.0-openjdk-devel -y + +# 6. 启动 Observer 并加载插件 +# 连接数据库 +obclient -h127.0.0.1 -P2881 -uroot@sys -pdifyai123456 # 这里以dify的数据库连接信息为例 + +# 在sys租户中设置插件加载 +ALTER SYSTEM SET plugins_load='libjapanese_ftparser.so:on'; + +# 重启Observer生效 +killall observer +cd /path/to/observer +./bin/observer # 在observer工作目录执行启动observer + +# 验证安装(见下文) +``` + +**多插件共存说明**: +- 如果已安装其他语言分词插件,只需复制日语专用的jar包和.class文件 +- `lucene-core-8.11.2.jar` 和 `lucene-analyzers-common-8.11.2.jar` 被所有插件共享 +- `lucene-analyzers-kuromoji-8.11.2.jar` 仅日语分词器需要 +- 文件已存在时,cp命令会询问是否覆盖,可选择跳过 + + +> 📖 **详细插件使用说明**:参考 [OceanBase Plugin Development Kit 用户手册](https://oceanbase.github.io/oceanbase-plugin-dev-kit/user-guide/) + +### 依赖寻找优先级 + +插件按以下优先级自动寻找Java依赖: + +1. **环境变量**(最高优先级) + ```bash + export OCEANBASE_PARSER_CLASSPATH="/custom/path/lucene-core-8.11.2.jar:/custom/path/lucene-analyzers-common-8.11.2.jar:/custom/path/lucene-analyzers-kuromoji-8.11.2.jar:/custom/path" + ``` + +2. 
**Observer 工作目录**(推荐) + ``` + ${OB_WORKDIR}/java/lib/lucene-core-8.11.2.jar + ${OB_WORKDIR}/java/lib/lucene-analyzers-common-8.11.2.jar + ${OB_WORKDIR}/java/lib/lucene-analyzers-kuromoji-8.11.2.jar + ${OB_WORKDIR}/java/JapaneseSegmenter.class + ``` + +3. **插件相对路径**(开发环境) + ``` + ./java/lib/lucene-*.jar + ``` + +**建议**:使用方式2(复制java目录),无需配置OCEANBASE_PARSER_CLASSPATH,快速体验 + +### 验证安装 + +```sql +-- 检查插件是否加载成功 +SELECT * FROM oceanbase.GV$OB_PLUGINS WHERE NAME = 'japanese_ftparser'; + +-- 创建测试表(注意shell的字符集编码用UTF-8) +CREATE TABLE t_japanese ( + c1 INT, + c2 VARCHAR(200), + c3 TEXT, + FULLTEXT INDEX (c2, c3) WITH PARSER japanese_ftparser +); + +-- 插入日语测试数据 +INSERT INTO t_japanese (c1, c2, c3) VALUES +(1, 'こんにちは', 'こんにちは、私たちのウェブサイトへようこそ'), +(2, 'ありがとう', 'ご訪問いただきありがとうございます'), +(3, 'お問い合わせ', 'ご質問がございましたら、営業日にお気軽にお問い合わせください'), +(4, 'ありがとうございます', 'サービスをご利用いただきありがとうございます'), +(5, 'ようこそ', 'OceanBaseへようこそ'), +(6, 'こんにちは', 'こんにちは、またお会いできることを楽しみにしています'), +(7, 'いかがですか', '最近いかがお過ごしでしょうか'), +(8, '問題ありません', '何も問題ありません'), +(9, 'フォーム入力', '情報を完全に入力してください'), +(10, 'ありがとうございました', 'ありがとうございました、また将来お会いできることを願っています'), +(11, 'こんにちは', 'こんにちは、こんにちは'), +(12, '繰り返しテスト', 'パーサーが重複する単語を適切に処理するかテストします'), +(13, '何でもいい', 'あなたが必要とするものは何でも'), +(14, '誰も理解しない', '何が起こっているのか誰も理解していません'), +(15, '通常通り', 'すべて通常通りです'), +(16, '2025年は良い年', '2025年は開発にとって良い年です'), +(17, '2025年ありがとう', '2025年ありがとうございます'), +(18, 'OceanBaseデータベース', 'OceanBaseデータベースを選ぶ理由'); +-- 测试全文搜索功能 +SELECT TOKENIZE('ご質問がございましたら、営業日にお気軽にお問い合わせください','japanese_ftparser', '[{"output": "all"}]'); +-- 测试 1:匹配单个词(预计返回 c1 = 3) +SELECT * FROM t_japanese +WHERE MATCH(c2, c3) AGAINST('気軽' IN NATURAL LANGUAGE MODE); + +-- 测试 2:匹配多个词(预计返回 c1 = 1, 5, 6, 11) +SELECT * FROM t_japanese +WHERE MATCH(c2, c3) AGAINST('こんにちは ようこそ' IN NATURAL LANGUAGE MODE); + +-- 测试 3:停用词测试(如果 "の" 是停用词,应该无结果) +SELECT * FROM t_japanese +WHERE MATCH(c2, c3) AGAINST('の' IN NATURAL LANGUAGE MODE); + +-- 测试 4:数字 + 日语混合(预计返回 c1 = 16, 17) +SELECT * FROM t_japanese +WHERE MATCH(c2, c3) 
AGAINST('2025' IN NATURAL LANGUAGE MODE); + +-- 测试 5:重复词语(预计返回 c1 = 12) +SELECT * FROM t_japanese +WHERE MATCH(c2, c3) AGAINST('テスト' IN NATURAL LANGUAGE MODE); + +-- 测试 6:模糊匹配(预计返回 c1 = 2, 4, 10, 17) +SELECT * FROM t_japanese +WHERE MATCH(c2, c3) AGAINST('ありがとう' IN NATURAL LANGUAGE MODE); + +-- 测试 7:验证关键日语句子搜索(预计返回 c1 = 18) +SELECT * FROM t_japanese +WHERE MATCH(c2, c3) AGAINST('理由' IN NATURAL LANGUAGE MODE); +``` \ No newline at end of file diff --git a/japanese_ftparser/japanese_ftparser_main.cpp b/japanese_ftparser/japanese_ftparser_main.cpp new file mode 100644 index 0000000..9074ad9 --- /dev/null +++ b/japanese_ftparser/japanese_ftparser_main.cpp @@ -0,0 +1,53 @@ +/** + * Copyright (c) 2023 OceanBase + * Japanese Fulltext Parser Plugin - Main Entry Point + */ + +#include "japanese_jni_bridge.h" + +extern "C" { + +/** + * Plugin initialization function + * @param plugin Plugin parameter pointer + * @return OBP_SUCCESS on success, error code on failure + */ +int plugin_init_jp(ObPluginParamPtr plugin) +{ + int ret = OBP_SUCCESS; + + if (0 == plugin) { + ret = OBP_INVALID_ARGUMENT; + return ret; + } + + // Define the ftparser plugin descriptor + ObPluginFTParser parser = { + .init = japanese_ftparser_init, + .deinit = japanese_ftparser_deinit, + .scan_begin = japanese_ftparser_scan_begin, + .scan_end = japanese_ftparser_scan_end, + .next_token = japanese_ftparser_next_token, + .get_add_word_flag = japanese_ftparser_get_add_word_flag + }; + + // Register the ftparser plugin with OceanBase + ret = OBP_REGISTER_FTPARSER(plugin, + "japanese_ftparser", + parser, + "Japanese language fulltext parser"); + + return ret; +} + +// Plugin declaration +OBP_DECLARE_PLUGIN(japanese_ftparser) +{ + OBP_AUTHOR_OCEANBASE, // Plugin author + OBP_MAKE_VERSION(0, 0, 1), // Plugin version + OBP_LICENSE_MULAN_PSL_V2, // Plugin license + plugin_init_jp, // Plugin initialization function + nullptr, // Plugin deinitialization function (not needed) +} OBP_DECLARE_PLUGIN_END; + +} // extern 
"C" diff --git a/japanese_ftparser/japanese_jni_bridge.cpp b/japanese_ftparser/japanese_jni_bridge.cpp new file mode 100644 index 0000000..f2d3573 --- /dev/null +++ b/japanese_ftparser/japanese_jni_bridge.cpp @@ -0,0 +1,393 @@ +/** + * Copyright (c) 2023 OceanBase + * Japanese Fulltext Parser Plugin + */ + +#include "japanese_jni_bridge.h" +#include +#include +#include +#include +#include + +namespace oceanbase { +namespace japanese_ftparser { + +// Configuration implementation +JapaneseJNIBridgeConfig::JapaneseJNIBridgeConfig() + : segmenter_class_name("JapaneseSegmenter") + , segment_method_name("segment") { + // JVM configurations are now managed by JNIConfigUtils in common library +} + +// JapaneseJNIBridge implementation +JapaneseJNIBridge::JapaneseJNIBridge() + : plugin_name_("japanese_ftparser") + , is_initialized_(false) + , segmenter_class_(nullptr) + , segment_method_(nullptr) { + clear_error(); +} + +JapaneseJNIBridge::~JapaneseJNIBridge() { + if (is_initialized_) { + // Unregister from global JVM manager + oceanbase::jni::GlobalJVMManager::unregister_plugin(plugin_name_); + is_initialized_ = false; + } +} + +int JapaneseJNIBridge::initialize() { + std::lock_guard lock(bridge_mutex_); + + if (is_initialized_) { + return OBP_SUCCESS; + } + + clear_error(); + + // Register with global JVM manager + oceanbase::jni::GlobalJVMManager::register_plugin(plugin_name_); + + // Create scoped JNI environment using unified configuration from common library + oceanbase::jni::ScopedJNIEnvironment jni_env(plugin_name_); + + if (!jni_env) { + set_error(OBP_PLUGIN_ERROR, "Failed to acquire JNI environment for initialization"); + oceanbase::jni::GlobalJVMManager::unregister_plugin(plugin_name_); + return OBP_PLUGIN_ERROR; + } + + // Load Java classes and cache method IDs + int ret = load_java_classes(jni_env.get()); + if (ret != OBP_SUCCESS) { + oceanbase::jni::GlobalJVMManager::unregister_plugin(plugin_name_); + return ret; + } + + is_initialized_ = true; + 
OBP_LOG_INFO("Simplified JNI Bridge initialized successfully"); + return OBP_SUCCESS; +} + +int JapaneseJNIBridge::segment(const std::string& text, std::vector& tokens) { + if (!is_initialized_) { + set_error(OBP_PLUGIN_ERROR, "Bridge not initialized"); + return OBP_PLUGIN_ERROR; + } + + clear_error(); + + // Create scoped JNI environment for this operation + oceanbase::jni::ScopedJNIEnvironment jni_env(plugin_name_); + + if (!jni_env) { + set_error(OBP_PLUGIN_ERROR, "Failed to acquire JNI environment for segmentation"); + return OBP_PLUGIN_ERROR; + } + + return do_segment(jni_env.get(), text, tokens); +} + +int JapaneseJNIBridge::load_java_classes(JNIEnv* env) { + if (!env) { + set_error(OBP_PLUGIN_ERROR, "JNI environment is null"); + return OBP_PLUGIN_ERROR; + } + + // Find segmenter class + segmenter_class_ = env->FindClass(config_.segmenter_class_name.c_str()); + std::string error_msg; + if (!segmenter_class_ || oceanbase::jni::JNIUtils::check_and_handle_exception(env, error_msg)) { + std::string msg = "Cannot find Java class: " + config_.segmenter_class_name; + if (!error_msg.empty()) { + msg += " (" + error_msg + ")"; + } + set_error(OBP_PLUGIN_ERROR, msg); + return OBP_PLUGIN_ERROR; + } + + // Make it a global reference to prevent GC + segmenter_class_ = (jclass)env->NewGlobalRef(segmenter_class_); + if (!segmenter_class_) { + set_error(OBP_PLUGIN_ERROR, "Failed to create global reference for segmenter class"); + return OBP_PLUGIN_ERROR; + } + + // OPTIMIZED: Get static segment method ID instead of constructor and instance method + std::string segment_signature = "(Ljava/lang/String;)[Ljava/lang/String;"; + segment_method_ = env->GetStaticMethodID(segmenter_class_, + config_.segment_method_name.c_str(), + segment_signature.c_str()); + if (!segment_method_ || oceanbase::jni::JNIUtils::check_and_handle_exception(env, error_msg)) { + std::string msg = "Cannot find STATIC method: " + config_.segment_method_name + + " with signature: " + segment_signature; + if 
(!error_msg.empty()) { + msg += " (" + error_msg + ")"; + } + set_error(OBP_PLUGIN_ERROR, msg); + return OBP_PLUGIN_ERROR; + } + + // OPTIMIZED: No constructor method needed for static approach + + OBP_LOG_INFO("Java classes loaded successfully"); + return OBP_SUCCESS; +} + +int JapaneseJNIBridge::do_segment(JNIEnv* env, const std::string& text, std::vector& tokens) { + if (!env) { + set_error(OBP_PLUGIN_ERROR, "JNI environment is null"); + return OBP_PLUGIN_ERROR; + } + + // Convert C++ string to Java string + jstring jtext = oceanbase::jni::JNIUtils::cpp_string_to_jstring(env, text); + if (!jtext) { + set_error(OBP_PLUGIN_ERROR, "Failed to convert text to Java string"); + return OBP_PLUGIN_ERROR; + } + + // OPTIMIZED: Call static Java segmentation method - no instance creation! + jobjectArray jresult = (jobjectArray)env->CallStaticObjectMethod( + segmenter_class_, segment_method_, jtext); + + // Check for Java exceptions + std::string error_msg; + if (oceanbase::jni::JNIUtils::check_and_handle_exception(env, error_msg)) { + std::string msg = "Static Java segmentation method threw exception"; + if (!error_msg.empty()) { + msg += " (" + error_msg + ")"; + } + set_error(OBP_PLUGIN_ERROR, msg); + // Clean up local references before returning + // Note: jresult is nullptr when exception occurs, no need to clean it + env->DeleteLocalRef(jtext); + return OBP_PLUGIN_ERROR; + } + + if (!jresult) { + set_error(OBP_PLUGIN_ERROR, "Static Java segmentation method returned null"); + env->DeleteLocalRef(jtext); + return OBP_PLUGIN_ERROR; + } + + // Convert Java string array to C++ vector (JNIUtils handles its own Frame management) + int ret = oceanbase::jni::JNIUtils::jstring_array_to_cpp_vector(env, jresult, tokens); + if (ret != 0) { + set_error(OBP_PLUGIN_ERROR, "Failed to convert Java result to C++ vector"); + env->DeleteLocalRef(jtext); + env->DeleteLocalRef(jresult); + return OBP_PLUGIN_ERROR; + } + + // Clean up our local references + env->DeleteLocalRef(jtext); + 
env->DeleteLocalRef(jresult); + + OBP_LOG_INFO("Segmentation completed, got %zu tokens", tokens.size()); + return OBP_SUCCESS; +} + +void JapaneseJNIBridge::set_error(int code, const std::string& message) { + last_error_.error_code = code; + last_error_.error_message = message; + // 移除直接输出,让上层统一处理错误输出 +} + +void JapaneseJNIBridge::clear_error() { + last_error_.error_code = OBP_SUCCESS; + last_error_.error_message.clear(); + last_error_.java_exception.clear(); +} + +// JapaneseJNIBridgeManager implementation +JapaneseJNIBridgeManager& JapaneseJNIBridgeManager::get_instance() { + static JapaneseJNIBridgeManager instance; + return instance; +} + +std::shared_ptr JapaneseJNIBridgeManager::get_bridge() { + std::lock_guard lock(mutex_); + if (!bridge_) { + bridge_ = std::make_shared(); + } + return bridge_; +} + +int JapaneseJNIBridgeManager::initialize() { + auto bridge = get_bridge(); + return bridge ? bridge->initialize() : OBP_PLUGIN_ERROR; +} + +// Plugin parser structure +struct JapaneseParserState { + std::vector tokens; + size_t current_token_index; + + JapaneseParserState() : current_token_index(0) {} +}; + +} // namespace japanese_ftparser +} // namespace oceanbase + +// Plugin interface implementation +extern "C" { + +int japanese_ftparser_init(ObPluginParamPtr param) { + if (!param) { + return OBP_INVALID_ARGUMENT; + } + + // Don't initialize JVM here - do it lazily on first use (scan_begin) + // This avoids issues with classpath when Observer is starting up + OBP_LOG_INFO("Japanese FTParser plugin registered (JVM will be initialized on first use)"); + return OBP_SUCCESS; +} + +int japanese_ftparser_deinit(ObPluginParamPtr param) { + if (!param) { + return OBP_INVALID_ARGUMENT; + } + + OBP_LOG_INFO("Japanese FTParser plugin deinitialized"); + return OBP_SUCCESS; +} + +int japanese_ftparser_scan_begin(ObPluginFTParserParamPtr param) { + if (!param) { + return OBP_INVALID_ARGUMENT; + } + + // Lazy initialization: initialize JVM on first actual use + auto& 
manager = oceanbase::japanese_ftparser::JapaneseJNIBridgeManager::get_instance(); + int ret = manager.initialize(); + if (ret != OBP_SUCCESS) { + OBP_LOG_WARN("Failed to initialize JNI bridge on first use (error_code: %d)", ret); + return ret; + } + + // Create parser instance + oceanbase::japanese_ftparser::JapaneseParserState* jp = new (std::nothrow) oceanbase::japanese_ftparser::JapaneseParserState(); + if (!jp) { + return OBP_ALLOCATE_MEMORY_FAILED; + } + + // Get text to parse + const char* doc = obp_ftparser_fulltext(param); + int64_t length = obp_ftparser_fulltext_length(param); + + if (!doc || length <= 0) { + delete jp; + return OBP_INVALID_ARGUMENT; + } + + std::string text(doc, length); + + // Segment text using simplified JNI bridge + auto bridge = manager.get_bridge(); + if (!bridge) { + delete jp; + return OBP_PLUGIN_ERROR; + } + + ret = bridge->segment(text, jp->tokens); + if (ret != OBP_SUCCESS) { + const auto& error = bridge->get_last_error(); + OBP_LOG_WARN("Segmentation failed: %s (error_code: %d)", + error.error_message.c_str(), error.error_code); + delete jp; + return ret; + } + + jp->current_token_index = 0; + + // Store parser instance in user data + obp_ftparser_set_user_data(param, jp); + + OBP_LOG_INFO("Segmentation completed: %zu tokens extracted from %zu characters", + jp->tokens.size(), text.length()); + return OBP_SUCCESS; +} + +int japanese_ftparser_scan_end(ObPluginFTParserParamPtr param) { + if (!param) { + return OBP_INVALID_ARGUMENT; + } + + oceanbase::japanese_ftparser::JapaneseParserState* jp = (oceanbase::japanese_ftparser::JapaneseParserState*)obp_ftparser_user_data(param); + if (jp) { + delete jp; + obp_ftparser_set_user_data(param, nullptr); + } + + return OBP_SUCCESS; +} + +int japanese_ftparser_next_token(ObPluginFTParserParamPtr param, char **word, int64_t *word_len, int64_t *char_cnt, int64_t *word_freq) { + if (!param || !word || !word_len || !char_cnt || !word_freq) { + return OBP_INVALID_ARGUMENT; + } + + 
oceanbase::japanese_ftparser::JapaneseParserState* jp = (oceanbase::japanese_ftparser::JapaneseParserState*)obp_ftparser_user_data(param); + if (!jp) { + return OBP_PLUGIN_ERROR; + } + + if (jp->current_token_index >= jp->tokens.size()) { + return OBP_ITER_END; + } + + const std::string& token = jp->tokens[jp->current_token_index++]; + + // Set word properties + *word = const_cast(token.c_str()); + *word_len = token.length(); + + // Calculate character count (UTF-8 character count, not byte count) + // Use the same algorithm as the original plugin + int64_t char_count = 0; + const char* p = token.c_str(); + const char* end = p + token.length(); + + while (p < end) { + unsigned char c = *p; + + if ((c & 0x80) == 0) { + p += 1; // ASCII character + } else if ((c & 0xE0) == 0xC0) { + p += 2; // 2-byte UTF-8 character + } else if ((c & 0xF0) == 0xE0) { + p += 3; // 3-byte UTF-8 character (most CJK characters) + } else if ((c & 0xF8) == 0xF0) { + p += 4; // 4-byte UTF-8 character + } else { + p += 1; // Invalid UTF-8, skip + continue; // Don't count invalid characters + } + + char_count++; + } + + *char_cnt = char_count; // Set character count + *word_freq = 1; // Set word frequency to 1 + + return OBP_SUCCESS; +} + +int japanese_ftparser_get_add_word_flag(uint64_t *flag) { + if (!flag) { + return OBP_INVALID_ARGUMENT; + } + + // Set flags for Japanese language processing (same as original plugin) + // Disable problematic filters that may reject Japanese characters + *flag = OBP_FTPARSER_AWF_CASEDOWN // Convert to lowercase (safe for Japanese) + | OBP_FTPARSER_AWF_GROUPBY_WORD; // Group identical words (safe for Japanese) + // Disabled: OBP_FTPARSER_AWF_MIN_MAX_WORD (may filter Japanese chars by length) + // Disabled: OBP_FTPARSER_AWF_STOPWORD (may use inappropriate stopword list) + + return OBP_SUCCESS; +} + +} // extern "C" diff --git a/japanese_ftparser/japanese_jni_bridge.h b/japanese_ftparser/japanese_jni_bridge.h new file mode 100644 index 0000000..df859f0 --- 
/dev/null +++ b/japanese_ftparser/japanese_jni_bridge.h @@ -0,0 +1,161 @@ +/** + * Copyright (c) 2023 OceanBase + * Experimental Japanese Fulltext Parser Plugin - Simplified Version + */ + +#pragma once + +#include "oceanbase/ob_plugin_ftparser.h" +#include "jni_manager.h" // Simplified include path +#include +#include +#include + +namespace oceanbase { +namespace japanese_ftparser { + +/** + * Japanese JNI Bridge Configuration + * @brief Plugin-specific configuration for Japanese text segmentation + * @details JVM-level configurations are now managed by JNIConfigUtils in common library + */ +struct JapaneseJNIBridgeConfig { + // Plugin-specific configurations only + std::string segmenter_class_name; + std::string segment_method_name; + + JapaneseJNIBridgeConfig(); +}; + +/** + * Japanese JNI Bridge for Japanese Segmentation + * @brief Japanese parser using the common JNI library + * @details This class uses oceanbase::jni::ScopedJNIEnvironment for + * automatic JVM and thread management, eliminating the need for + * complex manual resource management. 
+ */ +class JapaneseJNIBridge { +private: + JapaneseJNIBridgeConfig config_; + std::string plugin_name_; + bool is_initialized_; + std::mutex bridge_mutex_; + + // Java class and method references (cached for performance) + jclass segmenter_class_; + jmethodID segment_method_; // Static method + + // Error handling + struct ErrorInfo { + int error_code; + std::string error_message; + std::string java_exception; + }; + ErrorInfo last_error_; + +public: + /** + * Constructor + */ + JapaneseJNIBridge(); + + /** + * Destructor + */ + ~JapaneseJNIBridge(); + + /** + * Initialize the simplified JNI bridge + * @return OBP_SUCCESS on success, error code on failure + */ + int initialize(); + + /** + * Segment text into tokens + * @param text Input text to segment + * @param tokens Output vector to store tokens + * @return OBP_SUCCESS on success, error code on failure + */ + int segment(const std::string& text, std::vector& tokens); + + /** + * Get last error information + */ + const ErrorInfo& get_last_error() const { return last_error_; } + + /** + * Check if bridge is initialized + */ + bool is_initialized() const { return is_initialized_; } + +private: + /** + * Load Java classes and cache method IDs + */ + int load_java_classes(JNIEnv* env); + + /** + * Perform actual segmentation with given JNI environment + */ + int do_segment(JNIEnv* env, const std::string& text, std::vector& tokens); + + /** + * Set error information + */ + void set_error(int code, const std::string& message); + + /** + * Clear error information + */ + void clear_error(); + + // Disable copy and move + JapaneseJNIBridge(const JapaneseJNIBridge&) = delete; + JapaneseJNIBridge& operator=(const JapaneseJNIBridge&) = delete; +}; + +/** + * Japanese JNI Bridge Manager (Singleton) + * @brief Manages a global Japanese JNI bridge instance + */ +class JapaneseJNIBridgeManager { +public: + /** + * Get the singleton instance + */ + static JapaneseJNIBridgeManager& get_instance(); + + /** + * Get the JNI bridge 
+ */ + std::shared_ptr get_bridge(); + + /** + * Initialize the bridge (thread-safe) + */ + int initialize(); + +private: + std::shared_ptr bridge_; + std::mutex mutex_; + + JapaneseJNIBridgeManager() = default; + ~JapaneseJNIBridgeManager() = default; + + // Disable copy + JapaneseJNIBridgeManager(const JapaneseJNIBridgeManager&) = delete; + JapaneseJNIBridgeManager& operator=(const JapaneseJNIBridgeManager&) = delete; +}; + +} // namespace japanese_ftparser +} // namespace oceanbase + +// Plugin interface functions (declarations only) +extern "C" { + int japanese_ftparser_init(ObPluginParamPtr param); + int japanese_ftparser_deinit(ObPluginParamPtr param); + int japanese_ftparser_scan_begin(ObPluginFTParserParamPtr param); + int japanese_ftparser_scan_end(ObPluginFTParserParamPtr param); + int japanese_ftparser_next_token(ObPluginFTParserParamPtr param, char **word, int64_t *word_len, int64_t *char_cnt, int64_t *word_freq); + int japanese_ftparser_get_add_word_flag(uint64_t *flag); +} \ No newline at end of file diff --git a/japanese_ftparser/java/JapaneseSegmenter.class b/japanese_ftparser/java/JapaneseSegmenter.class new file mode 100644 index 0000000..fc60eac Binary files /dev/null and b/japanese_ftparser/java/JapaneseSegmenter.class differ diff --git a/japanese_ftparser/java/JapaneseSegmenter.java b/japanese_ftparser/java/JapaneseSegmenter.java new file mode 100644 index 0000000..b722f83 --- /dev/null +++ b/japanese_ftparser/java/JapaneseSegmenter.java @@ -0,0 +1,90 @@ +import java.io.StringReader; +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; + +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.custom.CustomAnalyzer; +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; + +/** + * Japanese Segmenter using static method to reduce memory usage + */ +public class JapaneseSegmenter { + // Simple, direct static initialization - created 
only once per JVM + private static final Analyzer STATIC_ANALYZER = createAnalyzer(); + + /** + * Create analyzer - invoked only once when the class is loaded + */ + private static Analyzer createAnalyzer() { + try { + System.out.println("JapaneseSegmenter: Creating static analyzer (once per JVM)"); + return CustomAnalyzer.builder() + .withTokenizer("japanese") // kuromoji_tokenizer + .addTokenFilter("japaneseBaseForm") // kuromoji_baseform + .addTokenFilter("japanesePartOfSpeechStop") // kuromoji_part_of_speech + .addTokenFilter("cjkWidth") // cjk_width + .addTokenFilter("lowercase") // lowercase + .addTokenFilter("stop", "words", "org/apache/lucene/analysis/ja/stopwords.txt") // ja_stop + .build(); + } catch (Exception e) { + System.err.println("Failed to initialize JapaneseSegmenter: " + e.getMessage()); + e.printStackTrace(); + throw new RuntimeException("Cannot initialize JapaneseAnalyzer", e); + } + } + + /** + * Segment Japanese text into tokens + * @param text The input text to segment + * @return Array of segmented tokens + */ + public static String[] segment(String text) { + if (text == null || text.trim().isEmpty()) { + return new String[0]; + } + + List tokens = new ArrayList<>(); + + try (TokenStream tokenStream = STATIC_ANALYZER.tokenStream("content", new StringReader(text))) { + CharTermAttribute termAttr = tokenStream.addAttribute(CharTermAttribute.class); + + tokenStream.reset(); + while (tokenStream.incrementToken()) { + String token = termAttr.toString(); + tokens.add(token); + } + tokenStream.end(); + + } catch (IOException e) { + System.err.println("Error during tokenization: " + e.getMessage()); + return new String[0]; + } + + return tokens.toArray(new String[0]); + } + + public static boolean isStaticInitialized() { + return STATIC_ANALYZER != null; + } + + public static void main(String[] args) { + // Test cases + String[] result1 = JapaneseSegmenter.segment("Hello world"); + System.out.println("English: " + 
java.util.Arrays.toString(result1)); + + String[] result2 = JapaneseSegmenter.segment("私は学生です"); + System.out.println("Japanese: " + java.util.Arrays.toString(result2)); + + String[] result3 = JapaneseSegmenter.segment("Hello こんにちは world"); + System.out.println("Mixed: " + java.util.Arrays.toString(result3)); + + String[] result4 = JapaneseSegmenter.segment("東京都渋谷区でコンピューターを勉強しています"); + System.out.println("Complex: " + java.util.Arrays.toString(result4)); + + String[] result5 = JapaneseSegmenter.segment("OceanBaseデータベースを選ぶ理由"); + System.out.println("OceanBase: " + java.util.Arrays.toString(result5)); + } +} \ No newline at end of file diff --git a/japanese_ftparser/java/lib/lucene-analyzers-common-8.11.2.jar b/japanese_ftparser/java/lib/lucene-analyzers-common-8.11.2.jar new file mode 100644 index 0000000..d0f85d2 Binary files /dev/null and b/japanese_ftparser/java/lib/lucene-analyzers-common-8.11.2.jar differ diff --git a/japanese_ftparser/java/lib/lucene-analyzers-kuromoji-8.11.2.jar b/japanese_ftparser/java/lib/lucene-analyzers-kuromoji-8.11.2.jar new file mode 100644 index 0000000..ec6574d Binary files /dev/null and b/japanese_ftparser/java/lib/lucene-analyzers-kuromoji-8.11.2.jar differ diff --git a/japanese_ftparser/java/lib/lucene-core-8.11.2.jar b/japanese_ftparser/java/lib/lucene-core-8.11.2.jar new file mode 100644 index 0000000..f91d691 Binary files /dev/null and b/japanese_ftparser/java/lib/lucene-core-8.11.2.jar differ diff --git a/korean_ftparser/CMakeLists.txt b/korean_ftparser/CMakeLists.txt new file mode 100644 index 0000000..a8e22c0 --- /dev/null +++ b/korean_ftparser/CMakeLists.txt @@ -0,0 +1,68 @@ +CMAKE_MINIMUM_REQUIRED(VERSION 3.22) + +# Set plugin name +SET(PLUGIN_NAME korean_ftparser) + +# Source files +SET(SOURCES + korean_ftparser_main.cpp + korean_jni_bridge.cpp +) + +# Project configuration +PROJECT(${PLUGIN_NAME} + DESCRIPTION "Korean ftparser plugin" + HOMEPAGE_URL "https://open.oceanbase.com/" + LANGUAGES CXX C ASM) + +# Find required 
packages +FIND_PACKAGE(ObPlugin REQUIRED) +FIND_PACKAGE(JNI REQUIRED COMPONENTS JVM) + +# Add common JNI library as a subdirectory +add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/../common/liboceanbase_jni_common build_common_jni_lib) + +# Use OB_ADD_PLUGIN macro (provided by ObPlugin) +OB_ADD_PLUGIN(${PLUGIN_NAME} + ${SOURCES} +) + +# Add include directories for common JNI library +TARGET_INCLUDE_DIRECTORIES(${PLUGIN_NAME} PRIVATE + ${CMAKE_CURRENT_SOURCE_DIR}/../common/liboceanbase_jni_common + ${JNI_INCLUDE_DIRS} +) + +# Link libraries +TARGET_LINK_LIBRARIES(${PLUGIN_NAME} PRIVATE + ${JNI_LIBRARIES} +) + +# Add link directory for common JNI library +TARGET_LINK_DIRECTORIES(${PLUGIN_NAME} PRIVATE + ${CMAKE_CURRENT_SOURCE_DIR}/../common/liboceanbase_jni_common/build +) + +# Link the common JNI library +TARGET_LINK_LIBRARIES(${PLUGIN_NAME} PRIVATE oceanbase_jni_common) + +# Set C++ standard and RPATH +SET_TARGET_PROPERTIES(${PLUGIN_NAME} PROPERTIES + CXX_STANDARD 11 + CXX_STANDARD_REQUIRED ON + CXX_VISIBILITY_PRESET default + # RPATH settings: use $ORIGIN to find libraries in the same directory as the plugin + BUILD_RPATH "\$ORIGIN:${CMAKE_CURRENT_SOURCE_DIR}/../common/liboceanbase_jni_common/build" + INSTALL_RPATH "\$ORIGIN" + BUILD_WITH_INSTALL_RPATH FALSE +) + +# Add custom command to ensure common library is built first +ADD_CUSTOM_TARGET(build_common_jni_lib_korean + COMMAND ${CMAKE_COMMAND} --build ${CMAKE_CURRENT_SOURCE_DIR}/../common/liboceanbase_jni_common/build --parallel + WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/../common/liboceanbase_jni_common + COMMENT "Building common JNI library for Korean parser" +) + +# Make plugin depend on common library +ADD_DEPENDENCIES(${PLUGIN_NAME} build_common_jni_lib_korean) diff --git a/korean_ftparser/README.en-US.md b/korean_ftparser/README.en-US.md new file mode 100644 index 0000000..132bf02 --- /dev/null +++ b/korean_ftparser/README.en-US.md @@ -0,0 +1,219 @@ +# OceanBase Korean Fulltext Parser Plugin + +A Korean 
fulltext parser plugin for OceanBase. It uses JNI bridge to call Java tokenization libraries (integrated with Apache Lucene KoreanAnalyzer/Nori). + +## Features + +- ✅ Compatible with OceanBase FTParser interface (`korean_ftparser_main.cpp`) +- ✅ JNI integration for Java Korean tokenization (Lucene KoreanAnalyzer/Nori) +- ✅ UTF-8 multi-byte Korean text processing +- ✅ MIXED compound word mode: preserves both complete words and decomposed words +- ✅ Extensible: Java tokenization implementation can be replaced + +## Build + +### Environment Setup + +1. Install basic build tools +```bash +yum install -y git cmake make glibc-devel glibc-headers gcc gcc-c++ +``` +This command installs the gcc development environment. + +> Skip this step if your environment already has these tools + +2. Install OceanBase Plugin Development Kit +```bash +yum install -y oceanbase-plugin-dev-kit +``` + +### Compilation + +```bash +# Choose your workspace directory +cd `your/workspace` +# Download source code +git clone https://github.com/oceanbase/oceanbase-plugins +# Build +cd oceanbase-plugins/korean_ftparser +mkdir build +cd build +cmake .. +make +``` +You will see the libkorean_ftparser.so file in the build directory. This is the dynamic library plugin. + +## Quick Start + +### Deployment and Installation + +**Recommended method**: Copy .class files and jar files to corresponding locations separately + +```bash +# 1. Copy plugin dynamic library +cp /path/to/yourplugindirpath/libkorean_ftparser.so /path/to/observer/plugin_dir/ + +# 2. Create java directory structure (if not exists) +mkdir -p /path/to/observer/java/lib + +# 3. Copy Lucene dependency libraries +cp java/lib/lucene-core-8.11.2.jar /path/to/observer/java/lib/ +cp java/lib/lucene-analyzers-common-8.11.2.jar /path/to/observer/java/lib/ +cp java/lib/lucene-analyzers-nori-8.11.2.jar /path/to/observer/java/lib/ + +# 4. Copy Korean segmenter class file +cp java/KoreanSegmenter.class /path/to/observer/java/ + +# 5. 
Install Java environment +yum install java-1.8.0-openjdk-devel -y + +# 6. Start Observer and load plugin +# Connect to database +obclient -h127.0.0.1 -P2881 -uroot@sys -pdifyai123456 # Example with dify database connection info + +# Set plugin loading in sys tenant +ALTER SYSTEM SET plugins_load='libkorean_ftparser.so:on'; + +# Restart Observer to take effect +killall observer +cd /path/to/observer +./bin/observer # Start observer in the observer working directory + +# Verify installation (see below) +``` + +**Multi-plugin Coexistence Notes**: +- If other language parser plugins are already installed, only copy Korean-specific jar files and .class files +- `lucene-core-8.11.2.jar` and `lucene-analyzers-common-8.11.2.jar` are shared by all plugins +- `lucene-analyzers-nori-8.11.2.jar` is only needed by the Korean parser +- When files already exist, cp command will ask for overwrite confirmation, you can choose to skip + + +> 📖 **Detailed Plugin Usage**: Refer to [OceanBase Plugin Development Kit User Manual](https://oceanbase.github.io/oceanbase-plugin-dev-kit/user-guide/) + +### Dependency Search Priority + +The plugin automatically searches for Java dependencies in the following priority order: + +1. **Environment Variable** (Highest Priority) + ```bash + export OCEANBASE_PARSER_CLASSPATH="/custom/path/lucene-core-8.11.2.jar:/custom/path/lucene-analyzers-common-8.11.2.jar:/custom/path/lucene-analyzers-nori-8.11.2.jar:/custom/path" + ``` + +2. **Observer Working Directory** (Recommended) + ``` + ${OB_WORKDIR}/java/lib/lucene-core-8.11.2.jar + ${OB_WORKDIR}/java/lib/lucene-analyzers-common-8.11.2.jar + ${OB_WORKDIR}/java/lib/lucene-analyzers-nori-8.11.2.jar + ${OB_WORKDIR}/java/KoreanSegmenter.class + ``` + +3. 
**Plugin Relative Path** (Development Environment) + ``` + ./java/lib/lucene-*.jar + ``` + +**Recommendation**: Use method 2 (copy java directory), no need to configure OCEANBASE_PARSER_CLASSPATH for quick experience + +### MIXED Mode Features + +The Korean tokenizer uses **MIXED compound word mode** with the following advantages: + +- **Precise Search**: Preserves complete compound words, e.g., `데이터베이스` +- **Flexible Search**: Also provides decomposed words, e.g., `데이터`, `베이스` +- **Optimal Balance**: Supports both exact matching and partial matching + +### Installation Verification + +```sql +-- Check if plugin is loaded successfully +SELECT * FROM oceanbase.GV$OB_PLUGINS WHERE NAME = 'korean_ftparser'; + +-- Create test table (ensure shell character encoding is UTF-8) +CREATE TABLE t_korean ( + c1 INT, + c2 VARCHAR(200), + c3 TEXT, + FULLTEXT INDEX (c2, c3) WITH PARSER korean_ftparser +); + +-- Insert Korean test data +INSERT INTO t_korean (c1, c2, c3) VALUES +(1, '안녕하세요', '안녕하세요, 저희 웹사이트에 오신 것을 환영합니다'), +(2, '감사합니다', '방문해 주셔서 감사합니다'), +(3, '문의사항', '질문이 있으시면 언제든지 연락해 주세요'), +(4, '고맙습니다', '서비스를 이용해 주셔서 고맙습니다'), +(5, '환영합니다', 'OceanBase에 오신 것을 환영합니다'), +(6, '안녕하세요', '안녕하세요, 다시 만나뵙게 되어 기쁩니다'), +(7, '어떠세요', '요즘 어떻게 지내세요'), +(8, '문제없습니다', '아무 문제가 없습니다'), +(9, '입력양식', '정보를 완전히 입력해 주세요'), +(10, '감사했습니다', '감사했습니다, 앞으로도 만나뵐 수 있기를 바랍니다'), +(11, '데이터베이스', 'OceanBase 데이터베이스 관리 시스템'), +(12, '자연언어처리', '한국어 자연언어처리 기술'), +(13, '컴퓨터과학', '컴퓨터과학과 소프트웨어공학'), +(14, '기계학습', '기계학습과 인공지능의 발전'), +(15, '소프트웨어개발', '소프트웨어개발 방법론'); + +-- Test tokenization functionality +SELECT TOKENIZE('데이터베이스 관리 시스템','korean_ftparser', '[{"output": "all"}]'); + +-- Test 1: Compound word exact matching (expected to return c1 = 11) +SELECT * FROM t_korean +WHERE MATCH(c2, c3) AGAINST('데이터베이스' IN NATURAL LANGUAGE MODE); + +-- Test 2: Compound word partial matching (expected to return c1 = 11) +SELECT * FROM t_korean +WHERE MATCH(c2, c3) AGAINST('데이터' IN NATURAL LANGUAGE MODE); + +-- Test 3: Multi-word search 
(expected to return related results) +SELECT * FROM t_korean +WHERE MATCH(c2, c3) AGAINST('안녕하세요 환영합니다' IN NATURAL LANGUAGE MODE); + +-- Test 4: Technical term search (expected to return c1 = 12) +SELECT * FROM t_korean +WHERE MATCH(c2, c3) AGAINST('자연언어처리' IN NATURAL LANGUAGE MODE); + +-- Test 5: Compound technical vocabulary (expected to return c1 = 13) +SELECT * FROM t_korean +WHERE MATCH(c2, c3) AGAINST('컴퓨터과학' IN NATURAL LANGUAGE MODE); + +-- Test 6: Technology vocabulary search (expected to return c1 = 14) +SELECT * FROM t_korean +WHERE MATCH(c2, c3) AGAINST('기계학습' IN NATURAL LANGUAGE MODE); + +-- Test 7: Development-related vocabulary (expected to return c1 = 15) +SELECT * FROM t_korean +WHERE MATCH(c2, c3) AGAINST('소프트웨어개발' IN NATURAL LANGUAGE MODE); + +-- Test 8: Verify MIXED mode advantages +-- Searching "베이스" should match "데이터베이스" +SELECT * FROM t_korean +WHERE MATCH(c2, c3) AGAINST('베이스' IN NATURAL LANGUAGE MODE); +``` + +## Technical Features + +### MIXED Compound Word Mode + +The Korean tokenizer adopts Lucene Nori's MIXED mode: + +``` +Input: "데이터베이스" +Output: ["데이터베이스", "데이터", "베이스"] + +Advantages: +- Precise search: "데이터베이스" → matches complete word +- Flexible search: "데이터" → matches part of compound word +- Grammar integrity: preserves Korean grammatical structure +``` + +### Comparison with Other Modes + +| Mode | Output Example | Search Characteristics | +|------|---------------|----------------------| +| **MIXED** | `[데이터베이스, 데이터, 베이스]` | Precise + Flexible | +| NONE | `[데이터베이스]` | Precise only | +| DISCARD | `[데이터, 베이스]` | Decomposed only | + +**MIXED mode is most suitable for database fulltext search scenarios**. diff --git a/korean_ftparser/README.ko-KR.md b/korean_ftparser/README.ko-KR.md new file mode 100644 index 0000000..9dc00ab --- /dev/null +++ b/korean_ftparser/README.ko-KR.md @@ -0,0 +1,219 @@ +# OceanBase 한국어 전문 분석 플러그인 (Korean Fulltext Parser Plugin) + +OceanBase를 위한 한국어 전문 분석 플러그인입니다. 
JNI 브리지를 사용하여 Java 형태소 분석 라이브러리 (Apache Lucene KoreanAnalyzer/Nori 통합)를 호출합니다. + +## 기능 + +- ✅ OceanBase FTParser 인터페이스 호환 (`korean_ftparser_main.cpp`) +- ✅ JNI 통합을 통한 Java 한국어 형태소 분석 (Lucene KoreanAnalyzer/Nori) +- ✅ UTF-8 멀티바이트 한글 처리 +- ✅ MIXED 복합어 모드: 완전한 단어와 분해된 단어를 동시에 보존 +- ✅ 확장 가능: Java 형태소 분석 구현 교체 가능 + +## 컴파일 + +### 환경 준비 + +1. 기본 컴파일 도구 설치 +```bash +yum install -y git cmake make glibc-devel glibc-headers gcc gcc-c++ +``` +이 명령어는 gcc 개발 환경을 설치합니다. + +> 환경이 이미 준비되어 있다면 이 단계를 건너뛸 수 있습니다 + +2. OceanBase 플러그인 개발 키트 설치 +```bash +yum install -y oceanbase-plugin-dev-kit +``` + +### 컴파일 + +```bash +# 작업 디렉토리 선택 +cd `your/workspace` +# 소스 코드 다운로드 +git clone https://github.com/oceanbase/oceanbase-plugins +# 컴파일 +cd oceanbase-plugins/korean_ftparser +mkdir build +cd build +cmake .. +make +``` +build 디렉토리에 libkorean_ftparser.so 파일이 생성됩니다. 이것이 동적 라이브러리 플러그인입니다. + +## 빠른 시작 + +### 배포 및 설치 + +**권장 방법**: .class 파일과 jar 파일을 해당 위치에 개별적으로 복사 + +```bash +# 1. 플러그인 동적 라이브러리 복사 +cp /path/to/yourplugindirpath/libkorean_ftparser.so /path/to/observer/plugin_dir/ + +# 2. java 디렉토리 구조 생성 (존재하지 않는 경우) +mkdir -p /path/to/observer/java/lib + +# 3. Lucene 의존성 라이브러리 복사 +cp java/lib/lucene-core-8.11.2.jar /path/to/observer/java/lib/ +cp java/lib/lucene-analyzers-common-8.11.2.jar /path/to/observer/java/lib/ +cp java/lib/lucene-analyzers-nori-8.11.2.jar /path/to/observer/java/lib/ + +# 4. 한국어 형태소 분석기 클래스 파일 복사 +cp java/KoreanSegmenter.class /path/to/observer/java/ + +# 5. Java 환경 설치 +yum install java-1.8.0-openjdk-devel -y + +# 6. 
Observer 시작 및 플러그인 로드 +# 데이터베이스 연결 +obclient -h127.0.0.1 -P2881 -uroot@sys -pdifyai123456 # dify 데이터베이스 연결 정보 예시 + +# sys 테넌트에서 플러그인 로딩 설정 +ALTER SYSTEM SET plugins_load='libkorean_ftparser.so:on'; + +# Observer 재시작하여 적용 +killall observer +cd /path/to/observer +./bin/observer # observer 작업 디렉토리에서 observer 시작 + +# 설치 확인 (아래 참조) +``` + +**멀티 플러그인 공존 설명**: +- 다른 언어 형태소 분석 플러그인이 이미 설치되어 있다면, 한국어 전용 jar 파일과 .class 파일만 복사 +- `lucene-core-8.11.2.jar`와 `lucene-analyzers-common-8.11.2.jar`는 모든 플러그인에서 공유 +- `lucene-analyzers-nori-8.11.2.jar`는 한국어 형태소 분석기에서만 필요 +- 파일이 이미 존재하는 경우 cp 명령어가 덮어쓰기를 확인하므로, 건너뛰기를 선택할 수 있습니다 + + +> 📖 **자세한 플러그인 사용 설명**: [OceanBase Plugin Development Kit 사용자 매뉴얼](https://oceanbase.github.io/oceanbase-plugin-dev-kit/user-guide/) 참조 + +### 의존성 검색 우선순위 + +플러그인은 다음 우선순위로 Java 의존성을 자동 검색합니다: + +1. **환경 변수** (최고 우선순위) + ```bash + export OCEANBASE_PARSER_CLASSPATH="/custom/path/lucene-core-8.11.2.jar:/custom/path/lucene-analyzers-common-8.11.2.jar:/custom/path/lucene-analyzers-nori-8.11.2.jar:/custom/path" + ``` + +2. **Observer 작업 디렉토리** (권장) + ``` + ${OB_WORKDIR}/java/lib/lucene-core-8.11.2.jar + ${OB_WORKDIR}/java/lib/lucene-analyzers-common-8.11.2.jar + ${OB_WORKDIR}/java/lib/lucene-analyzers-nori-8.11.2.jar + ${OB_WORKDIR}/java/KoreanSegmenter.class + ``` + +3. 
**플러그인 상대 경로** (개발 환경) + ``` + ./java/lib/lucene-*.jar + ``` + +**권장사항**: 방식 2 (java 디렉토리 복사) 사용, OCEANBASE_PARSER_CLASSPATH 설정 불필요로 빠른 체험 + +### MIXED 모드 특성 + +한국어 형태소 분석기는 **MIXED 복합어 모드**를 사용하여 다음과 같은 장점을 제공합니다: + +- **정확한 검색**: 완전한 복합어 보존, 예: `데이터베이스` +- **유연한 검색**: 분해된 단어도 동시 제공, 예: `데이터`, `베이스` +- **최적의 균형**: 정확한 매칭과 부분 매칭을 모두 지원 + +### 설치 확인 + +```sql +-- 플러그인이 성공적으로 로드되었는지 확인 +SELECT * FROM oceanbase.GV$OB_PLUGINS WHERE NAME = 'korean_ftparser'; + +-- 테스트 테이블 생성 (셸의 문자셋 인코딩은 UTF-8 사용) +CREATE TABLE t_korean ( + c1 INT, + c2 VARCHAR(200), + c3 TEXT, + FULLTEXT INDEX (c2, c3) WITH PARSER korean_ftparser +); + +-- 한국어 테스트 데이터 삽입 +INSERT INTO t_korean (c1, c2, c3) VALUES +(1, '안녕하세요', '안녕하세요, 저희 웹사이트에 오신 것을 환영합니다'), +(2, '감사합니다', '방문해 주셔서 감사합니다'), +(3, '문의사항', '질문이 있으시면 언제든지 연락해 주세요'), +(4, '고맙습니다', '서비스를 이용해 주셔서 고맙습니다'), +(5, '환영합니다', 'OceanBase에 오신 것을 환영합니다'), +(6, '안녕하세요', '안녕하세요, 다시 만나뵙게 되어 기쁩니다'), +(7, '어떠세요', '요즘 어떻게 지내세요'), +(8, '문제없습니다', '아무 문제가 없습니다'), +(9, '입력양식', '정보를 완전히 입력해 주세요'), +(10, '감사했습니다', '감사했습니다, 앞으로도 만나뵐 수 있기를 바랍니다'), +(11, '데이터베이스', 'OceanBase 데이터베이스 관리 시스템'), +(12, '자연언어처리', '한국어 자연언어처리 기술'), +(13, '컴퓨터과학', '컴퓨터과학과 소프트웨어공학'), +(14, '기계학습', '기계학습과 인공지능의 발전'), +(15, '소프트웨어개발', '소프트웨어개발 방법론'); + +-- 형태소 분석 기능 테스트 +SELECT TOKENIZE('데이터베이스 관리 시스템','korean_ftparser', '[{"output": "all"}]'); + +-- 테스트 1: 복합어 정확 매칭 (c1 = 11 반환 예상) +SELECT * FROM t_korean +WHERE MATCH(c2, c3) AGAINST('데이터베이스' IN NATURAL LANGUAGE MODE); + +-- 테스트 2: 복합어 부분 매칭 (c1 = 11 반환 예상) +SELECT * FROM t_korean +WHERE MATCH(c2, c3) AGAINST('데이터' IN NATURAL LANGUAGE MODE); + +-- 테스트 3: 다중 단어 검색 (관련 결과 반환 예상) +SELECT * FROM t_korean +WHERE MATCH(c2, c3) AGAINST('안녕하세요 환영합니다' IN NATURAL LANGUAGE MODE); + +-- 테스트 4: 전문 용어 검색 (c1 = 12 반환 예상) +SELECT * FROM t_korean +WHERE MATCH(c2, c3) AGAINST('자연언어처리' IN NATURAL LANGUAGE MODE); + +-- 테스트 5: 복합 전문 어휘 (c1 = 13 반환 예상) +SELECT * FROM t_korean +WHERE MATCH(c2, c3) AGAINST('컴퓨터과학' IN NATURAL LANGUAGE MODE); + +-- 테스트 6: 기술 어휘 검색 (c1 = 
14 반환 예상) +SELECT * FROM t_korean +WHERE MATCH(c2, c3) AGAINST('기계학습' IN NATURAL LANGUAGE MODE); + +-- 테스트 7: 개발 관련 어휘 (c1 = 15 반환 예상) +SELECT * FROM t_korean +WHERE MATCH(c2, c3) AGAINST('소프트웨어개발' IN NATURAL LANGUAGE MODE); + +-- 테스트 8: MIXED 모드 장점 검증 +-- "베이스" 검색이 "데이터베이스"와 매칭되어야 함 +SELECT * FROM t_korean +WHERE MATCH(c2, c3) AGAINST('베이스' IN NATURAL LANGUAGE MODE); +``` + +## 기술 특성 + +### MIXED 복합어 모드 + +한국어 형태소 분석기는 Lucene Nori의 MIXED 모드를 채택합니다: + +``` +입력: "데이터베이스" +출력: ["데이터베이스", "데이터", "베이스"] + +장점: +- 정확한 검색: "데이터베이스" → 완전한 단어 매칭 +- 유연한 검색: "데이터" → 복합어의 일부 매칭 +- 문법 완전성: 한국어 문법 구조 보존 +``` + +### 다른 모드와의 비교 + +| 모드 | 출력 예시 | 검색 특성 | +|------|-----------|----------| +| **MIXED** | `[데이터베이스, 데이터, 베이스]` | 정확+유연 | +| NONE | `[데이터베이스]` | 정확만 | +| DISCARD | `[데이터, 베이스]` | 분해만 | + +**MIXED 모드는 데이터베이스 전문 검색 시나리오에 가장 적합합니다**. diff --git a/korean_ftparser/README.zh-CN.md b/korean_ftparser/README.zh-CN.md new file mode 100644 index 0000000..b4110e4 --- /dev/null +++ b/korean_ftparser/README.zh-CN.md @@ -0,0 +1,219 @@ +# OceanBase 韩语全文解析插件(Korean Fulltext Parser Plugin) + +面向 OceanBase 的韩语全文解析插件。核心使用 JNI 桥接 Java 分词库(已集成 Apache Lucene KoreanAnalyzer/Nori)。 + +## 功能特性 + +- ✅ 兼容 OceanBase FTParser 接口(`korean_ftparser_main.cpp`) +- ✅ JNI 集成调用 Java 韩语分词(Lucene KoreanAnalyzer/Nori) +- ✅ UTF-8 多字节韩文处理 +- ✅ MIXED 复合词模式:同时保留完整词和分解词 +- ✅ 可扩展:可替换 Java 分词实现 + +## 编译 + +### 环境准备 + +1. 安装编译基础 +```bash +yum install -y git cmake make glibc-devel glibc-headers gcc gcc-c++ +``` +该命令将会安装gcc开发环境。 + +> 如果你的环境已经具备可以跳过当前步骤 + +2. 安装OceanBase 插件开发套件 +```bash +yum install -y oceanbase-plugin-dev-kit +``` + +### 编译 + +```bash +# 选择一个你自己的工作目录 +cd `your/workspace` +# 下载源码 +git clone https://github.com/oceanbase/oceanbase-plugins +# 编译 +cd oceanbase-plugins/korean_ftparser +mkdir build +cd build +cmake .. +make +``` +你将会在build目录下看到libkorean_ftparser.so文件,这个就是动态库插件。 + +## 快速开始 + +### 部署安装 + +**推荐方法**:分别复制.class文件和jar文件到对应位置 + +```bash +# 1. 
复制插件动态库 +cp /path/to/yourplugindirpath/libkorean_ftparser.so /path/to/observer/plugin_dir/ + +# 2. 创建java目录结构(如果不存在) +mkdir -p /path/to/observer/java/lib + +# 3. 复制Lucene依赖库 +cp java/lib/lucene-core-8.11.2.jar /path/to/observer/java/lib/ +cp java/lib/lucene-analyzers-common-8.11.2.jar /path/to/observer/java/lib/ +cp java/lib/lucene-analyzers-nori-8.11.2.jar /path/to/observer/java/lib/ + +# 4. 复制韩语分词器类文件 +cp java/KoreanSegmenter.class /path/to/observer/java/ + +# 5. 安装Java环境 +yum install java-1.8.0-openjdk-devel -y + +# 6. 启动 Observer 并加载插件 +# 连接数据库 +obclient -h127.0.0.1 -P2881 -uroot@sys -pdifyai123456 # 这里以dify的数据库连接信息为例 + +# 在sys租户中设置插件加载 +ALTER SYSTEM SET plugins_load='libkorean_ftparser.so:on'; + +# 重启Observer生效 +killall observer +cd /path/to/observer +./bin/observer # 在observer工作目录执行启动observer + +# 验证安装(见下文) +``` + +**多插件共存说明**: +- 如果已安装其他语言分词插件,只需复制韩语专用的jar包和.class文件 +- `lucene-core-8.11.2.jar` 和 `lucene-analyzers-common-8.11.2.jar` 被所有插件共享 +- `lucene-analyzers-nori-8.11.2.jar` 仅韩语分词器需要 +- 文件已存在时,cp命令会询问是否覆盖,可选择跳过 + + +> 📖 **详细插件使用说明**:参考 [OceanBase Plugin Development Kit 用户手册](https://oceanbase.github.io/oceanbase-plugin-dev-kit/user-guide/) + +### 依赖寻找优先级 + +插件按以下优先级自动寻找Java依赖: + +1. **环境变量**(最高优先级) + ```bash + export OCEANBASE_PARSER_CLASSPATH="/custom/path/lucene-core-8.11.2.jar:/custom/path/lucene-analyzers-common-8.11.2.jar:/custom/path/lucene-analyzers-nori-8.11.2.jar:/custom/path" + ``` + +2. **Observer 工作目录**(推荐) + ``` + ${OB_WORKDIR}/java/lib/lucene-core-8.11.2.jar + ${OB_WORKDIR}/java/lib/lucene-analyzers-common-8.11.2.jar + ${OB_WORKDIR}/java/lib/lucene-analyzers-nori-8.11.2.jar + ${OB_WORKDIR}/java/KoreanSegmenter.class + ``` + +3. 
**插件相对路径**(开发环境) + ``` + ./java/lib/lucene-*.jar + ``` + +**建议**:使用方式2(复制java目录),无需配置OCEANBASE_PARSER_CLASSPATH,快速体验 + +### MIXED 模式特性 + +韩语分词器使用 **MIXED 复合词模式**,具有以下优势: + +- **精确搜索**:保留完整复合词,如 `데이터베이스` +- **灵活搜索**:同时提供分解词,如 `데이터`, `베이스` +- **最佳平衡**:既支持精确匹配,又支持部分匹配 + +### 验证安装 + +```sql +-- 检查插件是否加载成功 +SELECT * FROM oceanbase.GV$OB_PLUGINS WHERE NAME = 'korean_ftparser'; + +-- 创建测试表(注意shell的字符集编码用UTF-8) +CREATE TABLE t_korean ( + c1 INT, + c2 VARCHAR(200), + c3 TEXT, + FULLTEXT INDEX (c2, c3) WITH PARSER korean_ftparser +); + +-- 插入韩语测试数据 +INSERT INTO t_korean (c1, c2, c3) VALUES +(1, '안녕하세요', '안녕하세요, 저희 웹사이트에 오신 것을 환영합니다'), +(2, '감사합니다', '방문해 주셔서 감사합니다'), +(3, '문의사항', '질문이 있으시면 언제든지 연락해 주세요'), +(4, '고맙습니다', '서비스를 이용해 주셔서 고맙습니다'), +(5, '환영합니다', 'OceanBase에 오신 것을 환영합니다'), +(6, '안녕하세요', '안녕하세요, 다시 만나뵙게 되어 기쁩니다'), +(7, '어떠세요', '요즘 어떻게 지내세요'), +(8, '문제없습니다', '아무 문제가 없습니다'), +(9, '입력양식', '정보를 완전히 입력해 주세요'), +(10, '감사했습니다', '감사했습니다, 앞으로도 만나뵐 수 있기를 바랍니다'), +(11, '데이터베이스', 'OceanBase 데이터베이스 관리 시스템'), +(12, '자연언어처리', '한국어 자연언어처리 기술'), +(13, '컴퓨터과학', '컴퓨터과학과 소프트웨어공학'), +(14, '기계학습', '기계학습과 인공지능의 발전'), +(15, '소프트웨어개발', '소프트웨어개발 방법론'); + +-- 测试分词功能 +SELECT TOKENIZE('데이터베이스 관리 시스템','korean_ftparser', '[{"output": "all"}]'); + +-- 测试 1:复合词精确匹配(预计返回 c1 = 11) +SELECT * FROM t_korean +WHERE MATCH(c2, c3) AGAINST('데이터베이스' IN NATURAL LANGUAGE MODE); + +-- 测试 2:复合词部分匹配(预计返回 c1 = 11) +SELECT * FROM t_korean +WHERE MATCH(c2, c3) AGAINST('데이터' IN NATURAL LANGUAGE MODE); + +-- 测试 3:多词搜索(预计返回相关结果) +SELECT * FROM t_korean +WHERE MATCH(c2, c3) AGAINST('안녕하세요 환영합니다' IN NATURAL LANGUAGE MODE); + +-- 测试 4:专业术语搜索(预计返回 c1 = 12) +SELECT * FROM t_korean +WHERE MATCH(c2, c3) AGAINST('자연언어처리' IN NATURAL LANGUAGE MODE); + +-- 测试 5:复合专业词汇(预计返回 c1 = 13) +SELECT * FROM t_korean +WHERE MATCH(c2, c3) AGAINST('컴퓨터과학' IN NATURAL LANGUAGE MODE); + +-- 测试 6:技术词汇搜索(预计返回 c1 = 14) +SELECT * FROM t_korean +WHERE MATCH(c2, c3) AGAINST('기계학습' IN NATURAL LANGUAGE MODE); + +-- 测试 7:开发相关词汇(预计返回 c1 = 15) +SELECT * 
FROM t_korean +WHERE MATCH(c2, c3) AGAINST('소프트웨어개발' IN NATURAL LANGUAGE MODE); + +-- 测试 8:验证MIXED模式优势 +-- 搜索"베이스"应该能匹配到"데이터베이스" +SELECT * FROM t_korean +WHERE MATCH(c2, c3) AGAINST('베이스' IN NATURAL LANGUAGE MODE); +``` + +## 技术特性 + +### MIXED 复合词模式 + +韩语分词器采用 Lucene Nori 的 MIXED 模式: + +``` +输入: "데이터베이스" +输出: ["데이터베이스", "데이터", "베이스"] + +优势: +- 精确搜索: "데이터베이스" → 匹配完整词 +- 灵活搜索: "데이터" → 匹配复合词的一部分 +- 语法完整: 保留韩语语法结构 +``` + +### 与其他模式对比 + +| 模式 | 输出示例 | 搜索特点 | +|------|----------|----------| +| **MIXED** | `[데이터베이스, 데이터, 베이스]` | 精确+灵活 | +| NONE | `[데이터베이스]` | 仅精确 | +| DISCARD | `[데이터, 베이스]` | 仅分解 | + +**MIXED 模式最适合数据库全文检索场景**。 diff --git a/korean_ftparser/java/KoreanSegmenter.class b/korean_ftparser/java/KoreanSegmenter.class new file mode 100644 index 0000000..25f6d63 Binary files /dev/null and b/korean_ftparser/java/KoreanSegmenter.class differ diff --git a/korean_ftparser/java/KoreanSegmenter.java b/korean_ftparser/java/KoreanSegmenter.java new file mode 100644 index 0000000..c1ba185 --- /dev/null +++ b/korean_ftparser/java/KoreanSegmenter.java @@ -0,0 +1,85 @@ +import java.io.StringReader; +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; + +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.custom.CustomAnalyzer; +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; + +/** + * Korean Segmenter using static method to reduce memory usage + */ +public class KoreanSegmenter { + // Plain static initialization - the analyzer is created exactly once, when the class is loaded + private static final Analyzer STATIC_ANALYZER = createAnalyzer(); + + /** + * Create the analyzer - invoked only once, from the static initializer above + */ + private static Analyzer createAnalyzer() { + try { + System.out.println("KoreanSegmenter: Creating static analyzer (once per JVM)"); + return CustomAnalyzer.builder() + .withTokenizer("korean", "decompoundMode", "mixed") // MIXED mode! 
+ .addTokenFilter("lowercase") // lowercase (basic normalization) + .build(); + } catch (Exception e) { + System.err.println("Failed to initialize KoreanSegmenter: " + e.getMessage()); + e.printStackTrace(); + throw new RuntimeException("Cannot initialize KoreanAnalyzer", e); + } + } + + /** + * Segment Korean text into tokens + * @param text The Korean text to segment + * @return Array of token strings + */ + public static String[] segment(String text) { + if (text == null || text.trim().isEmpty()) { + return new String[0]; + } + + List<String> tokens = new ArrayList<>(); + + try (TokenStream tokenStream = STATIC_ANALYZER.tokenStream("content", new StringReader(text))) { + CharTermAttribute attr = tokenStream.addAttribute(CharTermAttribute.class); + + tokenStream.reset(); + while (tokenStream.incrementToken()) { + String token = attr.toString(); + tokens.add(token); + } + tokenStream.end(); + + } catch (IOException e) { + System.err.println("Error during Korean tokenization: " + e.getMessage()); + return new String[0]; + } + + return tokens.toArray(new String[0]); + } + + public static boolean isStaticInitialized() { + return STATIC_ANALYZER != null; + } + + public static void main(String[] args) { + // Test cases + String[] testTexts = { + "안녕하세요", + "데이터베이스", + "학교에 갑니다", + "한국어 처리", + "OceanBase 데이터베이스 관리 시스템" + }; + + for (String text : testTexts) { + System.out.println("Testing: \"" + text + "\""); + String[] tokens = KoreanSegmenter.segment(text); + System.out.println("Tokens: " + java.util.Arrays.toString(tokens)); + } + } +} \ No newline at end of file diff --git a/korean_ftparser/java/lib/java/lib/java/lib/lucene-analyzers-nori-8.11.2.jar b/korean_ftparser/java/lib/java/lib/java/lib/lucene-analyzers-nori-8.11.2.jar new file mode 100644 index 0000000..21de2d4 Binary files /dev/null and b/korean_ftparser/java/lib/java/lib/java/lib/lucene-analyzers-nori-8.11.2.jar differ diff --git a/korean_ftparser/java/lib/java/lib/lucene-analyzers-nori-8.11.2.jar 
b/korean_ftparser/java/lib/java/lib/lucene-analyzers-nori-8.11.2.jar new file mode 100644 index 0000000..21de2d4 Binary files /dev/null and b/korean_ftparser/java/lib/java/lib/lucene-analyzers-nori-8.11.2.jar differ diff --git a/korean_ftparser/java/lib/lucene-analyzers-common-8.11.2.jar b/korean_ftparser/java/lib/lucene-analyzers-common-8.11.2.jar new file mode 100644 index 0000000..d0f85d2 Binary files /dev/null and b/korean_ftparser/java/lib/lucene-analyzers-common-8.11.2.jar differ diff --git a/korean_ftparser/java/lib/lucene-analyzers-nori-8.11.2.jar b/korean_ftparser/java/lib/lucene-analyzers-nori-8.11.2.jar new file mode 100644 index 0000000..9015e02 Binary files /dev/null and b/korean_ftparser/java/lib/lucene-analyzers-nori-8.11.2.jar differ diff --git a/korean_ftparser/java/lib/lucene-core-8.11.2.jar b/korean_ftparser/java/lib/lucene-core-8.11.2.jar new file mode 100644 index 0000000..f91d691 Binary files /dev/null and b/korean_ftparser/java/lib/lucene-core-8.11.2.jar differ diff --git a/korean_ftparser/korean_ftparser_main.cpp b/korean_ftparser/korean_ftparser_main.cpp new file mode 100644 index 0000000..401f935 --- /dev/null +++ b/korean_ftparser/korean_ftparser_main.cpp @@ -0,0 +1,53 @@ +/** + * Copyright (c) 2023 OceanBase + * Korean Fulltext Parser Plugin - Main Entry Point + */ + +#include "korean_jni_bridge.h" + +extern "C" { + +/** + * Plugin initialization function + * @param plugin Plugin parameter pointer + * @return OBP_SUCCESS on success, error code on failure + */ +int plugin_init_korean(ObPluginParamPtr plugin) +{ + int ret = OBP_SUCCESS; + + if (0 == plugin) { + ret = OBP_INVALID_ARGUMENT; + return ret; + } + + // Define the ftparser plugin descriptor + ObPluginFTParser parser = { + .init = korean_ftparser_init, + .deinit = korean_ftparser_deinit, + .scan_begin = korean_ftparser_scan_begin, + .scan_end = korean_ftparser_scan_end, + .next_token = korean_ftparser_next_token, + .get_add_word_flag = korean_ftparser_get_add_word_flag + }; + + // 
Register the ftparser plugin with OceanBase + ret = OBP_REGISTER_FTPARSER(plugin, + "korean_ftparser", + parser, + "Korean language fulltext parser"); + + return ret; +} + +// Plugin declaration +OBP_DECLARE_PLUGIN(korean_ftparser) +{ + OBP_AUTHOR_OCEANBASE, // Plugin author + OBP_MAKE_VERSION(0, 0, 1), // Plugin version + OBP_LICENSE_MULAN_PSL_V2, // Plugin license + plugin_init_korean, // Plugin initialization function + nullptr, // Plugin deinitialization function (not needed) +} OBP_DECLARE_PLUGIN_END; + +} // extern "C" diff --git a/korean_ftparser/korean_jni_bridge.cpp b/korean_ftparser/korean_jni_bridge.cpp new file mode 100644 index 0000000..6360cc1 --- /dev/null +++ b/korean_ftparser/korean_jni_bridge.cpp @@ -0,0 +1,365 @@ +/** + * Copyright (c) 2023 OceanBase + * Korean Fulltext Parser Plugin + */ + +#include "korean_jni_bridge.h" +#include <memory> +#include <mutex> +#include <new> +#include <string> +#include <vector> + +using namespace oceanbase::jni; + +namespace oceanbase { +namespace korean_ftparser { + +// Configuration implementation +KoreanJNIBridgeConfig::KoreanJNIBridgeConfig() + : segmenter_class_name("KoreanSegmenter") + , segment_method_name("segment") { + // JVM configurations are now managed by JNIConfigUtils in common library +} + +// KoreanJNIBridge implementation +KoreanJNIBridge::KoreanJNIBridge() + : plugin_name_("korean_ftparser") + , is_initialized_(false) + , segmenter_class_(nullptr) + , segment_method_(nullptr) { + clear_error(); +} + +KoreanJNIBridge::~KoreanJNIBridge() { + if (is_initialized_) { + // Unregister from global JVM manager + oceanbase::jni::GlobalJVMManager::unregister_plugin(plugin_name_); + is_initialized_ = false; + } +} + +int KoreanJNIBridge::initialize() { + // Serialize initialization; the template argument is spelled out because the project builds as C++11 + std::lock_guard<std::mutex> lock(bridge_mutex_); + + if (is_initialized_) { + return OBP_SUCCESS; + } + + clear_error(); + + // Register with global JVM 
manager + oceanbase::jni::GlobalJVMManager::register_plugin(plugin_name_); + + // Create scoped JNI environment using unified configuration from common 
library + oceanbase::jni::ScopedJNIEnvironment jni_env(plugin_name_); + + if (!jni_env) { + set_error(OBP_PLUGIN_ERROR, "Failed to acquire JNI environment for Korean parser initialization"); + oceanbase::jni::GlobalJVMManager::unregister_plugin(plugin_name_); + return OBP_PLUGIN_ERROR; + } + + // Load Java classes and cache method IDs + int ret = load_java_classes(jni_env.get()); + if (ret != OBP_SUCCESS) { + oceanbase::jni::GlobalJVMManager::unregister_plugin(plugin_name_); + return ret; + } + + is_initialized_ = true; + OBP_LOG_INFO("Korean JNI Bridge initialized successfully"); + return OBP_SUCCESS; +} + +int KoreanJNIBridge::segment(const std::string& text, std::vector& tokens) { + if (!is_initialized_) { + set_error(OBP_PLUGIN_ERROR, "Korean JNI Bridge not initialized"); + return OBP_PLUGIN_ERROR; + } + + // Create scoped JNI environment for segmentation + oceanbase::jni::ScopedJNIEnvironment jni_env(plugin_name_); + + if (!jni_env) { + set_error(OBP_PLUGIN_ERROR, "Failed to acquire JNI environment for Korean segmentation"); + return OBP_PLUGIN_ERROR; + } + + return do_segment(jni_env.get(), text, tokens); +} + +int KoreanJNIBridge::load_java_classes(JNIEnv* env) { + std::string error_msg; + + // Load Korean segmenter class + segmenter_class_ = env->FindClass(config_.segmenter_class_name.c_str()); + if (!segmenter_class_ || oceanbase::jni::JNIUtils::check_and_handle_exception(env, error_msg)) { + set_error(OBP_PLUGIN_ERROR, "Failed to find Korean segmenter class '" + + config_.segmenter_class_name + "': " + error_msg); + return OBP_PLUGIN_ERROR; + } + + // Make it a global reference to prevent GC + segmenter_class_ = (jclass)env->NewGlobalRef(segmenter_class_); + if (!segmenter_class_) { + set_error(OBP_PLUGIN_ERROR, "Failed to create global reference for Korean segmenter class"); + return OBP_PLUGIN_ERROR; + } + + // OPTIMIZED: Get static segment method ID (no constructor needed) + segment_method_ = env->GetStaticMethodID(segmenter_class_, 
config_.segment_method_name.c_str(), + "(Ljava/lang/String;)[Ljava/lang/String;"); + // Run the exception check first: a failed lookup returns NULL and raises, so testing the pointer first would short-circuit past the exception helper + if (oceanbase::jni::JNIUtils::check_and_handle_exception(env, error_msg) || !segment_method_) { + set_error(OBP_PLUGIN_ERROR, "Failed to find Korean segment method '" + + config_.segment_method_name + "': " + error_msg); + // Release the cached class so a retried initialize() does not leak one global reference per attempt + env->DeleteGlobalRef(segmenter_class_); + segmenter_class_ = nullptr; + segment_method_ = nullptr; + return OBP_PLUGIN_ERROR; + } + + OBP_LOG_INFO("Korean Java classes loaded successfully"); + return OBP_SUCCESS; +} + +int KoreanJNIBridge::do_segment(JNIEnv* env, const std::string& text, std::vector<std::string>& tokens) { + std::string error_msg; + tokens.clear(); + + // Convert C++ string to Java string + jstring jtext = oceanbase::jni::JNIUtils::cpp_string_to_jstring(env, text); + if (!jtext) { + set_error(OBP_PLUGIN_ERROR, "Failed to convert text to Java string for Korean segmentation"); + return OBP_PLUGIN_ERROR; + } + + // OPTIMIZED: Call static segment method (no instance creation needed) + jobjectArray jresult = (jobjectArray)env->CallStaticObjectMethod( + segmenter_class_, segment_method_, jtext); + if (oceanbase::jni::JNIUtils::check_and_handle_exception(env, error_msg)) { + set_error(OBP_PLUGIN_ERROR, "Korean segmentation failed: " + error_msg); + // Note: jresult is nullptr when exception occurs, no need to clean it + env->DeleteLocalRef(jtext); + return OBP_PLUGIN_ERROR; + } + + if (!jresult) { + set_error(OBP_PLUGIN_ERROR, "Korean segmentation returned null result"); + env->DeleteLocalRef(jtext); + return OBP_PLUGIN_ERROR; + } + + // Convert result to C++ vector (JNIUtils handles its own Frame management) + int ret = oceanbase::jni::JNIUtils::jstring_array_to_cpp_vector(env, jresult, tokens); + if 
(ret != OBP_SUCCESS) { + set_error(OBP_PLUGIN_ERROR, "Failed to convert Korean segmentation result to C++ vector"); + env->DeleteLocalRef(jtext); + env->DeleteLocalRef(jresult); + return ret; + } + + // Clean up our local references + env->DeleteLocalRef(jtext); + env->DeleteLocalRef(jresult); + + // Debug output removed for production use + + return OBP_SUCCESS; +} + 
+void KoreanJNIBridge::set_error(int code, const std::string& message) { + last_error_code_ = code; + last_error_message_ = message; + // Error logging removed - let upper layer handle error output +} + +void KoreanJNIBridge::clear_error() { + last_error_code_ = OBP_SUCCESS; + last_error_message_.clear(); +} + +// KoreanJNIBridgeManager implementation +KoreanJNIBridgeManager& KoreanJNIBridgeManager::get_instance() { + static KoreanJNIBridgeManager instance; + return instance; +} + +std::shared_ptr KoreanJNIBridgeManager::get_bridge() { + std::lock_guard lock(mutex_); + if (!bridge_) { + bridge_ = std::make_shared(); + } + return bridge_; +} + +int KoreanJNIBridgeManager::initialize() { + auto bridge = get_bridge(); + return bridge ? bridge->initialize() : OBP_PLUGIN_ERROR; +} + +// Plugin parser structure +struct KoreanParserState { + std::vector tokens; + size_t current_token_index; + + KoreanParserState() : current_token_index(0) {} +}; + +} // namespace korean_ftparser +} // namespace oceanbase + +// Plugin interface implementation +extern "C" { + +int korean_ftparser_init(ObPluginParamPtr param) { + if (!param) { + return OBP_INVALID_ARGUMENT; + } + + // Don't initialize JVM here - do it lazily on first use (scan_begin) + // This avoids issues with classpath when Observer is starting up + OBP_LOG_INFO("Korean FTParser plugin registered (JVM will be initialized on first use)"); + return OBP_SUCCESS; +} + +int korean_ftparser_deinit(ObPluginParamPtr param) { + if (!param) { + return OBP_INVALID_ARGUMENT; + } + + OBP_LOG_INFO("Korean FTParser plugin deinitialized"); + return OBP_SUCCESS; +} + +int korean_ftparser_scan_begin(ObPluginFTParserParamPtr param) { + if (!param) { + return OBP_INVALID_ARGUMENT; + } + + // Lazy initialization: initialize JVM on first actual use + auto& manager = oceanbase::korean_ftparser::KoreanJNIBridgeManager::get_instance(); + int ret = manager.initialize(); + if (ret != OBP_SUCCESS) { + OBP_LOG_WARN("Failed to initialize Korean JNI 
bridge on first use (error_code: %d)", ret); + return ret; + } + + // Create parser instance + oceanbase::korean_ftparser::KoreanParserState* kp = new (std::nothrow) oceanbase::korean_ftparser::KoreanParserState(); + if (!kp) { + return OBP_ALLOCATE_MEMORY_FAILED; + } + + // Get text to parse + const char* fulltext = obp_ftparser_fulltext(param); + int64_t fulltext_len = obp_ftparser_fulltext_length(param); + + if (!fulltext || fulltext_len <= 0) { + delete kp; + return OBP_INVALID_ARGUMENT; + } + + std::string text(fulltext, fulltext_len); + + // Perform segmentation + auto bridge = manager.get_bridge(); + if (!bridge) { + delete kp; + return OBP_PLUGIN_ERROR; + } + + ret = bridge->segment(text, kp->tokens); + if (ret != OBP_SUCCESS) { + OBP_LOG_WARN("Korean segmentation failed: %s (error_code: %d)", + bridge->get_last_error_message().c_str(), bridge->get_last_error_code()); + delete kp; + return ret; + } + + // Store parser state + obp_ftparser_set_user_data(param, kp); + + // text.length() is the UTF-8 byte length of the input, not its character count + OBP_LOG_INFO("Korean segmentation completed: %zu tokens extracted from %zu bytes", + kp->tokens.size(), text.length()); + return OBP_SUCCESS; +} + +int korean_ftparser_scan_end(ObPluginFTParserParamPtr param) { + if (!param) { + return OBP_INVALID_ARGUMENT; + } + + oceanbase::korean_ftparser::KoreanParserState* kp = (oceanbase::korean_ftparser::KoreanParserState*)obp_ftparser_user_data(param); + if (kp) { + delete kp; + obp_ftparser_set_user_data(param, nullptr); + } + + return OBP_SUCCESS; +} + +int korean_ftparser_next_token(ObPluginFTParserParamPtr param, char **word, int64_t *word_len, int64_t *char_cnt, int64_t *word_freq) { + if (!param || !word || !word_len || !char_cnt || !word_freq) { + return OBP_INVALID_ARGUMENT; + } + + oceanbase::korean_ftparser::KoreanParserState* kp = (oceanbase::korean_ftparser::KoreanParserState*)obp_ftparser_user_data(param); + if (!kp) { + return OBP_PLUGIN_ERROR; + } + + if 
(kp->current_token_index >= kp->tokens.size()) { + return OBP_ITER_END; + } + + 
const std::string& token = kp->tokens[kp->current_token_index++]; + + // Set word properties + *word = const_cast(token.c_str()); + *word_len = token.length(); + + // Calculate character count (UTF-8 character count, not byte count) + // Use the same algorithm as the original plugin + int64_t char_count = 0; + const char* p = token.c_str(); + const char* end = p + token.length(); + + while (p < end) { + unsigned char c = *p; + + if ((c & 0x80) == 0) { + p += 1; // ASCII character + } else if ((c & 0xE0) == 0xC0) { + p += 2; // 2-byte UTF-8 character + } else if ((c & 0xF0) == 0xE0) { + p += 3; // 3-byte UTF-8 character (most CJK characters) + } else if ((c & 0xF8) == 0xF0) { + p += 4; // 4-byte UTF-8 character + } else { + p += 1; // Invalid UTF-8, skip + continue; // Don't count invalid characters + } + + char_count++; + } + + *char_cnt = char_count; // Set character count + *word_freq = 1; // Set word frequency to 1 + + return OBP_SUCCESS; +} + +int korean_ftparser_get_add_word_flag(uint64_t *flag) { + if (!flag) { + return OBP_INVALID_ARGUMENT; + } + + // Set flags for Korean language processing (same as original plugin) + // Disable problematic filters that may reject Korean characters + *flag = OBP_FTPARSER_AWF_CASEDOWN // Convert to lowercase (safe for Korean) + | OBP_FTPARSER_AWF_GROUPBY_WORD; // Group identical words (safe for Korean) + // Disabled: OBP_FTPARSER_AWF_MIN_MAX_WORD (may filter Korean chars by length) + // Disabled: OBP_FTPARSER_AWF_STOPWORD (may use inappropriate stopword list) + + return OBP_SUCCESS; +} + +} // extern "C" diff --git a/korean_ftparser/korean_jni_bridge.h b/korean_ftparser/korean_jni_bridge.h new file mode 100644 index 0000000..b08a722 --- /dev/null +++ b/korean_ftparser/korean_jni_bridge.h @@ -0,0 +1,142 @@ +/** + * Copyright (c) 2023 OceanBase + * Korean Fulltext Parser Plugin + */ + +#pragma once + +#include "oceanbase/ob_plugin_ftparser.h" +#include "jni_manager.h" // Unified JNI management library +#include +#include 
+#include + +namespace oceanbase { +namespace korean_ftparser { + +/** + * Korean JNI Bridge Configuration + * @brief Plugin-specific configuration for Korean text segmentation + * @details JVM-level configurations are now managed by JNIConfigUtils in common library + */ +struct KoreanJNIBridgeConfig { + // Plugin-specific configurations only + std::string segmenter_class_name; + std::string segment_method_name; + + KoreanJNIBridgeConfig(); +}; + +/** + * Korean JNI Bridge for Korean Segmentation + * @brief Korean parser using the common JNI library + * @details This class uses oceanbase::jni::ScopedJNIEnvironment for + * automatic JVM and thread management, eliminating the need for + * complex manual resource management. + */ +class KoreanJNIBridge { +private: + KoreanJNIBridgeConfig config_; + std::string plugin_name_; + bool is_initialized_; + std::mutex bridge_mutex_; + + // Java class and method references (cached for performance) + jclass segmenter_class_; + jmethodID segment_method_; // Static method + + // Error handling + int last_error_code_; + std::string last_error_message_; + +public: + KoreanJNIBridge(); + + /** + * Destructor - automatically unregisters from global JVM manager + */ + ~KoreanJNIBridge(); + + /** + * Initialize the JNI bridge + * @return OBP_SUCCESS on success, error code on failure + */ + int initialize(); + + /** + * Segment Korean text using Lucene Korean analyzer + * @param text Input text to segment + * @param tokens Output vector to store segmented tokens + * @return OBP_SUCCESS on success, error code on failure + */ + int segment(const std::string& text, std::vector& tokens); + + // Error handling + int get_last_error_code() const { return last_error_code_; } + const std::string& get_last_error_message() const { return last_error_message_; } + +private: + /** + * Load Java classes and cache method IDs + */ + int load_java_classes(JNIEnv* env); + + /** + * Perform actual segmentation using JNI + */ + int do_segment(JNIEnv* env, 
const std::string& text, std::vector& tokens); + + // Error handling helpers + void set_error(int code, const std::string& message); + void clear_error(); + + // Disable copy and move + KoreanJNIBridge(const KoreanJNIBridge&) = delete; + KoreanJNIBridge& operator=(const KoreanJNIBridge&) = delete; +}; + +/** + * Korean JNI Bridge Manager (Singleton) + * @brief Manages a global Korean JNI bridge instance + */ +class KoreanJNIBridgeManager { +public: + /** + * Get the singleton instance + */ + static KoreanJNIBridgeManager& get_instance(); + + /** + * Get the shared bridge instance + */ + std::shared_ptr get_bridge(); + + /** + * Initialize the bridge (lazy initialization) + */ + int initialize(); + +private: + std::shared_ptr bridge_; + std::mutex mutex_; + + KoreanJNIBridgeManager() = default; + ~KoreanJNIBridgeManager() = default; + + // Disable copy + KoreanJNIBridgeManager(const KoreanJNIBridgeManager&) = delete; + KoreanJNIBridgeManager& operator=(const KoreanJNIBridgeManager&) = delete; +}; + +} // namespace korean_ftparser +} // namespace oceanbase + +// Plugin interface functions (declarations only) +extern "C" { + int korean_ftparser_init(ObPluginParamPtr param); + int korean_ftparser_deinit(ObPluginParamPtr param); + int korean_ftparser_scan_begin(ObPluginFTParserParamPtr param); + int korean_ftparser_scan_end(ObPluginFTParserParamPtr param); + int korean_ftparser_next_token(ObPluginFTParserParamPtr param, char **word, int64_t *word_len, int64_t *char_cnt, int64_t *word_freq); + int korean_ftparser_get_add_word_flag(uint64_t *flag); +} diff --git a/thai_ftparser/CMakeLists.txt b/thai_ftparser/CMakeLists.txt new file mode 100644 index 0000000..7e37c31 --- /dev/null +++ b/thai_ftparser/CMakeLists.txt @@ -0,0 +1,68 @@ +CMAKE_MINIMUM_REQUIRED(VERSION 3.22) + +# Set plugin name +SET(PLUGIN_NAME thai_ftparser) + +# Source files +SET(SOURCES + thai_ftparser_main.cpp + thai_jni_bridge.cpp +) + +# Project configuration +PROJECT(${PLUGIN_NAME} + DESCRIPTION "Thai 
ftparser plugin" + HOMEPAGE_URL "https://open.oceanbase.com/" + LANGUAGES CXX C ASM) + +# Find required packages +FIND_PACKAGE(ObPlugin REQUIRED) +FIND_PACKAGE(JNI REQUIRED COMPONENTS JVM) + +# Add common JNI library as a subdirectory +add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/../common/liboceanbase_jni_common build_common_jni_lib) + +# Use OB_ADD_PLUGIN macro (provided by ObPlugin) +OB_ADD_PLUGIN(${PLUGIN_NAME} + ${SOURCES} +) + +# Add include directories for common JNI library +TARGET_INCLUDE_DIRECTORIES(${PLUGIN_NAME} PRIVATE + ${CMAKE_CURRENT_SOURCE_DIR}/../common/liboceanbase_jni_common + ${JNI_INCLUDE_DIRS} +) + +# Link libraries +TARGET_LINK_LIBRARIES(${PLUGIN_NAME} PRIVATE + ${JNI_LIBRARIES} +) + +# Add link directory for common JNI library +TARGET_LINK_DIRECTORIES(${PLUGIN_NAME} PRIVATE + ${CMAKE_CURRENT_SOURCE_DIR}/../common/liboceanbase_jni_common/build +) + +# Link the common JNI library +TARGET_LINK_LIBRARIES(${PLUGIN_NAME} PRIVATE oceanbase_jni_common) + +# Set C++ standard and RPATH +SET_TARGET_PROPERTIES(${PLUGIN_NAME} PROPERTIES + CXX_STANDARD 11 + CXX_STANDARD_REQUIRED ON + CXX_VISIBILITY_PRESET default + # RPATH settings: use $ORIGIN to find libraries in the same directory as the plugin + BUILD_RPATH "\$ORIGIN:${CMAKE_CURRENT_SOURCE_DIR}/../common/liboceanbase_jni_common/build" + INSTALL_RPATH "\$ORIGIN" + BUILD_WITH_INSTALL_RPATH FALSE +) + +# The common JNI library is built in-tree by the add_subdirectory() call above, so depend on its +# target directly instead of running `cmake --build` on a separate build directory that this +# script never configures (that step fails on a fresh checkout) +ADD_DEPENDENCIES(${PLUGIN_NAME} oceanbase_jni_common) diff --git a/thai_ftparser/README.en-US.md b/thai_ftparser/README.en-US.md new file mode 100644 index 0000000..a5d255b --- /dev/null +++ 
b/thai_ftparser/README.en-US.md @@ -0,0 +1,224 @@ +# OceanBase Thai Fulltext Parser Plugin + +A Thai fulltext parser plugin for OceanBase. It uses JNI bridge to call Java tokenization libraries (integrated with Apache Lucene ThaiAnalyzer). + +## Features + +- ✅ Compatible with OceanBase FTParser interface (`thai_ftparser_main.cpp`) +- ✅ JNI integration for Java Thai tokenization (Lucene ThaiAnalyzer) +- ✅ UTF-8 multi-byte Thai text processing +- ✅ Intelligent stopword filtering +- ✅ Extensible: Java tokenization implementation can be replaced + +## Build + +### Environment Setup + +1. Install basic build tools +```bash +yum install -y git cmake make glibc-devel glibc-headers gcc gcc-c++ +``` +This command installs the gcc development environment. + +> Skip this step if your environment already has these tools + +2. Install OceanBase Plugin Development Kit +```bash +yum install -y oceanbase-plugin-dev-kit +``` + +### Compilation + +```bash +# Choose your workspace directory +cd `your/workspace` +# Download source code +git clone https://github.com/oceanbase/oceanbase-plugins +# Build +cd oceanbase-plugins/thai_ftparser +mkdir build +cd build +cmake .. +make +``` +You will see the libthai_ftparser.so file in the build directory. This is the dynamic library plugin. + +## Quick Start + +### Deployment and Installation + +**Recommended method**: Copy .class files and jar files to corresponding locations separately + +```bash +# 1. Copy plugin dynamic library +cp /path/to/yourplugindirpath/libthai_ftparser.so /path/to/observer/plugin_dir/ + +# 2. Create java directory structure (if not exists) +mkdir -p /path/to/observer/java/lib + +# 3. Copy Lucene dependency libraries +cp java/lib/lucene-core-8.11.2.jar /path/to/observer/java/lib/ +cp java/lib/lucene-analyzers-common-8.11.2.jar /path/to/observer/java/lib/ + +# 4. Copy Thai segmenter class file +cp java/ThaiSegmenter.class /path/to/observer/java/ + +# 5. 
Install Java environment +yum install java-1.8.0-openjdk-devel -y + +# 6. Start Observer and load plugin +# Connect to database +obclient -h127.0.0.1 -P2881 -uroot@sys -pdifyai123456 # Example with dify database connection info + +# Set plugin loading in sys tenant +ALTER SYSTEM SET plugins_load='libthai_ftparser.so:on'; + +# Restart Observer to take effect +killall observer +cd /path/to/observer +./bin/observer # Start observer in the observer working directory + +# Verify installation (see below) +``` + +**Multi-plugin Coexistence Notes**: +- If other language parser plugins are already installed, only copy the Thai segmenter .class file +- `lucene-core-8.11.2.jar` and `lucene-analyzers-common-8.11.2.jar` are shared by all plugins +- Thai segmenter uses ThaiAnalyzer, no additional language-specific jar packages needed +- When files already exist, cp command will ask for overwrite confirmation, you can choose to skip + + +> 📖 **Detailed Plugin Usage**: Refer to [OceanBase Plugin Development Kit User Manual](https://oceanbase.github.io/oceanbase-plugin-dev-kit/user-guide/) + +### Dependency Search Priority + +The plugin automatically searches for Java dependencies in the following priority order: + +1. **Environment Variable** (Highest Priority) + ```bash + export OCEANBASE_PARSER_CLASSPATH="/custom/path/lucene-core-8.11.2.jar:/custom/path/lucene-analyzers-common-8.11.2.jar:/custom/path" + ``` + +2. **Observer Working Directory** (Recommended) + ``` + ${OB_WORKDIR}/java/lib/lucene-core-8.11.2.jar + ${OB_WORKDIR}/java/lib/lucene-analyzers-common-8.11.2.jar + ${OB_WORKDIR}/java/ThaiSegmenter.class + ``` + +3. 
**Plugin Relative Path** (Development Environment) + ``` + ./java/lib/lucene-*.jar + ``` + +**Recommendation**: Use method 2 (copy java directory), no need to configure OCEANBASE_PARSER_CLASSPATH for quick experience + +### Installation Verification + +```sql +-- Check if plugin is loaded successfully +SELECT * FROM oceanbase.GV$OB_PLUGINS WHERE NAME = 'thai_ftparser'; + +-- Create test table (ensure shell character encoding is UTF-8) +CREATE TABLE t_thai ( + c1 INT, + c2 VARCHAR(200), + c3 TEXT, + FULLTEXT INDEX (c2, c3) WITH PARSER thai_ftparser +); + +-- Insert Thai test data +INSERT INTO t_thai (c1, c2, c3) VALUES +(1, 'สวัสดีครับ', 'สวัสดีครับ ยินดีต้อนรับสู่เว็บไซต์ของเรา'), +(2, 'ขอบคุณครับ', 'ขอบคุณที่เข้ามาเยี่ยมชม'), +(3, 'สอบถาม', 'หากมีคำถามใดๆ กรุณาติดต่อสอบถาม'), +(4, 'ขอบคุณมากครับ', 'ขอบคุณมากครับที่ใช้บริการของเรา'), +(5, 'ยินดีต้อนรับ', 'ยินดีต้อนรับสู่ OceanBase'), +(6, 'สวัสดีครับ', 'สวัสดีครับ ดีใจที่ได้พบกันอีกครั้ง'), +(7, 'เป็นอย่างไรบ้าง', 'เป็นอย่างไรบ้างช่วงนี้'), +(8, 'ไม่มีปัญหา', 'ไม่มีปัญหาใดๆ เลย'), +(9, 'แบบฟอร์มกรอกข้อมูล', 'กรุณากรอกข้อมูลให้ครบถ้วน'), +(10, 'ขอบคุณมาก', 'ขอบคุณมาก หวังว่าจะได้พบกันอีก'), +(11, 'ฐานข้อมูล', 'ระบบจัดการฐานข้อมูล OceanBase'), +(12, 'การประมวลผลภาษา', 'การประมวลผลภาษาธรรมชาติภาษาไทย'), +(13, 'วิทยาการคอมพิวเตอร์', 'วิทยาการคอมพิวเตอร์และวิศวกรรมซอฟต์แวร์'), +(14, 'การเรียนรู้ของเครื่อง', 'การเรียนรู้ของเครื่องและปัญญาประดิษฐ์'), +(15, 'การพัฒนาซอฟต์แวร์', 'วิธีการพัฒนาซอฟต์แวร์'); + +-- Test tokenization functionality +SELECT TOKENIZE('ระบบจัดการฐานข้อมูล','thai_ftparser', '[{"output": "all"}]'); + +-- Test 1: Basic vocabulary search (expected to return c1 = 1, 6) +SELECT * FROM t_thai +WHERE MATCH(c2, c3) AGAINST('สวัสดี' IN NATURAL LANGUAGE MODE); + +-- Test 2: Thank you expressions search (expected to return c1 = 2, 4, 10) +SELECT * FROM t_thai +WHERE MATCH(c2, c3) AGAINST('ขอบคุณ' IN NATURAL LANGUAGE MODE); + +-- Test 3: Technical vocabulary search (expected to return c1 = 11) +SELECT * FROM 
t_thai +WHERE MATCH(c2, c3) AGAINST('ฐานข้อมูล' IN NATURAL LANGUAGE MODE); + +-- Test 4: Compound technical terms (expected to return c1 = 12) +SELECT * FROM t_thai +WHERE MATCH(c2, c3) AGAINST('ประมวลผล' IN NATURAL LANGUAGE MODE); + +-- Test 5: Academic discipline name (expected to return c1 = 13) +SELECT * FROM t_thai +WHERE MATCH(c2, c3) AGAINST('วิทยาการคอมพิวเตอร์' IN NATURAL LANGUAGE MODE); + +-- Test 6: Artificial intelligence related (expected to return c1 = 14) +SELECT * FROM t_thai +WHERE MATCH(c2, c3) AGAINST('ปัญญาประดิษฐ์' IN NATURAL LANGUAGE MODE); + +-- Test 7: Software development (expected to return c1 = 15) +SELECT * FROM t_thai +WHERE MATCH(c2, c3) AGAINST('พัฒนาซอฟต์แวร์' IN NATURAL LANGUAGE MODE); + +-- Test 8: Multi-word combination search +SELECT * FROM t_thai +WHERE MATCH(c2, c3) AGAINST('สวัสดี ยินดีต้อนรับ' IN NATURAL LANGUAGE MODE); + +-- Test 9: Greeting expressions search +SELECT * FROM t_thai +WHERE MATCH(c2, c3) AGAINST('ยินดีต้อนรับ' IN NATURAL LANGUAGE MODE); + +-- Test 10: System-related vocabulary +SELECT * FROM t_thai +WHERE MATCH(c2, c3) AGAINST('ระบบ' IN NATURAL LANGUAGE MODE); +``` + +## Technical Features + +### Thai Tokenization Characteristics + +The Thai tokenizer is based on Apache Lucene ThaiAnalyzer: + +``` +Input: "การประมวลผลภาษาธรรมชาติ" +Output: ["ประมวล", "ภาษา", "ธรรมชาติ"] + +Features: +- Intelligent segmentation: Automatically identifies Thai word boundaries +- Stopword filtering: Filters common functional words +- UTF-8 support: Complete support for Thai characters +``` + +### Stopword Processing + +The Thai tokenizer includes intelligent stopword filtering: + +- **Filtering targets**: Common prepositions, conjunctions, particles +- **Preserved content**: Substantive vocabulary, technical terms +- **Optimized search**: Improves search relevance + +### Comparison with Other Languages + +| Language | Tokenization Features | Stopword Processing | +|----------|----------------------|-------------------| +| **Thai** 
| Basic segmentation + stopwords | ✅ Intelligent filtering | +| Japanese | BaseForm + stopwords | ✅ Stem unification | +| Korean | MIXED compound words | ❌ Complete preservation | + +**The Thai tokenizer achieves the optimal balance between simplicity and effectiveness**. diff --git a/thai_ftparser/README.th-TH.md b/thai_ftparser/README.th-TH.md new file mode 100644 index 0000000..1c3991d --- /dev/null +++ b/thai_ftparser/README.th-TH.md @@ -0,0 +1,224 @@ +# ปลั๊กอินวิเคราะห์ข้อความภาษาไทย OceanBase (Thai Fulltext Parser Plugin) + +ปลั๊กอินวิเคราะห์ข้อความภาษาไทยสำหรับ OceanBase ใช้ JNI bridge เพื่อเรียกใช้ไลบรารีการแยกคำภาษาไทย (รวม Apache Lucene ThaiAnalyzer) + +## ฟีเจอร์ + +- ✅ รองรับ OceanBase FTParser interface (`thai_ftparser_main.cpp`) +- ✅ รวม JNI สำหรับการแยกคำภาษาไทยด้วย Java (Lucene ThaiAnalyzer) +- ✅ ประมวลผลข้อความภาษาไทย UTF-8 หลายไบต์ +- ✅ กรองคำหยุดอัจฉริยะ +- ✅ ขยายได้: สามารถเปลี่ยนการใช้งานการแยกคำ Java + +## การคอมไพล์ + +### การเตรียมสภาพแวดล้อม + +1. ติดตั้งเครื่องมือคอมไพล์พื้นฐาน +```bash +yum install -y git cmake make glibc-devel glibc-headers gcc gcc-c++ +``` +คำสั่งนี้จะติดตั้งสภาพแวดล้อมการพัฒนา gcc + +> หากสภาพแวดล้อมของคุณพร้อมแล้ว สามารถข้ามขั้นตอนนี้ได้ + +2. ติดตั้ง OceanBase Plugin Development Kit +```bash +yum install -y oceanbase-plugin-dev-kit +``` + +### การคอมไพล์ + +```bash +# เลือกไดเรกทอรีทำงานของคุณ +cd `your/workspace` +# ดาวน์โหลดซอร์สโค้ด +git clone https://github.com/oceanbase/oceanbase-plugins +# คอมไพล์ +cd oceanbase-plugins/thai_ftparser +mkdir build +cd build +cmake .. +make +``` +คุณจะเห็นไฟล์ libthai_ftparser.so ในไดเรกทอรี build นี่คือปลั๊กอินไลบรารีแบบไดนามิก + +## เริ่มต้นใช้งานอย่างรวดเร็ว + +### การติดตั้งและการใช้งาน + +**วิธีที่แนะนำ**: คัดลอกไฟล์ .class และไฟล์ jar ไปยังตำแหน่งที่เกี่ยวข้องแยกกัน + +```bash +# 1. คัดลอกไลบรารีแบบไดนามิกของปลั๊กอิน +cp /path/to/yourplugindirpath/libthai_ftparser.so /path/to/observer/plugin_dir/ + +# 2. 
สร้างโครงสร้างไดเรกทอรี java (หากไม่มี) +mkdir -p /path/to/observer/java/lib + +# 3. คัดลอกไลบรารีการพึ่งพา Lucene +cp java/lib/lucene-core-8.11.2.jar /path/to/observer/java/lib/ +cp java/lib/lucene-analyzers-common-8.11.2.jar /path/to/observer/java/lib/ + +# 4. คัดลอกไฟล์คลาสตัวแยกคำภาษาไทย +cp java/ThaiSegmenter.class /path/to/observer/java/ + +# 5. ติดตั้งสภาพแวดล้อม Java +yum install java-1.8.0-openjdk-devel -y + +# 6. เริ่ม Observer และโหลดปลั๊กอิน +# เชื่อมต่อกับฐานข้อมูล +obclient -h127.0.0.1 -P2881 -uroot@sys -pdifyai123456 # ตัวอย่างข้อมูลการเชื่อมต่อฐานข้อมูล dify + +# ตั้งค่าการโหลดปลั๊กอินใน sys tenant +ALTER SYSTEM SET plugins_load='libthai_ftparser.so:on'; + +# รีสตาร์ท Observer เพื่อให้มีผล +killall observer +cd /path/to/observer +./bin/observer # เริ่ม observer ในไดเรกทอรีทำงาน observer + +# ตรวจสอบการติดตั้ง (ดูด้านล่าง) +``` + +**คำอธิบายการใช้งานร่วมกันหลายปลั๊กอิน**: +- หากติดตั้งปลั๊กอินตัวแยกคำภาษาอื่นแล้ว เพียงคัดลอกไฟล์ .class ของตัวแยกคำภาษาไทย +- `lucene-core-8.11.2.jar` และ `lucene-analyzers-common-8.11.2.jar` ใช้ร่วมกันโดยปลั๊กอินทั้งหมด +- ตัวแยกคำภาษาไทยใช้ ThaiAnalyzer ไม่ต้องการไฟล์ jar เฉพาะภาษาเพิ่มเติม +- เมื่อไฟล์มีอยู่แล้ว คำสั่ง cp จะถามเพื่อยืนยันการเขียนทับ คุณสามารถเลือกข้ามได้ + + +> 📖 **คำอธิบายการใช้ปลั๊กอินโดยละเอียด**: อ้างอิง [OceanBase Plugin Development Kit คู่มือผู้ใช้](https://oceanbase.github.io/oceanbase-plugin-dev-kit/user-guide/) + +### ลำดับความสำคัญการค้นหาการพึ่งพา + +ปลั๊กอินจะค้นหาการพึ่งพา Java โดยอัตโนมัติตามลำดับความสำคัญต่อไปนี้: + +1. **ตัวแปรสภาพแวดล้อม** (ความสำคัญสูงสุด) + ```bash + export OCEANBASE_PARSER_CLASSPATH="/custom/path/lucene-core-8.11.2.jar:/custom/path/lucene-analyzers-common-8.11.2.jar:/custom/path" + ``` + +2. **ไดเรกทอรีทำงาน Observer** (แนะนำ) + ``` + ${OB_WORKDIR}/java/lib/lucene-core-8.11.2.jar + ${OB_WORKDIR}/java/lib/lucene-analyzers-common-8.11.2.jar + ${OB_WORKDIR}/java/ThaiSegmenter.class + ``` + +3. 
**เส้นทางสัมพัทธ์ของปลั๊กอิน** (สภาพแวดล้อมการพัฒนา) + ``` + ./java/lib/lucene-*.jar + ``` + +**คำแนะนำ**: ใช้วิธีที่ 2 (คัดลอกไดเรกทอรี java) ไม่ต้องกำหนดค่า OCEANBASE_PARSER_CLASSPATH เพื่อประสบการณ์ที่รวดเร็ว + +### การตรวจสอบการติดตั้ง + +```sql +-- ตรวจสอบว่าปลั๊กอินโหลดสำเร็จหรือไม่ +SELECT * FROM oceanbase.GV$OB_PLUGINS WHERE NAME = 'thai_ftparser'; + +-- สร้างตารางทดสอบ (ตรวจสอบให้แน่ใจว่าการเข้ารหัสอักขระของ shell เป็น UTF-8) +CREATE TABLE t_thai ( + c1 INT, + c2 VARCHAR(200), + c3 TEXT, + FULLTEXT INDEX (c2, c3) WITH PARSER thai_ftparser +); + +-- แทรกข้อมูลทดสอบภาษาไทย +INSERT INTO t_thai (c1, c2, c3) VALUES +(1, 'สวัสดีครับ', 'สวัสดีครับ ยินดีต้อนรับสู่เว็บไซต์ของเรา'), +(2, 'ขอบคุณครับ', 'ขอบคุณที่เข้ามาเยี่ยมชม'), +(3, 'สอบถาม', 'หากมีคำถามใดๆ กรุณาติดต่อสอบถาม'), +(4, 'ขอบคุณมากครับ', 'ขอบคุณมากครับที่ใช้บริการของเรา'), +(5, 'ยินดีต้อนรับ', 'ยินดีต้อนรับสู่ OceanBase'), +(6, 'สวัสดีครับ', 'สวัสดีครับ ดีใจที่ได้พบกันอีกครั้ง'), +(7, 'เป็นอย่างไรบ้าง', 'เป็นอย่างไรบ้างช่วงนี้'), +(8, 'ไม่มีปัญหา', 'ไม่มีปัญหาใดๆ เลย'), +(9, 'แบบฟอร์มกรอกข้อมูล', 'กรุณากรอกข้อมูลให้ครบถ้วน'), +(10, 'ขอบคุณมาก', 'ขอบคุณมาก หวังว่าจะได้พบกันอีก'), +(11, 'ฐานข้อมูล', 'ระบบจัดการฐานข้อมูล OceanBase'), +(12, 'การประมวลผลภาษา', 'การประมวลผลภาษาธรรมชาติภาษาไทย'), +(13, 'วิทยาการคอมพิวเตอร์', 'วิทยาการคอมพิวเตอร์และวิศวกรรมซอฟต์แวร์'), +(14, 'การเรียนรู้ของเครื่อง', 'การเรียนรู้ของเครื่องและปัญญาประดิษฐ์'), +(15, 'การพัฒนาซอฟต์แวร์', 'วิธีการพัฒนาซอฟต์แวร์'); + +-- ทดสอบฟังก์ชันการแยกคำ +SELECT TOKENIZE('ระบบจัดการฐานข้อมูล','thai_ftparser', '[{"output": "all"}]'); + +-- ทดสอบ 1: การค้นหาคำศัพท์พื้นฐาน (คาดว่าจะส่งคืน c1 = 1, 6) +SELECT * FROM t_thai +WHERE MATCH(c2, c3) AGAINST('สวัสดี' IN NATURAL LANGUAGE MODE); + +-- ทดสอบ 2: การค้นหาคำขอบคุณ (คาดว่าจะส่งคืน c1 = 2, 4, 10) +SELECT * FROM t_thai +WHERE MATCH(c2, c3) AGAINST('ขอบคุณ' IN NATURAL LANGUAGE MODE); + +-- ทดสอบ 3: การค้นหาคำศัพท์เทคนิค (คาดว่าจะส่งคืน c1 = 11) +SELECT * FROM t_thai +WHERE MATCH(c2, c3) 
AGAINST('ฐานข้อมูล' IN NATURAL LANGUAGE MODE); + +-- ทดสอบ 4: คำศัพท์เทคนิคผสม (คาดว่าจะส่งคืน c1 = 12) +SELECT * FROM t_thai +WHERE MATCH(c2, c3) AGAINST('ประมวลผล' IN NATURAL LANGUAGE MODE); + +-- ทดสอบ 5: ชื่อสาขาวิชา (คาดว่าจะส่งคืน c1 = 13) +SELECT * FROM t_thai +WHERE MATCH(c2, c3) AGAINST('วิทยาการคอมพิวเตอร์' IN NATURAL LANGUAGE MODE); + +-- ทดสอบ 6: เกี่ยวกับปัญญาประดิษฐ์ (คาดว่าจะส่งคืน c1 = 14) +SELECT * FROM t_thai +WHERE MATCH(c2, c3) AGAINST('ปัญญาประดิษฐ์' IN NATURAL LANGUAGE MODE); + +-- ทดสอบ 7: การพัฒนาซอฟต์แวร์ (คาดว่าจะส่งคืน c1 = 15) +SELECT * FROM t_thai +WHERE MATCH(c2, c3) AGAINST('พัฒนาซอฟต์แวร์' IN NATURAL LANGUAGE MODE); + +-- ทดสอบ 8: การค้นหาแบบผสมหลายคำ +SELECT * FROM t_thai +WHERE MATCH(c2, c3) AGAINST('สวัสดี ยินดีต้อนรับ' IN NATURAL LANGUAGE MODE); + +-- ทดสอบ 9: การค้นหาคำทักทาย +SELECT * FROM t_thai +WHERE MATCH(c2, c3) AGAINST('ยินดีต้อนรับ' IN NATURAL LANGUAGE MODE); + +-- ทดสอบ 10: คำศัพท์เกี่ยวกับระบบ +SELECT * FROM t_thai +WHERE MATCH(c2, c3) AGAINST('ระบบ' IN NATURAL LANGUAGE MODE); +``` + +## คุณสมบัติทางเทคนิค + +### คุณสมบัติการแยกคำภาษาไทย + +ตัวแยกคำภาษาไทยอิงตาม Apache Lucene ThaiAnalyzer: + +``` +อินพุต: "การประมวลผลภาษาธรรมชาติ" +เอาต์พุต: ["ประมวล", "ภาษา", "ธรรมชาติ"] + +คุณสมบัติ: +- การแยกคำอัจฉริยะ: ระบุขอบเขตคำศัพท์ภาษาไทยโดยอัตโนมัติ +- กรองคำหยุด: กรองคำที่ใช้บ่อย +- รองรับ UTF-8: รองรับอักขระภาษาไทยอย่างสมบูรณ์ +``` + +### การประมวลผลคำหยุด + +ตัวแยกคำภาษาไทยรวมการกรองคำหยุดอัจฉริยะ: + +- **วัตถุประสงค์การกรอง**: คำบุพบท คำสันธาน คำช่วยที่พบบ่อย +- **เนื้อหาที่เก็บไว้**: คำศัพท์สำคัญ คำศัพท์เฉพาะทาง +- **การค้นหาที่เหมาะสม**: เพิ่มความเกี่ยวข้องของการค้นหา + +### เปรียบเทียบกับภาษาอื่น + +| ภาษา | คุณสมบัติการแยกคำ | การประมวลผลคำหยุด | +|------|-------------------|-------------------| +| **ไทย** | การแยกคำพื้นฐาน + คำหยุด | ✅ กรองอัจฉริยะ | +| ญี่ปุ่น | BaseForm + คำหยุด | ✅ การรวมคำต้น | +| เกาหลี | คำผสม MIXED | ❌ เก็บครบถ้วน | + 
+**ตัวแยกคำภาษาไทยบรรลุความสมดุลที่ดีที่สุดระหว่างความเรียบง่ายและประสิทธิภาพ** diff --git a/thai_ftparser/README.zh-CN.md b/thai_ftparser/README.zh-CN.md new file mode 100644 index 0000000..099169b --- /dev/null +++ b/thai_ftparser/README.zh-CN.md @@ -0,0 +1,223 @@ +# OceanBase 泰语全文解析插件(Thai Fulltext Parser Plugin) + +面向 OceanBase 的泰语全文解析插件。核心使用 JNI 桥接 Java 分词库(已集成 Apache Lucene ThaiAnalyzer)。 + +## 功能特性 + +- ✅ 兼容 OceanBase FTParser 接口(`thai_ftparser_main.cpp`) +- ✅ JNI 集成调用 Java 泰语分词(Lucene ThaiAnalyzer) +- ✅ UTF-8 多字节泰文处理 +- ✅ 可扩展:可替换 Java 分词实现 + +## 编译 + +### 环境准备 + +1. 安装编译基础 +```bash +yum install -y git cmake make glibc-devel glibc-headers gcc gcc-c++ +``` +该命令将会安装gcc开发环境。 + +> 如果你的环境已经具备可以跳过当前步骤 + +2. 安装OceanBase 插件开发套件 +```bash +yum install -y oceanbase-plugin-dev-kit +``` + +### 编译 + +```bash +# 选择一个你自己的工作目录 +cd `your/workspace` +# 下载源码 +git clone https://github.com/oceanbase/oceanbase-plugins +# 编译 +cd oceanbase-plugins/thai_ftparser +mkdir build +cd build +cmake .. +make +``` +你将会在build目录下看到libthai_ftparser.so文件,这个就是动态库插件。 + +## 快速开始 + +### 部署安装 + +**推荐方法**:分别复制.class文件和jar文件到对应位置 + +```bash +# 1. 复制插件动态库 +cp /path/to/yourplugindirpath/libthai_ftparser.so /path/to/observer/plugin_dir/ + +# 2. 创建java目录结构(如果不存在) +mkdir -p /path/to/observer/java/lib + +# 3. 复制Lucene依赖库 +cp java/lib/lucene-core-8.11.2.jar /path/to/observer/java/lib/ +cp java/lib/lucene-analyzers-common-8.11.2.jar /path/to/observer/java/lib/ + +# 4. 复制泰语分词器类文件 +cp java/ThaiSegmenter.class /path/to/observer/java/ + +# 5. 安装Java环境 +yum install java-1.8.0-openjdk-devel -y + +# 6. 
启动 Observer 并加载插件 +# 连接数据库 +obclient -h127.0.0.1 -P2881 -uroot@sys -pdifyai123456 # 这里以dify的数据库连接信息为例 + +# 在sys租户中设置插件加载 +ALTER SYSTEM SET plugins_load='libthai_ftparser.so:on'; + +# 重启Observer生效 +killall observer +cd /path/to/observer +./bin/observer # 在observer工作目录执行启动observer + +# 验证安装(见下文) +``` + +**多插件共存说明**: +- 如果已安装其他语言分词插件,只需复制泰语分词器的.class文件 +- `lucene-core-8.11.2.jar` 和 `lucene-analyzers-common-8.11.2.jar` 被所有插件共享 +- 泰语分词器使用ThaiAnalyzer,无需额外的语言专用jar包 +- 文件已存在时,cp命令会询问是否覆盖,可选择跳过 + + +> 📖 **详细插件使用说明**:参考 [OceanBase Plugin Development Kit 用户手册](https://oceanbase.github.io/oceanbase-plugin-dev-kit/user-guide/) + +### 依赖寻找优先级 + +插件按以下优先级自动寻找Java依赖: + +1. **环境变量**(最高优先级) + ```bash + export OCEANBASE_PARSER_CLASSPATH="/custom/path/lucene-core-8.11.2.jar:/custom/path/lucene-analyzers-common-8.11.2.jar:/custom/path" + ``` + +2. **Observer 工作目录**(推荐) + ``` + ${OB_WORKDIR}/java/lib/lucene-core-8.11.2.jar + ${OB_WORKDIR}/java/lib/lucene-analyzers-common-8.11.2.jar + ${OB_WORKDIR}/java/ThaiSegmenter.class + ``` + +3. 
** 插件相对路径**(开发环境) + ``` + ./java/lib/lucene-*.jar + ``` + +** 建议**:使用方式2(复制java目录),无需配置OCEANBASE_PARSER_CLASSPATH,快速体验 + +### 验证安装 + +```sql +-- 检查插件是否加载成功 +SELECT * FROM oceanbase.GV$OB_PLUGINS WHERE NAME = 'thai_ftparser'; + +-- 创建测试表(注意shell的字符集编码用UTF-8) +CREATE TABLE t_thai ( + c1 INT, + c2 VARCHAR(200), + c3 TEXT, + FULLTEXT INDEX (c2, c3) WITH PARSER thai_ftparser +); + +-- 插入泰语测试数据 +INSERT INTO t_thai (c1, c2, c3) VALUES +(1, 'สวัสดีครับ', 'สวัสดีครับ ยินดีต้อนรับสู่เว็บไซต์ของเรา'), +(2, 'ขอบคุณครับ', 'ขอบคุณที่เข้ามาเยี่ยมชม'), +(3, 'สอบถาม', 'หากมีคำถามใดๆ กรุณาติดต่อสอบถาม'), +(4, 'ขอบคุณมากครับ', 'ขอบคุณมากครับที่ใช้บริการของเรา'), +(5, 'ยินดีต้อนรับ', 'ยินดีต้อนรับสู่ OceanBase'), +(6, 'สวัสดีครับ', 'สวัสดีครับ ดีใจที่ได้พบกันอีกครั้ง'), +(7, 'เป็นอย่างไรบ้าง', 'เป็นอย่างไรบ้างช่วงนี้'), +(8, 'ไม่มีปัญหา', 'ไม่มีปัญหาใดๆ เลย'), +(9, 'แบบฟอร์มกรอกข้อมูล', 'กรุณากรอกข้อมูลให้ครบถ้วน'), +(10, 'ขอบคุณมาก', 'ขอบคุณมาก หวังว่าจะได้พบกันอีก'), +(11, 'ฐานข้อมูล', 'ระบบจัดการฐานข้อมูล OceanBase'), +(12, 'การประมวลผลภาษา', 'การประมวลผลภาษาธรรมชาติภาษาไทย'), +(13, 'วิทยาการคอมพิวเตอร์', 'วิทยาการคอมพิวเตอร์และวิศวกรรมซอฟต์แวร์'), +(14, 'การเรียนรู้ของเครื่อง', 'การเรียนรู้ของเครื่องและปัญญาประดิษฐ์'), +(15, 'การพัฒนาซอฟต์แวร์', 'วิธีการพัฒนาซอฟต์แวร์'); + +-- 测试分词功能 +SELECT TOKENIZE('ระบบจัดการฐานข้อมูล','thai_ftparser', '[{"output": "all"}]'); + +-- 测试 1:基础词汇搜索(预计返回 c1 = 1, 6) +SELECT * FROM t_thai +WHERE MATCH(c2, c3) AGAINST('สวัสดี' IN NATURAL LANGUAGE MODE); + +-- 测试 2:感谢用语搜索(预计返回 c1 = 2, 4, 10) +SELECT * FROM t_thai +WHERE MATCH(c2, c3) AGAINST('ขอบคุณ' IN NATURAL LANGUAGE MODE); + +-- 测试 3:技术词汇搜索(预计返回 c1 = 11) +SELECT * FROM t_thai +WHERE MATCH(c2, c3) AGAINST('ฐานข้อมูล' IN NATURAL LANGUAGE MODE); + +-- 测试 4:复合技术词汇(预计返回 c1 = 12) +SELECT * FROM t_thai +WHERE MATCH(c2, c3) AGAINST('ประมวลผล' IN NATURAL LANGUAGE MODE); + +-- 测试 5:学科名称(预计返回 c1 = 13) +SELECT * FROM t_thai +WHERE MATCH(c2, c3) AGAINST('วิทยาการคอมพิวเตอร์' IN NATURAL LANGUAGE MODE); + +-- 测试 
6:人工智能相关(预计返回 c1 = 14) +SELECT * FROM t_thai +WHERE MATCH(c2, c3) AGAINST('ปัญญาประดิษฐ์' IN NATURAL LANGUAGE MODE); + +-- 测试 7:软件开发(预计返回 c1 = 15) +SELECT * FROM t_thai +WHERE MATCH(c2, c3) AGAINST('พัฒนาซอฟต์แวร์' IN NATURAL LANGUAGE MODE); + +-- 测试 8:多词组合搜索 +SELECT * FROM t_thai +WHERE MATCH(c2, c3) AGAINST('สวัสดี ยินดีต้อนรับ' IN NATURAL LANGUAGE MODE); + +-- 测试 9:问候语搜索 +SELECT * FROM t_thai +WHERE MATCH(c2, c3) AGAINST('ยินดีต้อนรับ' IN NATURAL LANGUAGE MODE); + +-- 测试 10:系统相关词汇 +SELECT * FROM t_thai +WHERE MATCH(c2, c3) AGAINST('ระบบ' IN NATURAL LANGUAGE MODE); +``` + +## 技术特性 + +### 泰语分词特点 + +泰语分词器基于 Apache Lucene ThaiAnalyzer: + +``` +输入: "การประมวลผลภาษาธรรมชาติ" +输出: ["ประมวล", "ภาษา", "ธรรมชาติ"] + +特点: +- 智能分词: 自动识别泰语词汇边界 +- 停用词过滤: 过滤常见的功能词 +- UTF-8支持: 完整支持泰语字符 +``` + +### 停用词处理 + +泰语分词器包含智能停用词过滤: + +- **过滤对象**: 常见介词、连词、助词 +- **保留内容**: 实质性词汇、专业术语 +- **优化搜索**: 提高搜索相关性 + +### 与其他语言对比 + +| 语言 | 分词特点 | 停用词处理 | +|------|----------|-----------| +| **泰语** | 基础分词 + 停用词 | ✅ 智能过滤 | +| 日语 | BaseForm + 停用词 | ✅ 词干统一 | +| 韩语 | MIXED复合词 | ❌ 保留完整 | + +**泰语分词器在简洁性和有效性之间取得最佳平衡**。 diff --git a/thai_ftparser/java/ThaiSegmenter.class b/thai_ftparser/java/ThaiSegmenter.class new file mode 100644 index 0000000..8b3b133 Binary files /dev/null and b/thai_ftparser/java/ThaiSegmenter.class differ diff --git a/thai_ftparser/java/ThaiSegmenter.java b/thai_ftparser/java/ThaiSegmenter.java new file mode 100644 index 0000000..4edbff0 --- /dev/null +++ b/thai_ftparser/java/ThaiSegmenter.java @@ -0,0 +1,83 @@ +import java.io.StringReader; +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; + +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.th.ThaiAnalyzer; +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; + +/** + * Thai Segmenter using static method to reduce memory usage + */ +public class ThaiSegmenter { + // 简单直接的静态初始化 - 只创建一次,绝对保证 + private static final 
ThaiAnalyzer STATIC_ANALYZER = createAnalyzer(); + + /** + * 创建分析器 - 只在类加载时调用一次 + */ + private static ThaiAnalyzer createAnalyzer() { + try { + System.out.println("ThaiSegmenter: Creating static analyzer (once per JVM)"); + return new ThaiAnalyzer(); + } catch (Exception e) { + System.err.println("Failed to initialize ThaiSegmenter: " + e.getMessage()); + e.printStackTrace(); + throw new RuntimeException("Cannot initialize ThaiAnalyzer", e); + } + } + + /** + * Segment Thai text into tokens + * @param text The input text to segment + * @return Array of segmented tokens + */ + public static String[] segment(String text) { + if (text == null || text.trim().isEmpty()) { + return new String[0]; + } + + List tokens = new ArrayList<>(); + + try (TokenStream tokenStream = STATIC_ANALYZER.tokenStream("content", new StringReader(text))) { + CharTermAttribute termAttr = tokenStream.addAttribute(CharTermAttribute.class); + + tokenStream.reset(); + while (tokenStream.incrementToken()) { + String token = termAttr.toString(); + tokens.add(token); + } + tokenStream.end(); + + } catch (IOException e) { + System.err.println("Error during Thai tokenization: " + e.getMessage()); + return new String[0]; + } + + return tokens.toArray(new String[0]); + } + + /** + * 检查静态分析器是否已初始化 (总是返回true,因为如果初始化失败会抛异常) + */ + public static boolean isStaticInitialized() { + return STATIC_ANALYZER != null; + } + + public static void main(String[] args) { + // Test cases + String[] result1 = ThaiSegmenter.segment("Hello world"); + System.out.println("English: " + java.util.Arrays.toString(result1)); + + String[] result2 = ThaiSegmenter.segment("สวัสดีครับ"); + System.out.println("Thai: " + java.util.Arrays.toString(result2)); + + String[] result3 = ThaiSegmenter.segment("Hello สวัสดี world"); + System.out.println("Mixed: " + java.util.Arrays.toString(result3)); + + String[] result4 = ThaiSegmenter.segment("ฐานข้อมูล OceanBase เป็นระบบจัดการฐานข้อมูล"); + System.out.println("Complex: " + 
java.util.Arrays.toString(result4)); + } +} \ No newline at end of file diff --git a/thai_ftparser/java/lib/lucene-analyzers-common-8.11.2.jar b/thai_ftparser/java/lib/lucene-analyzers-common-8.11.2.jar new file mode 100644 index 0000000..d0f85d2 Binary files /dev/null and b/thai_ftparser/java/lib/lucene-analyzers-common-8.11.2.jar differ diff --git a/thai_ftparser/java/lib/lucene-core-8.11.2.jar b/thai_ftparser/java/lib/lucene-core-8.11.2.jar new file mode 100644 index 0000000..f91d691 Binary files /dev/null and b/thai_ftparser/java/lib/lucene-core-8.11.2.jar differ diff --git a/thai_ftparser/thai_ftparser_main.cpp b/thai_ftparser/thai_ftparser_main.cpp new file mode 100644 index 0000000..dd6a9e8 --- /dev/null +++ b/thai_ftparser/thai_ftparser_main.cpp @@ -0,0 +1,53 @@ +/** + * Copyright (c) 2023 OceanBase + * Thai Fulltext Parser Plugin - Main Entry Point + */ + +#include "thai_jni_bridge.h" + +extern "C" { + +/** + * Plugin initialization function + * @param plugin Plugin parameter pointer + * @return OBP_SUCCESS on success, error code on failure + */ +int plugin_init_thai(ObPluginParamPtr plugin) +{ + int ret = OBP_SUCCESS; + + if (0 == plugin) { + ret = OBP_INVALID_ARGUMENT; + return ret; + } + + // Define the ftparser plugin descriptor + ObPluginFTParser parser = { + .init = thai_ftparser_init, + .deinit = thai_ftparser_deinit, + .scan_begin = thai_ftparser_scan_begin, + .scan_end = thai_ftparser_scan_end, + .next_token = thai_ftparser_next_token, + .get_add_word_flag = thai_ftparser_get_add_word_flag + }; + + // Register the ftparser plugin with OceanBase + ret = OBP_REGISTER_FTPARSER(plugin, + "thai_ftparser", + parser, + "Thai language fulltext parser"); + + return ret; +} + +// Plugin declaration +OBP_DECLARE_PLUGIN(thai_ftparser) +{ + OBP_AUTHOR_OCEANBASE, // Plugin author + OBP_MAKE_VERSION(0, 0, 1), // Plugin version + OBP_LICENSE_MULAN_PSL_V2, // Plugin license + plugin_init_thai, // Plugin initialization function + nullptr, // Plugin deinitialization 
function (not needed) +} OBP_DECLARE_PLUGIN_END; + +} // extern "C" diff --git a/thai_ftparser/thai_jni_bridge.cpp b/thai_ftparser/thai_jni_bridge.cpp new file mode 100644 index 0000000..c8524ed --- /dev/null +++ b/thai_ftparser/thai_jni_bridge.cpp @@ -0,0 +1,367 @@ +/** + * Copyright (c) 2023 OceanBase + * Thai Fulltext Parser Plugin + */ + +#include "thai_jni_bridge.h" +#include +#include +#include +#include +#include + + + +using namespace oceanbase::jni; + +namespace oceanbase { +namespace thai_ftparser { + +// Configuration implementation +ThaiJNIBridgeConfig::ThaiJNIBridgeConfig() + : segmenter_class_name("ThaiSegmenter") + , segment_method_name("segment") { + // JVM configurations are now managed by JNIConfigUtils in common library +} + +// ThaiJNIBridge implementation +ThaiJNIBridge::ThaiJNIBridge() + : plugin_name_("thai_ftparser") + , is_initialized_(false) + , segmenter_class_(nullptr) + , segment_method_(nullptr) { + clear_error(); +} + +ThaiJNIBridge::~ThaiJNIBridge() { + if (is_initialized_) { + // Unregister from global JVM manager + oceanbase::jni::GlobalJVMManager::unregister_plugin(plugin_name_); + is_initialized_ = false; + } +} + +int ThaiJNIBridge::initialize() { + std::lock_guard lock(bridge_mutex_); + + if (is_initialized_) { + return OBP_SUCCESS; + } + + clear_error(); + + // Register with global JVM manager + oceanbase::jni::GlobalJVMManager::register_plugin(plugin_name_); + + // Create scoped JNI environment using unified configuration from common library + oceanbase::jni::ScopedJNIEnvironment jni_env(plugin_name_); + + if (!jni_env) { + set_error(OBP_PLUGIN_ERROR, "Failed to acquire JNI environment for Thai parser initialization"); + oceanbase::jni::GlobalJVMManager::unregister_plugin(plugin_name_); + return OBP_PLUGIN_ERROR; + } + + // Load Java classes and cache method IDs + int ret = load_java_classes(jni_env.get()); + if (ret != OBP_SUCCESS) { + oceanbase::jni::GlobalJVMManager::unregister_plugin(plugin_name_); + return ret; + } + + 
is_initialized_ = true; + OBP_LOG_INFO("Thai JNI Bridge initialized successfully"); + return OBP_SUCCESS; +} + +int ThaiJNIBridge::segment(const std::string& text, std::vector& tokens) { + if (!is_initialized_) { + set_error(OBP_PLUGIN_ERROR, "Thai JNI Bridge not initialized"); + return OBP_PLUGIN_ERROR; + } + + // Create scoped JNI environment for segmentation + oceanbase::jni::ScopedJNIEnvironment jni_env(plugin_name_); + + if (!jni_env) { + set_error(OBP_PLUGIN_ERROR, "Failed to acquire JNI environment for Thai segmentation"); + return OBP_PLUGIN_ERROR; + } + + return do_segment(jni_env.get(), text, tokens); +} + +int ThaiJNIBridge::load_java_classes(JNIEnv* env) { + std::string error_msg; + + // Load Thai segmenter class + segmenter_class_ = env->FindClass(config_.segmenter_class_name.c_str()); + if (!segmenter_class_ || oceanbase::jni::JNIUtils::check_and_handle_exception(env, error_msg)) { + set_error(OBP_PLUGIN_ERROR, "Failed to find Thai segmenter class '" + + config_.segmenter_class_name + "': " + error_msg); + return OBP_PLUGIN_ERROR; + } + + // Make it a global reference to prevent GC + segmenter_class_ = (jclass)env->NewGlobalRef(segmenter_class_); + if (!segmenter_class_) { + set_error(OBP_PLUGIN_ERROR, "Failed to create global reference for Thai segmenter class"); + return OBP_PLUGIN_ERROR; + } + + // OPTIMIZED: Get static segment method ID (no constructor needed) + segment_method_ = env->GetStaticMethodID(segmenter_class_, config_.segment_method_name.c_str(), + "(Ljava/lang/String;)[Ljava/lang/String;"); + if (!segment_method_ || oceanbase::jni::JNIUtils::check_and_handle_exception(env, error_msg)) { + set_error(OBP_PLUGIN_ERROR, "Failed to find Thai segment method '" + + config_.segment_method_name + "': " + error_msg); + return OBP_PLUGIN_ERROR; + } + + OBP_LOG_INFO("Thai Java classes loaded successfully"); + return OBP_SUCCESS; +} + +int ThaiJNIBridge::do_segment(JNIEnv* env, const std::string& text, std::vector& tokens) { + std::string error_msg; 
+ tokens.clear(); + + // Convert C++ string to Java string + jstring jtext = oceanbase::jni::JNIUtils::cpp_string_to_jstring(env, text); + if (!jtext) { + set_error(OBP_PLUGIN_ERROR, "Failed to convert text to Java string for Thai segmentation"); + return OBP_PLUGIN_ERROR; + } + + // OPTIMIZED: Call static segment method (no instance creation needed) + jobjectArray jresult = (jobjectArray)env->CallStaticObjectMethod( + segmenter_class_, segment_method_, jtext); + if (oceanbase::jni::JNIUtils::check_and_handle_exception(env, error_msg)) { + set_error(OBP_PLUGIN_ERROR, "Thai segmentation failed: " + error_msg); + // Note: jresult is nullptr when exception occurs, no need to clean it + env->DeleteLocalRef(jtext); + return OBP_PLUGIN_ERROR; + } + + if (!jresult) { + set_error(OBP_PLUGIN_ERROR, "Thai segmentation returned null result"); + env->DeleteLocalRef(jtext); + return OBP_PLUGIN_ERROR; + } + + // Convert result to C++ vector (JNIUtils handles its own Frame management) + int ret = oceanbase::jni::JNIUtils::jstring_array_to_cpp_vector(env, jresult, tokens); + if (ret != OBP_SUCCESS) { + set_error(OBP_PLUGIN_ERROR, "Failed to convert Thai segmentation result to C++ vector"); + env->DeleteLocalRef(jtext); + env->DeleteLocalRef(jresult); + return ret; + } + + // Clean up our local references + env->DeleteLocalRef(jtext); + env->DeleteLocalRef(jresult); + + // Debug output removed for production use + + return OBP_SUCCESS; +} + +void ThaiJNIBridge::set_error(int code, const std::string& message) { + last_error_code_ = code; + last_error_message_ = message; + // Error logging removed - let upper layer handle error output +} + +void ThaiJNIBridge::clear_error() { + last_error_code_ = OBP_SUCCESS; + last_error_message_.clear(); +} + +// ThaiJNIBridgeManager implementation +ThaiJNIBridgeManager& ThaiJNIBridgeManager::get_instance() { + static ThaiJNIBridgeManager instance; + return instance; +} + +std::shared_ptr ThaiJNIBridgeManager::get_bridge() { + std::lock_guard 
lock(mutex_); + if (!bridge_) { + bridge_ = std::make_shared(); + } + return bridge_; +} + +int ThaiJNIBridgeManager::initialize() { + auto bridge = get_bridge(); + return bridge ? bridge->initialize() : OBP_PLUGIN_ERROR; +} + +// Plugin parser structure +struct ThaiParserState { + std::vector tokens; + size_t current_token_index; + + ThaiParserState() : current_token_index(0) {} +}; + +} // namespace thai_ftparser +} // namespace oceanbase + +// Plugin interface implementation +extern "C" { + +int thai_ftparser_init(ObPluginParamPtr param) { + if (!param) { + return OBP_INVALID_ARGUMENT; + } + + // Don't initialize JVM here - do it lazily on first use (scan_begin) + // This avoids issues with classpath when Observer is starting up + OBP_LOG_INFO("Thai FTParser plugin registered (JVM will be initialized on first use)"); + return OBP_SUCCESS; +} + +int thai_ftparser_deinit(ObPluginParamPtr param) { + if (!param) { + return OBP_INVALID_ARGUMENT; + } + + OBP_LOG_INFO("Thai FTParser plugin deinitialized"); + return OBP_SUCCESS; +} + +int thai_ftparser_scan_begin(ObPluginFTParserParamPtr param) { + if (!param) { + return OBP_INVALID_ARGUMENT; + } + + // Lazy initialization: initialize JVM on first actual use + auto& manager = oceanbase::thai_ftparser::ThaiJNIBridgeManager::get_instance(); + int ret = manager.initialize(); + if (ret != OBP_SUCCESS) { + OBP_LOG_WARN("Failed to initialize Thai JNI bridge on first use (error_code: %d)", ret); + return ret; + } + + // Create parser instance + oceanbase::thai_ftparser::ThaiParserState* tp = new (std::nothrow) oceanbase::thai_ftparser::ThaiParserState(); + if (!tp) { + return OBP_ALLOCATE_MEMORY_FAILED; + } + + // Get text to parse + const char* fulltext = obp_ftparser_fulltext(param); + int64_t fulltext_len = obp_ftparser_fulltext_length(param); + + if (!fulltext || fulltext_len <= 0) { + delete tp; + return OBP_INVALID_ARGUMENT; + } + + std::string text(fulltext, fulltext_len); + + // Perform segmentation + auto bridge = 
manager.get_bridge(); + if (!bridge) { + delete tp; + return OBP_PLUGIN_ERROR; + } + + ret = bridge->segment(text, tp->tokens); + if (ret != OBP_SUCCESS) { + OBP_LOG_WARN("Thai segmentation failed: %s (error_code: %d)", + bridge->get_last_error_message().c_str(), bridge->get_last_error_code()); + delete tp; + return ret; + } + + // Store parser state + obp_ftparser_set_user_data(param, tp); + + OBP_LOG_INFO("Thai segmentation completed: %zu tokens extracted from %zu characters", + tp->tokens.size(), text.length()); + return OBP_SUCCESS; +} + +int thai_ftparser_scan_end(ObPluginFTParserParamPtr param) { + if (!param) { + return OBP_INVALID_ARGUMENT; + } + + oceanbase::thai_ftparser::ThaiParserState* tp = (oceanbase::thai_ftparser::ThaiParserState*)obp_ftparser_user_data(param); + if (tp) { + delete tp; + obp_ftparser_set_user_data(param, nullptr); + } + + return OBP_SUCCESS; +} + +int thai_ftparser_next_token(ObPluginFTParserParamPtr param, char **word, int64_t *word_len, int64_t *char_cnt, int64_t *word_freq) { + if (!param || !word || !word_len || !char_cnt || !word_freq) { + return OBP_INVALID_ARGUMENT; + } + + oceanbase::thai_ftparser::ThaiParserState* tp = (oceanbase::thai_ftparser::ThaiParserState*)obp_ftparser_user_data(param); + if (!tp) { + return OBP_PLUGIN_ERROR; + } + + if (tp->current_token_index >= tp->tokens.size()) { + return OBP_ITER_END; + } + + const std::string& token = tp->tokens[tp->current_token_index++]; + + // Set word properties + *word = const_cast(token.c_str()); + *word_len = token.length(); + + // Calculate character count (UTF-8 character count, not byte count) + // Use the same algorithm as the original plugin + int64_t char_count = 0; + const char* p = token.c_str(); + const char* end = p + token.length(); + + while (p < end) { + unsigned char c = *p; + + if ((c & 0x80) == 0) { + p += 1; // ASCII character + } else if ((c & 0xE0) == 0xC0) { + p += 2; // 2-byte UTF-8 character + } else if ((c & 0xF0) == 0xE0) { + p += 3; // 3-byte 
UTF-8 character (most CJK characters) + } else if ((c & 0xF8) == 0xF0) { + p += 4; // 4-byte UTF-8 character + } else { + p += 1; // Invalid UTF-8, skip + continue; // Don't count invalid characters + } + + char_count++; + } + + *char_cnt = char_count; // Set character count + *word_freq = 1; // Set word frequency to 1 + + return OBP_SUCCESS; +} + +int thai_ftparser_get_add_word_flag(uint64_t *flag) { + if (!flag) { + return OBP_INVALID_ARGUMENT; + } + + // Set flags for Thai language processing (same as original plugin) + // Disable problematic filters that may reject Thai characters + *flag = OBP_FTPARSER_AWF_CASEDOWN // Convert to lowercase (safe for Thai) + | OBP_FTPARSER_AWF_GROUPBY_WORD; // Group identical words (safe for Thai) + // Disabled: OBP_FTPARSER_AWF_MIN_MAX_WORD (may filter Thai chars by length) + // Disabled: OBP_FTPARSER_AWF_STOPWORD (may use inappropriate stopword list) + + return OBP_SUCCESS; +} + +} // extern "C" diff --git a/thai_ftparser/thai_jni_bridge.h b/thai_ftparser/thai_jni_bridge.h new file mode 100644 index 0000000..c8c479b --- /dev/null +++ b/thai_ftparser/thai_jni_bridge.h @@ -0,0 +1,142 @@ +/** + * Copyright (c) 2023 OceanBase + * Thai Fulltext Parser Plugin + */ + +#pragma once + +#include "oceanbase/ob_plugin_ftparser.h" +#include "jni_manager.h" // Unified JNI management library +#include +#include +#include + +namespace oceanbase { +namespace thai_ftparser { + +/** + * Thai JNI Bridge Configuration + * @brief Plugin-specific configuration for Thai text segmentation + * @details JVM-level configurations are now managed by JNIConfigUtils in common library + */ +struct ThaiJNIBridgeConfig { + // Plugin-specific configurations only + std::string segmenter_class_name; + std::string segment_method_name; + + ThaiJNIBridgeConfig(); +}; + +/** + * Thai JNI Bridge for Thai Segmentation + * @brief Thai parser using the common JNI library + * @details This class uses oceanbase::jni::ScopedJNIEnvironment for + * automatic JVM and thread 
management, eliminating the need for + * complex manual resource management. + */ +class ThaiJNIBridge { +private: + ThaiJNIBridgeConfig config_; + std::string plugin_name_; + bool is_initialized_; + std::mutex bridge_mutex_; + + // Java class and method references (cached for performance) + jclass segmenter_class_; + jmethodID segment_method_; // Static method + + // Error handling + int last_error_code_; + std::string last_error_message_; + +public: + ThaiJNIBridge(); + + /** + * Destructor - automatically unregisters from global JVM manager + */ + ~ThaiJNIBridge(); + + /** + * Initialize the JNI bridge + * @return OBP_SUCCESS on success, error code on failure + */ + int initialize(); + + /** + * Segment Thai text using Lucene Thai analyzer + * @param text Input text to segment + * @param tokens Output vector to store segmented tokens + * @return OBP_SUCCESS on success, error code on failure + */ + int segment(const std::string& text, std::vector& tokens); + + // Error handling + int get_last_error_code() const { return last_error_code_; } + const std::string& get_last_error_message() const { return last_error_message_; } + +private: + /** + * Load Java classes and cache method IDs + */ + int load_java_classes(JNIEnv* env); + + /** + * Perform actual segmentation using JNI + */ + int do_segment(JNIEnv* env, const std::string& text, std::vector& tokens); + + // Error handling helpers + void set_error(int code, const std::string& message); + void clear_error(); + + // Disable copy and move + ThaiJNIBridge(const ThaiJNIBridge&) = delete; + ThaiJNIBridge& operator=(const ThaiJNIBridge&) = delete; +}; + +/** + * Thai JNI Bridge Manager (Singleton) + * @brief Manages a global Thai JNI bridge instance + */ +class ThaiJNIBridgeManager { +public: + /** + * Get the singleton instance + */ + static ThaiJNIBridgeManager& get_instance(); + + /** + * Get the shared bridge instance + */ + std::shared_ptr get_bridge(); + + /** + * Initialize the bridge (lazy initialization) + */ 
// Plugin interface functions (declarations only)
// C linkage so the names are not C++-mangled.
extern "C" {
    // Plugin-level init hook.
    int thai_ftparser_init(ObPluginParamPtr param);
    // Plugin-level deinit hook.
    int thai_ftparser_deinit(ObPluginParamPtr param);
    // Starts a scan over the parameter's full text.
    int thai_ftparser_scan_begin(ObPluginFTParserParamPtr param);
    // Ends the scan started by thai_ftparser_scan_begin.
    int thai_ftparser_scan_end(ObPluginFTParserParamPtr param);
    // Outputs the next token: pointer, byte length, character count and frequency.
    int thai_ftparser_next_token(ObPluginFTParserParamPtr param, char **word, int64_t *word_len, int64_t *char_cnt, int64_t *word_freq);
    // Outputs the add-word flags for this parser.
    int thai_ftparser_get_add_word_flag(uint64_t *flag);
}