From ac79266dbeec90ad50b85d03f746032b2650e17b Mon Sep 17 00:00:00 2001
From: PPPoint <1024879159@qq.com>
Date: Fri, 30 Jan 2026 20:00:19 +0800
Subject: [PATCH 1/3] =?UTF-8?q?Finish=20T1-1-11:=20gcd=E3=80=81select=5Fsc?=
 =?UTF-8?q?atter=E3=80=81nll=5Floss=E3=80=81glu=E3=80=81gt?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 include/infinicore/ops/gcd.hpp                |  18 +++
 include/infinicore/ops/glu.hpp                |  18 +++
 include/infinicore/ops/gt.hpp                 |  18 +++
 include/infinicore/ops/nll_loss.hpp           |  19 +++
 include/infinicore/ops/select_scatter.hpp     |  18 +++
 python/infinicore/__init__.py                 |   6 +
 python/infinicore/nn/functional/__init__.py   |   4 +
 python/infinicore/nn/functional/glu.py        |  10 ++
 python/infinicore/nn/functional/nll_loss.py   |  39 ++++++
 python/infinicore/ops/gcd.py                  |  15 +++
 python/infinicore/ops/gt.py                   |  16 +++
 python/infinicore/ops/select_scatter.py       |   9 ++
 src/infinicore/ops/gcd/gcd.cc                 |  26 ++++
 src/infinicore/ops/gcd/gcd_cpu.cc             | 107 ++++++++++++++++
 src/infinicore/ops/glu/glu.cc                 |  31 +++++
 src/infinicore/ops/glu/glu_cpu.cc             | 108 ++++++++++++++++
 src/infinicore/ops/gt/gt.cc                   |  21 +++
 src/infinicore/ops/gt/gt_cpu.cc               |  71 +++++++++++
 src/infinicore/ops/nll_loss/nll_loss.cc       |  26 ++++
 src/infinicore/ops/nll_loss/nll_loss_cpu.cc   | 120 ++++++++++++++++++
 .../ops/select_scatter/select_scatter.cc      |  26 ++++
 .../ops/select_scatter/select_scatter_cpu.cc  | 115 +++++++++++++++++
 src/infinicore/pybind11/ops.hpp               |  10 ++
 src/infinicore/pybind11/ops/gcd.hpp           |  21 +++
 src/infinicore/pybind11/ops/glu.hpp           |  21 +++
 src/infinicore/pybind11/ops/gt.hpp            |  19 +++
 src/infinicore/pybind11/ops/nll_loss.hpp      |  29 +++++
 .../pybind11/ops/select_scatter.hpp           |  18 +++
 test/infinicore/ops/gcd.py                    |   6 +-
 test/infinicore/ops/glu.py                    |   6 +-
 test/infinicore/ops/gt.py                     |   6 +-
 test/infinicore/ops/nll_loss.py               |   6 +-
 test/infinicore/ops/select_scatter.py         |   6 +-
 33 files changed, 974 insertions(+), 15 deletions(-)
 create mode 100644 include/infinicore/ops/gcd.hpp
 create mode 100644 include/infinicore/ops/glu.hpp
 create mode 100644 include/infinicore/ops/gt.hpp
 create mode 100644 include/infinicore/ops/nll_loss.hpp
 create mode 100644 include/infinicore/ops/select_scatter.hpp
 create mode 100644 python/infinicore/nn/functional/glu.py
 create mode 100644 python/infinicore/nn/functional/nll_loss.py
 create mode 100644 python/infinicore/ops/gcd.py
 create mode 100644 python/infinicore/ops/gt.py
 create mode 100644 python/infinicore/ops/select_scatter.py
 create mode 100644 src/infinicore/ops/gcd/gcd.cc
 create mode 100644 src/infinicore/ops/gcd/gcd_cpu.cc
 create mode 100644 src/infinicore/ops/glu/glu.cc
 create mode 100644 src/infinicore/ops/glu/glu_cpu.cc
 create mode 100644 src/infinicore/ops/gt/gt.cc
 create mode 100644 src/infinicore/ops/gt/gt_cpu.cc
 create mode 100644 src/infinicore/ops/nll_loss/nll_loss.cc
 create mode 100644 src/infinicore/ops/nll_loss/nll_loss_cpu.cc
 create mode 100644 src/infinicore/ops/select_scatter/select_scatter.cc
 create mode 100644 src/infinicore/ops/select_scatter/select_scatter_cpu.cc
 create mode 100644 src/infinicore/pybind11/ops/gcd.hpp
 create mode 100644 src/infinicore/pybind11/ops/glu.hpp
 create mode 100644 src/infinicore/pybind11/ops/gt.hpp
 create mode 100644 src/infinicore/pybind11/ops/nll_loss.hpp
 create mode 100644 src/infinicore/pybind11/ops/select_scatter.hpp

diff --git a/include/infinicore/ops/gcd.hpp b/include/infinicore/ops/gcd.hpp
new file mode 100644
index 000000000..21608a0ee
--- /dev/null
+++ b/include/infinicore/ops/gcd.hpp
@@ -0,0 +1,18 @@
+#pragma once
+
+#include "../device.hpp"
+#include "common/op.hpp"
+
+namespace infinicore::op {
+
+class Gcd { // Dispatch entry point for the element-wise GCD operator.
+public:
+    using schema = void (*)(Tensor, Tensor, Tensor); // backend kernel signature: (input, other, output)
+    static void execute(Tensor input, Tensor other, Tensor output); // runs the kernel registered for input's device type
+    static common::OpDispatcher<schema> &dispatcher(); // per-device kernel registry
+};
+
+Tensor gcd(Tensor input, Tensor other); // allocating form: the result has input's shape, dtype and device
+void gcd_(Tensor input, Tensor other, Tensor output); // writes the result into the caller-provided output
+
+} // namespace infinicore::op
\ No newline at end of file
diff --git a/include/infinicore/ops/glu.hpp b/include/infinicore/ops/glu.hpp
new file mode 100644
index 000000000..167b0beb2
--- /dev/null
+++ b/include/infinicore/ops/glu.hpp
@@ -0,0 +1,18 @@
+#pragma once
+
+#include "../device.hpp"
+#include "common/op.hpp"
+
+namespace infinicore::op {
+
+class Glu { // Dispatch entry point for the GLU operator.
+public:
+    using schema = void (*)(Tensor, Tensor, int); // backend kernel signature: (input, output, dim)
+    static void execute(Tensor input, Tensor output, int dim); // runs the kernel registered for input's device type
+    static common::OpDispatcher<schema> &dispatcher(); // per-device kernel registry
+};
+
+Tensor glu(Tensor input, int dim); // allocating form: a negative dim is offset by ndim; that dimension is halved in the result
+void glu_(Tensor input, Tensor output, int dim); // writes the result into the caller-provided output
+
+} // namespace infinicore::op
\ No newline at end of file
diff --git a/include/infinicore/ops/gt.hpp b/include/infinicore/ops/gt.hpp
new file mode 100644
index 000000000..a04692eeb
--- /dev/null
+++ b/include/infinicore/ops/gt.hpp
@@ -0,0 +1,18 @@
+#pragma once
+
+#include "../device.hpp"
+#include "common/op.hpp"
+
+namespace infinicore::op {
+
+class Gt { // Dispatch entry point for the element-wise greater-than comparison.
+public:
+    using schema = void (*)(Tensor, Tensor, Tensor); // backend kernel signature: (input, other, output)
+    static void execute(Tensor input, Tensor other, Tensor output); // runs the kernel registered for input's device type
+    static common::OpDispatcher<schema> &dispatcher(); // per-device kernel registry
+};
+
+Tensor gt(Tensor input, Tensor other); // allocating form: the result is a BOOL tensor shaped like input
+void gt_(Tensor input, Tensor other, Tensor output); // writes the result into the caller-provided output
+
+} // namespace infinicore::op
\ No newline at end of file
diff --git a/include/infinicore/ops/nll_loss.hpp b/include/infinicore/ops/nll_loss.hpp
new file mode 100644
index 000000000..8feba68cb
--- /dev/null
+++ b/include/infinicore/ops/nll_loss.hpp
@@ -0,0 +1,19 @@
+#pragma once
+
+#include "../device.hpp"
+#include "common/op.hpp"
+#include <optional>
+
+namespace infinicore::op {
+
+class NLLLoss { // Dispatch entry point for the negative log-likelihood loss.
+public:
+    using schema = void (*)(Tensor, Tensor, std::optional<Tensor>, Tensor, int64_t); // (input, target, weight, output, ignore_index)
+    static void execute(Tensor input, Tensor target, std::optional<Tensor> weight, Tensor output, int64_t ignore_index); // runs the kernel registered for input's device type
+    static common::OpDispatcher<schema> &dispatcher(); // per-device kernel registry
+};
+
+Tensor nll_loss(Tensor input, Tensor target, std::optional<Tensor> weight, int64_t ignore_index); // allocating form: output is created with an empty shape and input's dtype
+void nll_loss_(Tensor input, Tensor target, std::optional<Tensor> weight, Tensor output, int64_t ignore_index); // writes the result into the caller-provided output
+
+} // namespace infinicore::op
\ No newline at end of file
diff --git a/include/infinicore/ops/select_scatter.hpp b/include/infinicore/ops/select_scatter.hpp
new file mode 100644
index 000000000..2e1348f14
--- /dev/null
+++ b/include/infinicore/ops/select_scatter.hpp
@@ -0,0 +1,18 @@
+#pragma once
+
+#include "../device.hpp"
+#include "common/op.hpp"
+
+namespace infinicore::op {
+
+class SelectScatter { // Dispatch entry point for the select_scatter operator.
+public:
+    using schema = void (*)(Tensor, Tensor, int64_t, int64_t, Tensor); // backend kernel signature: (input, src, dim, index, output)
+    static void execute(Tensor input, Tensor src, int64_t dim, int64_t index, Tensor output); // runs the kernel registered for input's device type
+    static common::OpDispatcher<schema> &dispatcher(); // per-device kernel registry
+};
+
+Tensor select_scatter(Tensor input, Tensor src, int64_t dim, int64_t index); // allocating form: the result has input's shape, dtype and device
+void select_scatter_(Tensor input, Tensor src, int64_t dim, int64_t index, Tensor output); // writes the result into the caller-provided output
+
+} // namespace infinicore::op
\ No newline at end of file
diff --git a/python/infinicore/__init__.py b/python/infinicore/__init__.py
index c6b01d5aa..16b64ded3 100644
--- a/python/infinicore/__init__.py
+++ b/python/infinicore/__init__.py
@@ -54,6 +54,9 @@
 from infinicore.ops.rearrange import rearrange
 from infinicore.ops.squeeze import squeeze
 from infinicore.ops.unsqueeze import unsqueeze
+from infinicore.ops.gcd import gcd
+from infinicore.ops.gt import gt
+from infinicore.ops.select_scatter import select_scatter
 from infinicore.tensor import (
     Tensor,
     empty,
@@ -134,6 +137,9 @@
     "strided_empty",
     "strided_from_blob",
     "zeros",
+    "gcd",
+    "select_scatter",
+    "gt",
 ]
 
 use_ntops = False
diff --git a/python/infinicore/nn/functional/__init__.py b/python/infinicore/nn/functional/__init__.py
index 255079790..18908ad1b 100644
--- a/python/infinicore/nn/functional/__init__.py
+++ b/python/infinicore/nn/functional/__init__.py
@@ -6,6 +6,8 @@
 from .rope import RopeAlgo, rope
 from .silu import silu
 from .swiglu import swiglu
+from .nll_loss import nll_loss
+from .glu import glu
 
 __all__ = [
     "causal_softmax",
@@ -17,4 +19,6 @@
     "embedding",
     "rope",
     "RopeAlgo",
+    "nll_loss",
+    "glu",
 ]
diff --git a/python/infinicore/nn/functional/glu.py b/python/infinicore/nn/functional/glu.py
new file mode 100644
index 000000000..e9126ad35
--- /dev/null
+++ b/python/infinicore/nn/functional/glu.py
@@ -0,0 +1,10 @@
+import infinicore
+from infinicore.lib import _infinicore
+from infinicore.tensor import Tensor
+
+def glu(input: Tensor, dim: int = -1) -> Tensor:
+    """Apply the gated linear unit along ``dim`` and return the result as a new Tensor."""
+    if infinicore.use_ntops and input.device.type in ("cuda", "musa"):
+        return infinicore.ntops.torch.glu(input, dim)  # ntops implementation for cuda/musa devices
+
+    return Tensor(_infinicore.glu(input._underlying, dim))  # native binding path
\ No newline at end of file
diff --git a/python/infinicore/nn/functional/nll_loss.py b/python/infinicore/nn/functional/nll_loss.py
new file mode 100644
index 000000000..4c731472b
--- /dev/null
+++ b/python/infinicore/nn/functional/nll_loss.py
@@ -0,0 +1,39 @@
+import infinicore
+from infinicore.lib import _infinicore
+from infinicore.tensor import Tensor
+
+def nll_loss(
+    input: Tensor,
+    target: Tensor,
+    weight: Tensor | None = None,
+    ignore_index: int = -100,
+    reduction: str = "mean",
+    *,
+    out=None,
+) -> Tensor:
+    """Negative log-likelihood loss over per-class scores.
+
+    ``weight`` optionally rescales each class, and samples whose target equals
+    ``ignore_index`` are skipped. When ``out`` is given, the native path writes
+    the result into it and returns ``out``.
+
+    Raises:
+        NotImplementedError: the native path was asked for a reduction other
+            than ``"mean"`` (the native binding takes no reduction argument).
+    """
+    if infinicore.use_ntops and input.device.type in ("cuda", "musa"):
+        return infinicore.ntops.torch.nll_loss(
+            input, target, weight=weight, ignore_index=ignore_index, reduction=reduction
+        )
+
+    # The native binding always computes the weighted mean, so refuse any other
+    # reduction instead of silently returning the mean for "sum"/"none".
+    if reduction != "mean":
+        raise NotImplementedError(f"nll_loss: reduction={reduction!r} is not supported")
+
+    weight_underlying = weight._underlying if weight is not None else None
+    args = (input._underlying, target._underlying, weight_underlying)
+    if out is None:
+        return Tensor(_infinicore.nll_loss(*args, ignore_index))
+    _infinicore.nll_loss_(*args, out._underlying, ignore_index)
+    return out
\ No newline at end of file
diff --git a/python/infinicore/ops/gcd.py b/python/infinicore/ops/gcd.py
new file mode 100644
index 000000000..defc96a52
--- /dev/null
+++ b/python/infinicore/ops/gcd.py
@@ -0,0 +1,15 @@
+import infinicore
+from infinicore.lib import _infinicore
+from infinicore.tensor import Tensor
+
+def gcd(input: Tensor, other: Tensor, *, out=None) -> Tensor:
+    r"""Computes the element-wise greatest common divisor (GCD)."""
+    # Use the ntops implementation on cuda/musa devices when it is enabled.
+    if infinicore.use_ntops and input.device.type in ("cuda", "musa"):
+        return infinicore.ntops.torch.gcd(input, other, out=out)
+
+    if out is None:
+        return Tensor(_infinicore.gcd(input._underlying, other._underlying))
+    # Otherwise the binding writes into the caller-provided tensor, which is returned as-is.
+    _infinicore.gcd_(input._underlying, other._underlying, out._underlying)
+    return out
\ No newline at end of file
diff --git a/python/infinicore/ops/gt.py b/python/infinicore/ops/gt.py
new file mode 100644
index 000000000..f347df279
--- /dev/null
+++ b/python/infinicore/ops/gt.py
@@ -0,0 +1,16 @@
+import infinicore
+from infinicore.lib import _infinicore
+from infinicore.tensor import Tensor
+
+def gt(input: Tensor, other: Tensor | float, *, out: Tensor | None = None) -> Tensor:
+    if infinicore.use_ntops and input.device.type in ("cuda", "musa"):
+        return infinicore.ntops.torch.gt(input, other, out=out)
+    # Python scalars become a tensor like input. NOTE(review): Tensor.full is not defined in this patch - confirm it exists.
+    if isinstance(other, (int, float)):
+        other = Tensor.full(input.shape, other, dtype=input.dtype, device=input.device)
+    # Allocate a fresh result unless the caller supplied `out`.
+    if out is None:
+        return Tensor(_infinicore.gt(input._underlying, other._underlying))
+    # Out-variant: the binding writes into out, which is returned as-is.
+    _infinicore.gt_(input._underlying, other._underlying, out._underlying)
+    return out
\ No newline at end of file
diff --git a/python/infinicore/ops/select_scatter.py b/python/infinicore/ops/select_scatter.py
new file mode 100644
index 000000000..31af019df
--- /dev/null
+++ b/python/infinicore/ops/select_scatter.py
@@ -0,0 +1,9 @@
+import infinicore
+from infinicore.lib import _infinicore
+from infinicore.tensor import Tensor
+
+def select_scatter(input: Tensor, src: Tensor, dim: int, index: int) -> Tensor:
+    if infinicore.use_ntops and input.device.type in ("cuda", "musa"):
+        return infinicore.ntops.torch.select_scatter(input, src, dim, index)  # ntops implementation for cuda/musa
+    # Native binding path; the returned handle is wrapped in a new Tensor.
+    return Tensor(_infinicore.select_scatter(input._underlying, src._underlying, dim, index))
\ No newline at end of file
diff --git a/src/infinicore/ops/gcd/gcd.cc b/src/infinicore/ops/gcd/gcd.cc
new file mode 100644
index 000000000..b2e152d2b
--- /dev/null
+++ b/src/infinicore/ops/gcd/gcd.cc
@@ -0,0 +1,26 @@
+#include "infinicore/ops/gcd.hpp"
+#include "../../utils.hpp"
+
+namespace infinicore::op {
+// Registry of GCD kernels keyed by device type.
+common::OpDispatcher<Gcd::schema> &Gcd::dispatcher() {
+    static common::OpDispatcher<Gcd::schema> dispatcher_;
+    return dispatcher_;
+}
+// Calls context::setDevice with input's device, then runs the kernel registered for that device type.
+void Gcd::execute(Tensor input, Tensor other, Tensor output) {
+    infinicore::context::setDevice(input->device());
+    dispatcher().lookup(input->device().getType())(input, other, output);
+}
+// Allocating form: the result has input's shape, dtype and device.
+Tensor gcd(Tensor input, Tensor other) {
+    auto output = Tensor::empty(input->shape(), input->dtype(), input->device());
+    gcd_(input, other, output);
+    return output;
+}
+// Out-variant: the result is written into the caller-provided output tensor.
+void gcd_(Tensor input, Tensor other, Tensor output) {
+    Gcd::execute(input, other, output);
+}
+
+} // namespace infinicore::op
\ No newline at end of file
diff --git a/src/infinicore/ops/gcd/gcd_cpu.cc b/src/infinicore/ops/gcd/gcd_cpu.cc
new file mode 100644
index 000000000..ed39adf32
--- /dev/null
+++ b/src/infinicore/ops/gcd/gcd_cpu.cc
@@ -0,0 +1,107 @@
+#include "../../../utils.h"
+#include "infinicore/device.hpp"
+#include "infinicore/ops/gcd.hpp"
+#include <cmath>
+#include <numeric>
+#include <omp.h>
+#include <vector>
+
+namespace infinicore::op::gcd_impl::cpu {
+
+template <typename T>
+T compute_gcd(T a, T b) { // greatest common divisor of |a| and |b|
+    return std::gcd(a, b); // std::gcd already works on absolute values; std::abs was redundant and ill-formed for unsigned T
+}
+
+template <typename T>
+void gcd_contiguous(const T *input_ptr, const T *other_ptr, T *output_ptr, size_t numel) { // element-wise GCD over numel consecutive elements
+#pragma omp parallel for
+    for (size_t i = 0; i < numel; ++i) {
+        auto a = utils::cast<int64_t>(input_ptr[i]); // operands are converted to int64_t before the GCD
+        auto b = utils::cast<int64_t>(other_ptr[i]);
+        output_ptr[i] = utils::cast<T>(compute_gcd(a, b)); // result converted back to the element type T
+    }
+}
+
+template <typename T>
+void gcd_strided(const T *input_base, const T *other_base, T *output_base,
+                 const std::vector<size_t> &shape,
+                 const std::vector<int64_t> &input_strides,
+                 const std::vector<int64_t> &other_strides,
+                 const std::vector<int64_t> &output_strides) {
+    // Element-wise GCD for strided tensors; strides are applied to typed pointers, i.e. counted in elements.
+    size_t numel = 1;
+    for (auto s : shape) {
+        numel *= s;
+    }
+    int ndim = shape.size();
+    // Each linear index i is unravelled into per-dimension coordinates, last dimension fastest.
+#pragma omp parallel for
+    for (size_t i = 0; i < numel; ++i) {
+        size_t temp_idx = i;
+        size_t input_offset = 0;
+        size_t other_offset = 0;
+        size_t output_offset = 0;
+        // Peel coordinates off from the last dimension and accumulate each tensor's own offset.
+        for (int d = ndim - 1; d >= 0; --d) {
+            size_t coord = temp_idx % shape[d];
+            temp_idx /= shape[d];
+
+            input_offset += coord * input_strides[d];
+            other_offset += coord * other_strides[d];
+            output_offset += coord * output_strides[d];
+        }
+        // NOTE(review): signed strides are accumulated into size_t offsets - confirm negative strides never reach here.
+        auto a = utils::cast<int64_t>(input_base[input_offset]);
+        auto b = utils::cast<int64_t>(other_base[other_offset]);
+
+        output_base[output_offset] = utils::cast<T>(compute_gcd(a, b));
+    }
+}
+
+void calculate(Tensor input, Tensor other, Tensor output) { // CPU backend for Gcd: validates, then picks the typed kernel
+    if (input->shape() != other->shape() || input->shape() != output->shape()) {
+        throw std::runtime_error("GCD CPU implementation requires all tensors to have the same shape.");
+    }
+    auto dtype = input->dtype(); // every buffer below is reinterpreted with this element type
+    if (other->dtype() != dtype || output->dtype() != dtype) { // a mismatch would read or write past the smaller buffer
+        throw std::runtime_error("GCD CPU implementation requires all tensors to have the same dtype.");
+    }
+    bool all_contiguous = input->is_contiguous() && other->is_contiguous() && output->is_contiguous(); // enables the flat path
+    if (dtype == DataType::I64) {
+        if (all_contiguous) {
+            gcd_contiguous<int64_t>(
+                reinterpret_cast<int64_t *>(input->data()),
+                reinterpret_cast<int64_t *>(other->data()),
+                reinterpret_cast<int64_t *>(output->data()), input->numel());
+        } else {
+            gcd_strided<int64_t>(
+                reinterpret_cast<int64_t *>(input->data()),
+                reinterpret_cast<int64_t *>(other->data()),
+                reinterpret_cast<int64_t *>(output->data()),
+                input->shape(), input->strides(), other->strides(), output->strides());
+        }
+    } else if (dtype == DataType::I32) {
+        if (all_contiguous) {
+            gcd_contiguous<int32_t>(
+                reinterpret_cast<int32_t *>(input->data()),
+                reinterpret_cast<int32_t *>(other->data()),
+                reinterpret_cast<int32_t *>(output->data()), input->numel());
+        } else {
+            gcd_strided<int32_t>(
+                reinterpret_cast<int32_t *>(input->data()),
+                reinterpret_cast<int32_t *>(other->data()),
+                reinterpret_cast<int32_t *>(output->data()),
+                input->shape(), input->strides(), other->strides(), output->strides());
+        }
+    } else {
+        throw std::runtime_error("GCD only supports I32 and I64 on CPU.");
+    }
+}
+
+static bool registered = []() { // immediately-invoked lambda: runs when this static is initialised
+    Gcd::dispatcher().registerDevice(Device::Type::CPU, &calculate); // make calculate the CPU kernel for Gcd
+    return true;
+}();
+
+} // namespace infinicore::op::gcd_impl::cpu
\ No newline at end of file
diff --git a/src/infinicore/ops/glu/glu.cc b/src/infinicore/ops/glu/glu.cc
new file mode 100644
index 000000000..e0d6e1179
--- /dev/null
+++ b/src/infinicore/ops/glu/glu.cc
@@ -0,0 +1,31 @@
+#include "infinicore/ops/glu.hpp"
+#include "../../utils.hpp"
+
+namespace infinicore::op {
+// Registry of GLU kernels keyed by device type.
+common::OpDispatcher<Glu::schema> &Glu::dispatcher() {
+    static common::OpDispatcher<Glu::schema> dispatcher_;
+    return dispatcher_;
+}
+// Calls context::setDevice with input's device, then runs the kernel registered for that device type.
+void Glu::execute(Tensor input, Tensor output, int dim) {
+    infinicore::context::setDevice(input->device());
+    dispatcher().lookup(input->device().getType())(input, output, dim);
+}
+// Allocating form: the output matches input except that dimension `dim` is halved; throws std::out_of_range for an invalid dim.
+Tensor glu(Tensor input, int dim) {
+    if (dim < 0) {
+        dim += input->ndim();
+    }
+    auto out_shape = input->shape();
+    out_shape.at(dim) /= 2; // at() rejects a dim that is still negative or >= ndim instead of writing out of bounds
+    auto output = Tensor::empty(out_shape, input->dtype(), input->device());
+    glu_(input, output, dim);
+    return output;
+}
+// Out-variant: the result is written into the caller-provided output tensor.
+void glu_(Tensor input, Tensor output, int dim) {
+    Glu::execute(input, output, dim);
+}
+
+} // namespace infinicore::op
\ No newline at end of file
diff --git a/src/infinicore/ops/glu/glu_cpu.cc b/src/infinicore/ops/glu/glu_cpu.cc
new file mode 100644
index 000000000..efa136d33
--- /dev/null
+++ b/src/infinicore/ops/glu/glu_cpu.cc
@@ -0,0 +1,108 @@
+#include "../../../utils.h"
+#include "infinicore/device.hpp"
+#include "infinicore/ops/glu.hpp"
+#include <cmath>
+#include <omp.h>
+
+namespace infinicore::op::glu_impl::cpu {
+
+template <typename T>
+inline T sigmoid(T x) { // logistic function 1 / (1 + e^-x), evaluated in float
+    float val = utils::cast<float>(x);
+    float res = 1.0f / (1.0f + std::exp(-val));
+    return utils::cast<T>(res); // converted back to the element type T
+}
+
+template <typename T>
+void calculate_glu_cpu(Tensor input, Tensor output, int dim) {
+    // GLU along `dim`: out[..., k, ...] = a * sigmoid(b), where a = in[..., k, ...] and
+    // b = in[..., k + split_size, ...]. Offsets are built from strides, so neither tensor
+    // has to be contiguous.
+    // NOTE(review): output is assumed to match input in every dimension except `dim` - not validated here.
+    auto in_shape = input->shape();
+    auto in_strides = input->strides();
+    auto out_shape = output->shape();
+    auto out_strides = output->strides();
+    int ndim = input->ndim();
+
+    if (dim < 0) {
+        dim += ndim;
+    }
+
+    // M = number of index combinations before `dim`, N = number after it.
+    size_t M = 1;
+    for (int i = 0; i < dim; ++i) {
+        M *= in_shape[i];
+    }
+    size_t split_size = out_shape[dim];
+    size_t N = 1;
+    for (int i = dim + 1; i < ndim; ++i) {
+        N *= in_shape[i];
+    }
+
+    const T *in_ptr = reinterpret_cast<const T *>(input->data());
+    T *out_ptr = reinterpret_cast<T *>(output->data());
+
+#pragma omp parallel for collapse(2)
+    for (size_t i = 0; i < M; ++i) {
+        for (size_t j = 0; j < N; ++j) {
+            // The offsets contributed by the dimensions other than `dim` do not depend on k,
+            // so they are computed once per (i, j) pair instead of once per element.
+            size_t in_base = 0;
+            size_t out_base = 0;
+
+            // Dimensions before `dim` are unravelled from i.
+            size_t rem_in = i;
+            size_t rem_out = i;
+            for (int d = dim - 1; d >= 0; --d) {
+                in_base += (rem_in % in_shape[d]) * in_strides[d];
+                rem_in /= in_shape[d];
+                out_base += (rem_out % out_shape[d]) * out_strides[d];
+                rem_out /= out_shape[d];
+            }
+
+            // Dimensions after `dim` are unravelled from j.
+            rem_in = j;
+            rem_out = j;
+            for (int d = ndim - 1; d > dim; --d) {
+                in_base += (rem_in % in_shape[d]) * in_strides[d];
+                rem_in /= in_shape[d];
+                out_base += (rem_out % out_shape[d]) * out_strides[d];
+                rem_out /= out_shape[d];
+            }
+
+            for (size_t k = 0; k < split_size; ++k) {
+                size_t off_a = in_base + k * in_strides[dim];
+                size_t off_b = in_base + (k + split_size) * in_strides[dim];
+                size_t off_out = out_base + k * out_strides[dim];
+
+                // Gate in float and round to T only once: converting sigmoid(b) to T
+                // first would add a second rounding step for F16/BF16.
+                float val_a = utils::cast<float>(in_ptr[off_a]);
+                float val_b = utils::cast<float>(in_ptr[off_b]);
+                float gate = 1.0f / (1.0f + std::exp(-val_b));
+                out_ptr[off_out] = utils::cast<T>(val_a * gate);
+            }
+        }
+    }
+}
+
+void calculate(Tensor input, Tensor output, int dim) { // CPU backend for Glu: picks the element type from input's dtype
+    auto dtype = input->dtype();
+    if (dtype == DataType::F32) {
+        calculate_glu_cpu<float>(input, output, dim);
+    } else if (dtype == DataType::F16) {
+        calculate_glu_cpu<fp16_t>(input, output, dim);
+    } else if (dtype == DataType::BF16) {
+        calculate_glu_cpu<bf16_t>(input, output, dim);
+    } else {
+        throw std::runtime_error("GLU unsupported dtype: " + toString(dtype)); // only F32, F16 and BF16 are handled
+    }
+}
+
+static bool registered = []() { // immediately-invoked lambda: runs when this static is initialised
+    Glu::dispatcher().registerDevice(Device::Type::CPU, &calculate); // make calculate the CPU kernel for Glu
+    return true;
+}();
+
+} // namespace infinicore::op::glu_impl::cpu
\ No newline at end of file
diff --git a/src/infinicore/ops/gt/gt.cc b/src/infinicore/ops/gt/gt.cc
new file mode 100644
index 000000000..feec12ea1
--- /dev/null
+++ b/src/infinicore/ops/gt/gt.cc
@@ -0,0 +1,21 @@
+#include "infinicore/ops/gt.hpp"
+#include "../../utils.hpp"
+
+namespace infinicore::op {
+common::OpDispatcher<Gt::schema> &Gt::dispatcher() { // registry of GT kernels keyed by device type
+    static common::OpDispatcher<Gt::schema> dispatcher_;
+    return dispatcher_;
+}
+void Gt::execute(Tensor input, Tensor other, Tensor output) { // dispatches on input's device type
+    infinicore::context::setDevice(input->device());
+    dispatcher().lookup(input->device().getType())(input, other, output);
+}
+Tensor gt(Tensor input, Tensor other) { // allocating form
+    auto output = Tensor::empty(input->shape(), DataType::BOOL, input->device()); // result is BOOL, shaped like input
+    gt_(input, other, output);
+    return output;
+}
+void gt_(Tensor input, Tensor other, Tensor output) { // out-variant: writes into the caller-provided output
+    Gt::execute(input, other, output);
+}
+} // namespace infinicore::op
\ No newline at end of file
diff --git a/src/infinicore/ops/gt/gt_cpu.cc b/src/infinicore/ops/gt/gt_cpu.cc
new file mode 100644
index 000000000..ffecf9476
--- /dev/null
+++ b/src/infinicore/ops/gt/gt_cpu.cc
@@ -0,0 +1,71 @@
+#include "../../../utils.h"
+#include "infinicore/device.hpp"
+#include "infinicore/ops/gt.hpp"
+#include <omp.h>
+
+namespace infinicore::op::gt_impl::cpu {
+
+template <typename T>
+void calculate_gt_cpu(Tensor input, Tensor other, Tensor output) { // element-wise input > other over strided tensors of element type T
+    auto in_ptr = reinterpret_cast<const T *>(input->data());
+    auto other_ptr = reinterpret_cast<const T *>(other->data());
+    auto out_base = output->data();
+    auto shape = input->shape();
+    auto in_strides = input->strides();
+    auto other_strides = other->strides();
+    auto out_strides = output->strides();
+    auto out_dtype = output->dtype();
+    int ndim = input->ndim();
+    size_t numel = input->numel();
+    // Validate before the parallel loop: mismatched shapes index out of bounds, and an unsupported output dtype used to write nothing.
+    if (shape != other->shape() || shape != output->shape()) {
+        throw std::runtime_error("GT CPU implementation requires all tensors to have the same shape.");
+    }
+    if (out_dtype != DataType::BOOL && out_dtype != DataType::F32 && out_dtype != DataType::F16 && out_dtype != DataType::I32) {
+        throw std::runtime_error("GT unsupported output dtype");
+    }
+#pragma omp parallel for
+    for (size_t i = 0; i < numel; ++i) {
+        size_t temp_idx = i, in_off = 0, other_off = 0, out_off = 0;
+        for (int d = ndim - 1; d >= 0; --d) { // unravel i into coordinates, last dimension fastest
+            size_t coord = temp_idx % shape[d];
+            temp_idx /= shape[d];
+            in_off += coord * in_strides[d];
+            other_off += coord * other_strides[d];
+            out_off += coord * out_strides[d];
+        }
+        // Compare in double: float cannot represent every int32_t, so e.g. 16777217 > 16777216 evaluated to false.
+        bool result = utils::cast<double>(in_ptr[in_off]) > utils::cast<double>(other_ptr[other_off]);
+        if (out_dtype == DataType::BOOL) {
+            *(reinterpret_cast<bool *>(out_base + out_off)) = result;
+        } else if (out_dtype == DataType::F32) {
+            *(reinterpret_cast<float *>(out_base + out_off * sizeof(float))) = result ? 1.0f : 0.0f;
+        } else if (out_dtype == DataType::F16) {
+            *(reinterpret_cast<fp16_t *>(out_base + out_off * sizeof(fp16_t))) = utils::cast<fp16_t>(result ? 1.0f : 0.0f);
+        } else if (out_dtype == DataType::I32) {
+            *(reinterpret_cast<int32_t *>(out_base + out_off * sizeof(int32_t))) = result ? 1 : 0;
+        }
+    }
+}
+
+void calculate(Tensor input, Tensor other, Tensor output) { // CPU backend for Gt: picks the element type from input's dtype
+    auto dtype = input->dtype(); // NOTE(review): other's dtype is not checked; it is read with input's element type
+    if (dtype == DataType::F32) {
+        calculate_gt_cpu<float>(input, other, output);
+    } else if (dtype == DataType::F16) {
+        calculate_gt_cpu<fp16_t>(input, other, output);
+    } else if (dtype == DataType::BF16) {
+        calculate_gt_cpu<bf16_t>(input, other, output);
+    } else if (dtype == DataType::I32) {
+        calculate_gt_cpu<int32_t>(input, other, output);
+    } else {
+        throw std::runtime_error("GT unsupported dtype"); // only F32, F16, BF16 and I32 inputs are handled
+    }
+}
+
+static bool registered = []() { // immediately-invoked lambda: runs when this static is initialised
+    Gt::dispatcher().registerDevice(Device::Type::CPU, &calculate); // make calculate the CPU kernel for Gt
+    return true;
+}();
+
+} // namespace infinicore::op::gt_impl::cpu
\ No newline at end of file
diff --git a/src/infinicore/ops/nll_loss/nll_loss.cc b/src/infinicore/ops/nll_loss/nll_loss.cc
new file mode 100644
index 000000000..35c413b8c
--- /dev/null
+++ b/src/infinicore/ops/nll_loss/nll_loss.cc
@@ -0,0 +1,26 @@
+#include "infinicore/ops/nll_loss.hpp"
+#include "../../utils.hpp"
+
+namespace infinicore::op {
+// Registry of NLL-loss kernels keyed by device type.
+common::OpDispatcher<NLLLoss::schema> &NLLLoss::dispatcher() {
+    static common::OpDispatcher<NLLLoss::schema> dispatcher_;
+    return dispatcher_;
+}
+// Calls context::setDevice with input's device, then runs the kernel registered for that device type.
+void NLLLoss::execute(Tensor input, Tensor target, std::optional<Tensor> weight, Tensor output, int64_t ignore_index) {
+    infinicore::context::setDevice(input->device());
+    dispatcher().lookup(input->device().getType())(input, target, weight, output, ignore_index);
+}
+// Allocating form: the output is created with an empty shape, input's dtype and input's device.
+Tensor nll_loss(Tensor input, Tensor target, std::optional<Tensor> weight, int64_t ignore_index) {
+    auto output = Tensor::empty({}, input->dtype(), input->device());
+    nll_loss_(input, target, weight, output, ignore_index);
+    return output;
+}
+// Out-variant: the result is written into the caller-provided output tensor.
+void nll_loss_(Tensor input, Tensor target, std::optional<Tensor> weight, Tensor output, int64_t ignore_index) {
+    NLLLoss::execute(input, target, weight, output, ignore_index);
+}
+
+} // namespace infinicore::op
\ No newline at end of file
diff --git a/src/infinicore/ops/nll_loss/nll_loss_cpu.cc b/src/infinicore/ops/nll_loss/nll_loss_cpu.cc
new file mode 100644
index 000000000..23d81d0a3
--- /dev/null
+++ b/src/infinicore/ops/nll_loss/nll_loss_cpu.cc
@@ -0,0 +1,120 @@
+#include "../../../utils.h"
+#include "infinicore/device.hpp"
+#include "infinicore/ops/nll_loss.hpp"
+#include <cmath>
+#include <cstring>
+#include <omp.h>
+#include <optional>
+#include <vector>
+
+namespace infinicore::op::nll_loss_impl::cpu {
+
+inline float bf16_to_f32(uint16_t val) { // places the 16 input bits in the upper half of a 32-bit float pattern
+    const uint32_t widened = static_cast<uint32_t>(val) << 16; // lower 16 bits are zero
+    float out;
+    std::memcpy(&out, &widened, sizeof(out)); // bit-for-bit copy into the float
+    return out;
+}
+
+template <typename T, typename TargetT = int64_t>
+void nll_loss_kernel(const Tensor &input, const Tensor &target, std::optional<Tensor> weight, Tensor &output, int64_t ignore_index) {
+    // Weighted mean: sum(-w[t_i] * input[i, t_i]) / sum(w[t_i]) over the samples that are kept; 0 when none are kept.
+    const void *input_raw = input->data();
+    const TargetT *target_data = reinterpret_cast<const TargetT *>(target->data());
+    T *output_data = reinterpret_cast<T *>(output->data());
+    const void *weight_raw = nullptr;
+
+    if (weight.has_value() && weight.value()) {
+        weight_raw = weight.value()->data();
+    }
+    // input is indexed as (batch, class) and target/weight as 1-D; NOTE(review): ranks are not validated here.
+    auto input_strides = input->strides();
+    size_t batch_size = input->shape()[0];
+    size_t n_classes = input->shape()[1];
+
+    int64_t input_stride_n = input_strides[0];
+    int64_t input_stride_c = input_strides[1];
+    int64_t target_stride = target->strides()[0];
+    int64_t weight_stride = (weight.has_value() && weight.value()) ? weight.value()->strides()[0] : 0;
+
+    auto dtype = input->dtype();
+    double total_loss = 0.0;
+    double total_weight = 0.0;
+
+#pragma omp parallel for reduction(+ : total_loss, total_weight)
+    for (size_t i = 0; i < batch_size; ++i) {
+        TargetT t = target_data[i * target_stride];
+
+        if (t == ignore_index) {
+            continue;
+        }
+        // Targets outside [0, n_classes) are skipped as well rather than reported.
+        if (t < 0 || t >= static_cast<TargetT>(n_classes)) {
+            continue;
+        }
+
+        double w_val = 1.0;
+        if (weight_raw) {
+            if (dtype == DataType::BF16) {
+                const uint16_t *w_ptr = reinterpret_cast<const uint16_t *>(weight_raw);
+                w_val = static_cast<double>(bf16_to_f32(w_ptr[t * weight_stride]));
+            } else {
+                const T *w_ptr = reinterpret_cast<const T *>(weight_raw);
+                w_val = utils::cast<double>(w_ptr[t * weight_stride]);
+            }
+        }
+
+        size_t offset = i * input_stride_n + t * input_stride_c;
+        double logit_val = 0.0;
+
+        if (dtype == DataType::BF16) {
+            const uint16_t *in_ptr = reinterpret_cast<const uint16_t *>(input_raw);
+            logit_val = static_cast<double>(bf16_to_f32(in_ptr[offset]));
+        } else {
+            const T *in_ptr = reinterpret_cast<const T *>(input_raw);
+            logit_val = utils::cast<double>(in_ptr[offset]);
+        }
+
+        total_loss += (-logit_val * w_val);
+        total_weight += w_val;
+    }
+
+    if (total_weight > 0) {
+        float res_f = static_cast<float>(total_loss / total_weight);
+        if (dtype == DataType::BF16) {
+            uint32_t bits;
+            std::memcpy(&bits, &res_f, sizeof(bits));
+            bits = std::isnan(res_f) ? 0x7FC00000u : bits + 0x7FFFu + ((bits >> 16) & 1u); // round to nearest even (was truncation)
+            *reinterpret_cast<uint16_t *>(output_data) = static_cast<uint16_t>(bits >> 16);
+        } else {
+            *output_data = utils::cast<T>(res_f);
+        }
+    } else {
+        if (dtype == DataType::BF16) {
+            *reinterpret_cast<uint16_t *>(output_data) = 0;
+        } else {
+            *output_data = utils::cast<T>(0.0f);
+        }
+    }
+}
+
+void calculate(Tensor input, Tensor target, std::optional<Tensor> weight, Tensor output, int64_t ignore_index) { // CPU backend for NLLLoss
+    auto dtype = input->dtype();
+    // The element type is chosen from input's dtype; target is read with the kernel's default TargetT (int64_t).
+    if (dtype == DataType::F32) {
+        nll_loss_kernel<float>(input, target, weight, output, ignore_index);
+    } else if (dtype == DataType::F16) {
+        nll_loss_kernel<fp16_t>(input, target, weight, output, ignore_index);
+    } else if (dtype == DataType::BF16) {
+        nll_loss_kernel<uint16_t>(input, target, weight, output, ignore_index); // BF16 is handled as raw 16-bit words inside the kernel
+    } else {
+        throw std::runtime_error("Unsupported dtype for nll_loss");
+    }
+}
+
+static bool registered = []() { // immediately-invoked lambda: runs when this static is initialised
+    NLLLoss::dispatcher().registerDevice(Device::Type::CPU, &calculate); // make calculate the CPU kernel for NLLLoss
+    return true;
+}();
+
+} // namespace infinicore::op::nll_loss_impl::cpu
\ No newline at end of file
diff --git a/src/infinicore/ops/select_scatter/select_scatter.cc b/src/infinicore/ops/select_scatter/select_scatter.cc
new file mode 100644
index 000000000..02feb666f
--- /dev/null
+++ b/src/infinicore/ops/select_scatter/select_scatter.cc
@@ -0,0 +1,30 @@
+#include "infinicore/ops/select_scatter.hpp"
+#include "../../utils.hpp"
+
+namespace infinicore::op {
+
+// Returns the function-local static dispatcher that maps a device type to its select_scatter kernel.
+common::OpDispatcher<SelectScatter::schema> &SelectScatter::dispatcher() {
+    static common::OpDispatcher<SelectScatter::schema> dispatcher_;
+    return dispatcher_;
+}
+
+// Makes input's device current, then invokes the kernel registered for that device type.
+void SelectScatter::execute(Tensor input, Tensor src, int64_t dim, int64_t index, Tensor output) {
+    infinicore::context::setDevice(input->device());
+    dispatcher().lookup(input->device().getType())(input, src, dim, index, output);
+}
+
+// Allocates an output with input's shape, dtype and device, fills it via select_scatter_, and returns it.
+Tensor select_scatter(Tensor input, Tensor src, int64_t dim, int64_t index) {
+    auto output = Tensor::empty(input->shape(), input->dtype(), input->device());
+    select_scatter_(input, src, dim, index, output);
+    return output;
+}
+
+// Variant that writes the result into a caller-provided output tensor.
+void select_scatter_(Tensor input, Tensor src, int64_t dim, int64_t index, Tensor output) {
+    SelectScatter::execute(input, src, dim, index, output);
+}
+
+} // namespace infinicore::op
\ No newline at end of file
diff --git a/src/infinicore/ops/select_scatter/select_scatter_cpu.cc b/src/infinicore/ops/select_scatter/select_scatter_cpu.cc
new file mode 100644
index 000000000..97b8196e4
--- /dev/null
+++ b/src/infinicore/ops/select_scatter/select_scatter_cpu.cc
@@ -0,0 +1,158 @@
+#include "../../../utils.h"
+#include "infinicore/device.hpp"
+#include "infinicore/ops/select_scatter.hpp"
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+#include <omp.h>
+#include <stdexcept>
+#include <vector>
+
+namespace infinicore::op::select_scatter_impl::cpu {
+
+// Strided element-wise copy: writes src into every position described by dst_shape.
+// Strides are in elements. A source dimension of extent 1 is broadcast over a larger
+// destination dimension by using stride 0 for it. All four vectors must have
+// dst_shape.size() entries.
+template <typename T>
+void copy_kernel(T *dst_ptr, const std::vector<size_t> &dst_shape, const std::vector<int64_t> &dst_strides,
+                 const T *src_ptr, const std::vector<size_t> &src_shape, const std::vector<int64_t> &src_strides) {
+    const int ndim = static_cast<int>(dst_shape.size());
+
+    size_t numel = 1;
+    for (auto s : dst_shape) {
+        numel *= s;
+    }
+
+    std::vector<int64_t> effective_src_strides = src_strides;
+    for (int i = 0; i < ndim; ++i) {
+        if (src_shape[i] == 1 && dst_shape[i] > 1) {
+            effective_src_strides[i] = 0;
+        }
+    }
+
+    // Signed loop index: OpenMP 2.0 implementations reject unsigned induction variables.
+    const std::ptrdiff_t total = static_cast<std::ptrdiff_t>(numel);
+#pragma omp parallel for
+    for (std::ptrdiff_t i = 0; i < total; ++i) {
+        size_t remaining = static_cast<size_t>(i);
+        // Signed offsets so negative strides move backwards instead of wrapping.
+        int64_t dst_offset = 0;
+        int64_t src_offset = 0;
+
+        // Unravel the flat index into coordinates, innermost dimension first.
+        for (int d = ndim - 1; d >= 0; --d) {
+            const int64_t coord = static_cast<int64_t>(remaining % dst_shape[d]);
+            remaining /= dst_shape[d];
+
+            dst_offset += coord * dst_strides[d];
+            src_offset += coord * effective_src_strides[d];
+        }
+
+        dst_ptr[dst_offset] = src_ptr[src_offset];
+    }
+}
+
+// Runs copy_kernel with the element type that matches dtype.
+// Throws std::runtime_error for any dtype other than F32, F16 or BF16.
+static void copy_by_dtype(DataType dtype,
+                          void *dst, const std::vector<size_t> &dst_shape, const std::vector<int64_t> &dst_strides,
+                          const void *src, const std::vector<size_t> &src_shape, const std::vector<int64_t> &src_strides) {
+    if (dtype == DataType::F32) {
+        copy_kernel<float>(static_cast<float *>(dst), dst_shape, dst_strides,
+                           static_cast<const float *>(src), src_shape, src_strides);
+    } else if (dtype == DataType::F16) {
+        copy_kernel<fp16_t>(static_cast<fp16_t *>(dst), dst_shape, dst_strides,
+                            static_cast<const fp16_t *>(src), src_shape, src_strides);
+    } else if (dtype == DataType::BF16) {
+        copy_kernel<bf16_t>(static_cast<bf16_t *>(dst), dst_shape, dst_strides,
+                            static_cast<const bf16_t *>(src), src_shape, src_strides);
+    } else {
+        throw std::runtime_error("Unsupported dtype for select_scatter");
+    }
+}
+
+// CPU select_scatter: output becomes a copy of input whose slice at `index` along `dim`
+// is replaced by src. Negative dim/index count from the end. src may either omit `dim`
+// or keep it with extent 1; extent-1 src dimensions are broadcast.
+// Throws std::runtime_error on an out-of-range dim/index, an unsupported or mismatched
+// dtype, or a shape that does not fit.
+void calculate(Tensor input, Tensor src, int64_t dim, int64_t index, Tensor output) {
+    const int64_t ndim = static_cast<int64_t>(input->ndim());
+    if (ndim == 0) {
+        throw std::runtime_error("select_scatter: input must have at least one dimension");
+    }
+    if (dim < -ndim || dim >= ndim) {
+        throw std::runtime_error("select_scatter: dim is out of range");
+    }
+    if (dim < 0) {
+        dim += ndim;
+    }
+
+    const int64_t dim_size = static_cast<int64_t>(input->shape()[dim]);
+    if (index < -dim_size || index >= dim_size) {
+        throw std::runtime_error("select_scatter: index is out of range");
+    }
+    if (index < 0) {
+        index += dim_size;
+    }
+
+    // Reject bad dtypes before touching output so a failure never leaves it half-written.
+    const auto dtype = input->dtype();
+    const bool supported_dtype = dtype == DataType::F32 || dtype == DataType::F16 || dtype == DataType::BF16;
+    if (!supported_dtype) {
+        throw std::runtime_error("Unsupported dtype for select_scatter");
+    }
+    if (!(src->dtype() == dtype) || !(output->dtype() == dtype)) {
+        throw std::runtime_error("select_scatter: input, src and output must share one dtype");
+    }
+    if (output->shape() != input->shape()) {
+        throw std::runtime_error("select_scatter: output shape must match input shape");
+    }
+
+    // The destination slice has input's shape with `dim` collapsed to extent 1.
+    std::vector<size_t> slice_shape = input->shape();
+    slice_shape[dim] = 1;
+
+    // Give src the same rank as the slice by inserting a stride-0 unit dimension at `dim`.
+    std::vector<size_t> virtual_src_shape = src->shape();
+    std::vector<int64_t> virtual_src_strides = src->strides();
+    if (virtual_src_shape.size() + 1 == slice_shape.size()) {
+        virtual_src_shape.insert(virtual_src_shape.begin() + dim, 1);
+        virtual_src_strides.insert(virtual_src_strides.begin() + dim, 0);
+    } else if (virtual_src_shape.size() != slice_shape.size()) {
+        throw std::runtime_error("select_scatter: src rank does not match the selected slice");
+    }
+    for (size_t d = 0; d < slice_shape.size(); ++d) {
+        if (virtual_src_shape[d] != slice_shape[d] && virtual_src_shape[d] != 1) {
+            throw std::runtime_error("select_scatter: src shape does not match the selected slice");
+        }
+    }
+
+    // Step 1: output = input. Dense tensors take the memcpy fast path.
+    if (input->is_contiguous() && output->is_contiguous()) {
+        // memcpy must not be given overlapping ranges; identical buffers need no copy.
+        if (output->data() != input->data()) {
+            std::memcpy(output->data(), input->data(), input->numel() * input->element_size());
+        }
+    } else {
+        copy_by_dtype(dtype, output->data(), output->shape(), output->strides(),
+                      input->data(), input->shape(), input->strides());
+    }
+
+    // Step 2: overwrite the selected slice of output with src.
+    const std::vector<int64_t> slice_strides = output->strides();
+    const int64_t slice_offset_bytes = index * slice_strides[dim] * static_cast<int64_t>(output->element_size());
+    void *slice_data_ptr = reinterpret_cast<char *>(output->data()) + slice_offset_bytes;
+
+    copy_by_dtype(dtype, slice_data_ptr, slice_shape, slice_strides,
+                  src->data(), virtual_src_shape, virtual_src_strides);
+}
+
+// Registers the CPU kernel with the dispatcher during static initialization.
+static bool registered = []() {
+    SelectScatter::dispatcher().registerDevice(Device::Type::CPU, &calculate);
+    return true;
+}();
+
+} // namespace infinicore::op::select_scatter_impl::cpu
\ No newline at end of file
diff --git a/src/infinicore/pybind11/ops.hpp b/src/infinicore/pybind11/ops.hpp
index 3d6ebe79a..5cdd05637 100644
--- a/src/infinicore/pybind11/ops.hpp
+++ b/src/infinicore/pybind11/ops.hpp
@@ -7,9 +7,13 @@
 #include "ops/attention.hpp"
 #include "ops/causal_softmax.hpp"
 #include "ops/embedding.hpp"
+#include "ops/gcd.hpp"
+#include "ops/glu.hpp"
+#include "ops/gt.hpp"
 #include "ops/linear.hpp"
 #include "ops/matmul.hpp"
 #include "ops/mul.hpp"
+#include "ops/nll_loss.hpp"
 #include "ops/paged_attention.hpp"
 #include "ops/paged_attention_prefill.hpp"
 #include "ops/paged_caching.hpp"
@@ -17,6 +21,7 @@
 #include "ops/rearrange.hpp"
 #include "ops/rms_norm.hpp"
 #include "ops/rope.hpp"
+#include "ops/select_scatter.hpp"
 #include "ops/silu.hpp"
 #include "ops/swiglu.hpp"
 
@@ -42,6 +47,11 @@ inline void bind(py::module &m) {
     bind_swiglu(m);
     bind_rope(m);
     bind_embedding(m);
+    bind_gcd(m);
+    bind_select_scatter(m);
+    bind_nll_loss(m);
+    bind_glu(m);
+    bind_gt(m);
 }
 
 } // namespace infinicore::ops
diff --git a/src/infinicore/pybind11/ops/gcd.hpp b/src/infinicore/pybind11/ops/gcd.hpp
new file mode 100644
index 000000000..16bcfcaa1
--- /dev/null
+++ b/src/infinicore/pybind11/ops/gcd.hpp
@@ -0,0 +1,21 @@
+#pragma once
+#include "infinicore/ops/gcd.hpp"
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+namespace infinicore::ops {
+// Registers the gcd bindings on the given pybind11 module.
+inline void bind_gcd(py::module &m) {
+    m.def("gcd",
+          &op::gcd,
+          py::arg("input"),
+          py::arg("other"));
+    // gcd_ takes the same operands plus an explicit "output" tensor argument.
+    m.def("gcd_",
+          &op::gcd_,
+          py::arg("input"),
+          py::arg("other"),
+          py::arg("output"));
+}
+
+} // namespace infinicore::ops
\ No newline at end of file
diff --git a/src/infinicore/pybind11/ops/glu.hpp b/src/infinicore/pybind11/ops/glu.hpp
new file mode 100644
index 000000000..569ad7d6d
--- /dev/null
+++ b/src/infinicore/pybind11/ops/glu.hpp
@@ -0,0 +1,21 @@
+#pragma once
+#include "infinicore/ops/glu.hpp"
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+namespace infinicore::ops {
+// Registers the glu bindings on the given pybind11 module; "dim" defaults to -1 in both.
+inline void bind_glu(py::module &m) {
+    m.def("glu",
+          &op::glu,
+          py::arg("input"),
+          py::arg("dim") = -1);
+    // glu_ additionally takes an explicit "output" tensor ahead of "dim".
+    m.def("glu_",
+          &op::glu_,
+          py::arg("input"),
+          py::arg("output"),
+          py::arg("dim") = -1);
+}
+
+} // namespace infinicore::ops
\ No newline at end of file
diff --git a/src/infinicore/pybind11/ops/gt.hpp b/src/infinicore/pybind11/ops/gt.hpp
new file mode 100644
index 000000000..44c375c2f
--- /dev/null
+++ b/src/infinicore/pybind11/ops/gt.hpp
@@ -0,0 +1,19 @@
+#pragma once
+#include "infinicore/ops/gt.hpp"
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+namespace infinicore::ops {
+inline void bind_gt(py::module &m) { // registers gt and gt_ on the given pybind11 module
+    m.def("gt",
+          &op::gt,
+          py::arg("input"),
+          py::arg("other"));
+    // gt_ takes the same operands plus an explicit "output" tensor argument.
+    m.def("gt_",
+          &op::gt_,
+          py::arg("input"),
+          py::arg("other"),
+          py::arg("output"));
+}
+} // namespace infinicore::ops
\ No newline at end of file
diff --git a/src/infinicore/pybind11/ops/nll_loss.hpp b/src/infinicore/pybind11/ops/nll_loss.hpp
new file mode 100644
index 000000000..f31827dfe
--- /dev/null
+++ b/src/infinicore/pybind11/ops/nll_loss.hpp
@@ -0,0 +1,29 @@
+#pragma once
+
+#include "infinicore/ops/nll_loss.hpp"
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+namespace py = pybind11;
+namespace infinicore::ops {
+// Registers the nll_loss bindings on the given pybind11 module; "ignore_index" defaults to -100 in both.
+inline void bind_nll_loss(py::module &m) {
+    m.def("nll_loss",
+          &op::nll_loss,
+          py::arg("input"),
+          py::arg("target"),
+          py::arg("weight") = py::none(),
+          py::arg("ignore_index") = -100,
+          R"doc(Calculates NLL Loss.)doc");
+    // Unlike nll_loss, nll_loss_ declares no default for "weight", so callers must pass it explicitly.
+    m.def("nll_loss_",
+          &op::nll_loss_,
+          py::arg("input"),
+          py::arg("target"),
+          py::arg("weight"),
+          py::arg("output"),
+          py::arg("ignore_index") = -100,
+          R"doc(In-place NLL Loss calculation.)doc");
+}
+
+} // namespace infinicore::ops
\ No newline at end of file
diff --git a/src/infinicore/pybind11/ops/select_scatter.hpp b/src/infinicore/pybind11/ops/select_scatter.hpp
new file mode 100644
index 000000000..0d6aeb07e
--- /dev/null
+++ b/src/infinicore/pybind11/ops/select_scatter.hpp
@@ -0,0 +1,26 @@
+#pragma once
+#include "infinicore/ops/select_scatter.hpp"
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+namespace infinicore::ops {
+
+// Registers select_scatter and its explicit-output variant select_scatter_ on the given pybind11 module.
+inline void bind_select_scatter(py::module &m) {
+    m.def("select_scatter",
+          &op::select_scatter,
+          py::arg("input"),
+          py::arg("src"),
+          py::arg("dim"),
+          py::arg("index"));
+
+    m.def("select_scatter_",
+          &op::select_scatter_,
+          py::arg("input"),
+          py::arg("src"),
+          py::arg("dim"),
+          py::arg("index"),
+          py::arg("output"));
+}
+
+} // namespace infinicore::ops
\ No newline at end of file
diff --git a/test/infinicore/ops/gcd.py b/test/infinicore/ops/gcd.py
index 9484511f8..6ded9390f 100644
--- a/test/infinicore/ops/gcd.py
+++ b/test/infinicore/ops/gcd.py
@@ -92,9 +92,9 @@ def get_test_cases(self):
     def torch_operator(self, *args, **kwargs):
         return torch.gcd(*args, **kwargs)
 
-    # def infinicore_operator(self, *args, **kwargs):
-    #     """InfiniCore implementation (operator not yet available)."""
-    #     return infinicore.gcd(*args, **kwargs)
+    def infinicore_operator(self, *args, **kwargs):
+        """InfiniCore implementation: forwards all arguments to infinicore.gcd."""
+        return infinicore.gcd(*args, **kwargs)
 
 
 def main():
diff --git a/test/infinicore/ops/glu.py b/test/infinicore/ops/glu.py
index f53f3097d..292c68b4a 100644
--- a/test/infinicore/ops/glu.py
+++ b/test/infinicore/ops/glu.py
@@ -76,9 +76,9 @@ def get_test_cases(self):
     def torch_operator(self, *args, **kwargs):
         return torch.nn.functional.glu(*args, **kwargs)
 
-    # def infinicore_operator(self, *args, **kwargs):
-    #     """InfiniCore implementation (operator not yet available)."""
-    #     return infinicore.nn.functional.glu(*args, **kwargs)
+    def infinicore_operator(self, *args, **kwargs):
+        """InfiniCore implementation: forwards all arguments to infinicore.nn.functional.glu."""
+        return infinicore.nn.functional.glu(*args, **kwargs)
 
 
 def main():
diff --git a/test/infinicore/ops/gt.py b/test/infinicore/ops/gt.py
index b4809493e..959737bbf 100644
--- a/test/infinicore/ops/gt.py
+++ b/test/infinicore/ops/gt.py
@@ -111,9 +111,9 @@ def get_test_cases(self):
     def torch_operator(self, *args, **kwargs):
         return torch.gt(*args, **kwargs)
 
-    # def infinicore_operator(self, *args, **kwargs):
-    #     """InfiniCore implementation (operator not yet available)."""
-    #     return infinicore.gt(*args, **kwargs)
+    def infinicore_operator(self, *args, **kwargs):
+        """InfiniCore implementation: forwards all arguments to infinicore.gt."""
+        return infinicore.gt(*args, **kwargs)
 
 
 def main():
diff --git a/test/infinicore/ops/nll_loss.py b/test/infinicore/ops/nll_loss.py
index 03291ae4b..9cb705ddc 100644
--- a/test/infinicore/ops/nll_loss.py
+++ b/test/infinicore/ops/nll_loss.py
@@ -87,9 +87,9 @@ def get_test_cases(self):
     def torch_operator(self, *args, **kwargs):
         return torch.nn.functional.nll_loss(*args, **kwargs)
 
-    # def infinicore_operator(self, *args, **kwargs):
-    #     """InfiniCore implementation (operator not yet available)."""
-    #     return infinicore.nn.functional.nll_loss(*args, **kwargs)
+    def infinicore_operator(self, *args, **kwargs):
+        """InfiniCore implementation: forwards all arguments to infinicore.nn.functional.nll_loss."""
+        return infinicore.nn.functional.nll_loss(*args, **kwargs)
 
 
 def main():
diff --git a/test/infinicore/ops/select_scatter.py b/test/infinicore/ops/select_scatter.py
index 12375ce15..5118b590c 100644
--- a/test/infinicore/ops/select_scatter.py
+++ b/test/infinicore/ops/select_scatter.py
@@ -91,9 +91,9 @@ def get_test_cases(self):
     def torch_operator(self, *args, **kwargs):
         return torch.select_scatter(*args, **kwargs)
 
-    # def infinicore_operator(self, *args, **kwargs):
-    #     """InfiniCore implementation (operator not yet available)."""
-    #     return infinicore.select_scatter(*args, **kwargs)
+    def infinicore_operator(self, *args, **kwargs):
+        """InfiniCore implementation: forwards all arguments to infinicore.select_scatter."""
+        return infinicore.select_scatter(*args, **kwargs)
 
 
 def main():

From c2500041788077e374cb911386766fd05e3d5d5b Mon Sep 17 00:00:00 2001
From: PPPoint <1024879159@qq.com>
Date: Mon, 2 Feb 2026 14:43:16 +0800
Subject: [PATCH 2/3] format

---
 python/infinicore/__init__.py               | 6 +++---
 python/infinicore/nn/functional/__init__.py | 4 ++--
 python/infinicore/nn/functional/glu.py      | 1 +
 python/infinicore/nn/functional/nll_loss.py | 1 +
 python/infinicore/ops/gcd.py                | 1 +
 python/infinicore/ops/gt.py                 | 1 +
 python/infinicore/ops/select_scatter.py     | 1 +
 7 files changed, 10 insertions(+), 5 deletions(-)

diff --git a/python/infinicore/__init__.py b/python/infinicore/__init__.py
index 16b64ded3..0ec5b148c 100644
--- a/python/infinicore/__init__.py
+++ b/python/infinicore/__init__.py
@@ -45,6 +45,8 @@
 from infinicore.ops.add import add
 from infinicore.ops.add_rms_norm import add_rms_norm, add_rms_norm_
 from infinicore.ops.attention import attention
+from infinicore.ops.gcd import gcd
+from infinicore.ops.gt import gt
 from infinicore.ops.matmul import matmul
 from infinicore.ops.mul import mul
 from infinicore.ops.narrow import narrow
@@ -52,11 +54,9 @@
 from infinicore.ops.paged_attention_prefill import paged_attention_prefill
 from infinicore.ops.paged_caching import paged_caching
 from infinicore.ops.rearrange import rearrange
+from infinicore.ops.select_scatter import select_scatter
 from infinicore.ops.squeeze import squeeze
 from infinicore.ops.unsqueeze import unsqueeze
-from infinicore.ops.gcd import gcd
-from infinicore.ops.gt import gt
-from infinicore.ops.select_scatter import select_scatter
 from infinicore.tensor import (
     Tensor,
     empty,
diff --git a/python/infinicore/nn/functional/__init__.py b/python/infinicore/nn/functional/__init__.py
index 18908ad1b..1c71f285b 100644
--- a/python/infinicore/nn/functional/__init__.py
+++ b/python/infinicore/nn/functional/__init__.py
@@ -1,13 +1,13 @@
 from .causal_softmax import causal_softmax
 from .embedding import embedding
+from .glu import glu
 from .linear import linear
+from .nll_loss import nll_loss
 from .random_sample import random_sample
 from .rms_norm import rms_norm
 from .rope import RopeAlgo, rope
 from .silu import silu
 from .swiglu import swiglu
-from .nll_loss import nll_loss
-from .glu import glu
 
 __all__ = [
     "causal_softmax",
diff --git a/python/infinicore/nn/functional/glu.py b/python/infinicore/nn/functional/glu.py
index e9126ad35..96580743c 100644
--- a/python/infinicore/nn/functional/glu.py
+++ b/python/infinicore/nn/functional/glu.py
@@ -2,6 +2,7 @@
 from infinicore.lib import _infinicore
 from infinicore.tensor import Tensor
 
+
 def glu(input: Tensor, dim: int = -1) -> Tensor:
     
     if infinicore.use_ntops and input.device.type in ("cuda", "musa"):
diff --git a/python/infinicore/nn/functional/nll_loss.py b/python/infinicore/nn/functional/nll_loss.py
index 4c731472b..97ea6d494 100644
--- a/python/infinicore/nn/functional/nll_loss.py
+++ b/python/infinicore/nn/functional/nll_loss.py
@@ -2,6 +2,7 @@
 from infinicore.lib import _infinicore
 from infinicore.tensor import Tensor
 
+
 def nll_loss(
     input: Tensor,
     target: Tensor,
diff --git a/python/infinicore/ops/gcd.py b/python/infinicore/ops/gcd.py
index defc96a52..aefb44e7d 100644
--- a/python/infinicore/ops/gcd.py
+++ b/python/infinicore/ops/gcd.py
@@ -2,6 +2,7 @@
 from infinicore.lib import _infinicore
 from infinicore.tensor import Tensor
 
+
 def gcd(input: Tensor, other: Tensor, *, out=None) -> Tensor:
     r"""Computes the element-wise greatest common divisor (GCD)."""
     
diff --git a/python/infinicore/ops/gt.py b/python/infinicore/ops/gt.py
index f347df279..a628e1709 100644
--- a/python/infinicore/ops/gt.py
+++ b/python/infinicore/ops/gt.py
@@ -2,6 +2,7 @@
 from infinicore.lib import _infinicore
 from infinicore.tensor import Tensor
 
+
 def gt(input: Tensor, other: Tensor | float, *, out: Tensor | None = None) -> Tensor:
     if infinicore.use_ntops and input.device.type in ("cuda", "musa"):
         return infinicore.ntops.torch.gt(input, other, out=out)
diff --git a/python/infinicore/ops/select_scatter.py b/python/infinicore/ops/select_scatter.py
index 31af019df..4a449eab3 100644
--- a/python/infinicore/ops/select_scatter.py
+++ b/python/infinicore/ops/select_scatter.py
@@ -2,6 +2,7 @@
 from infinicore.lib import _infinicore
 from infinicore.tensor import Tensor
 
+
 def select_scatter(input: Tensor, src: Tensor, dim: int, index: int) -> Tensor:
     if infinicore.use_ntops and input.device.type in ("cuda", "musa"):
         return infinicore.ntops.torch.select_scatter(input, src, dim, index)

From d5e9c757d1a5a525b389966f805bd360741dba06 Mon Sep 17 00:00:00 2001
From: PPPoint <1024879159@qq.com>
Date: Mon, 2 Feb 2026 14:50:50 +0800
Subject: [PATCH 3/3] format

---
 python/infinicore/nn/functional/glu.py      |  3 +--
 python/infinicore/nn/functional/nll_loss.py | 20 ++++++++------------
 python/infinicore/ops/gcd.py                |  6 +++---
 python/infinicore/ops/gt.py                 |  4 ++--
 python/infinicore/ops/select_scatter.py     |  4 +++-
 5 files changed, 17 insertions(+), 20 deletions(-)

diff --git a/python/infinicore/nn/functional/glu.py b/python/infinicore/nn/functional/glu.py
index 96580743c..08f40ee11 100644
--- a/python/infinicore/nn/functional/glu.py
+++ b/python/infinicore/nn/functional/glu.py
@@ -4,8 +4,7 @@
 
 
 def glu(input: Tensor, dim: int = -1) -> Tensor:
-    
     if infinicore.use_ntops and input.device.type in ("cuda", "musa"):
         return infinicore.ntops.torch.glu(input, dim)
 
-    return Tensor(_infinicore.glu(input._underlying, dim))
\ No newline at end of file
+    return Tensor(_infinicore.glu(input._underlying, dim))
diff --git a/python/infinicore/nn/functional/nll_loss.py b/python/infinicore/nn/functional/nll_loss.py
index 97ea6d494..0ba421adf 100644
--- a/python/infinicore/nn/functional/nll_loss.py
+++ b/python/infinicore/nn/functional/nll_loss.py
@@ -12,7 +12,6 @@ def nll_loss(
     *,
     out=None,
 ) -> Tensor:
-    
     if infinicore.use_ntops and input.device.type in ("cuda", "musa"):
         return infinicore.ntops.torch.nll_loss(
             input, target, weight=weight, ignore_index=ignore_index, reduction=reduction
@@ -23,18 +22,15 @@ def nll_loss(
     if out is None:
         return Tensor(
             _infinicore.nll_loss(
-                input._underlying, 
-                target._underlying, 
-                weight_underlying, 
-                ignore_index
+                input._underlying, target._underlying, weight_underlying, ignore_index
             )
         )
-    
+
     _infinicore.nll_loss_(
-        input._underlying, 
-        target._underlying, 
-        weight_underlying, 
-        out._underlying, 
-        ignore_index
+        input._underlying,
+        target._underlying,
+        weight_underlying,
+        out._underlying,
+        ignore_index,
     )
-    return out
\ No newline at end of file
+    return out
diff --git a/python/infinicore/ops/gcd.py b/python/infinicore/ops/gcd.py
index aefb44e7d..dc9b54e3d 100644
--- a/python/infinicore/ops/gcd.py
+++ b/python/infinicore/ops/gcd.py
@@ -5,12 +5,12 @@
 
 def gcd(input: Tensor, other: Tensor, *, out=None) -> Tensor:
     r"""Computes the element-wise greatest common divisor (GCD)."""
-    
+
     if infinicore.use_ntops and input.device.type in ("cuda", "musa"):
         return infinicore.ntops.torch.gcd(input, other, out=out)
 
     if out is None:
         return Tensor(_infinicore.gcd(input._underlying, other._underlying))
-    
+
     _infinicore.gcd_(input._underlying, other._underlying, out._underlying)
-    return out
\ No newline at end of file
+    return out
diff --git a/python/infinicore/ops/gt.py b/python/infinicore/ops/gt.py
index a628e1709..afe93645d 100644
--- a/python/infinicore/ops/gt.py
+++ b/python/infinicore/ops/gt.py
@@ -12,6 +12,6 @@ def gt(input: Tensor, other: Tensor | float, *, out: Tensor | None = None) -> Te
 
     if out is None:
         return Tensor(_infinicore.gt(input._underlying, other._underlying))
-    
+
     _infinicore.gt_(input._underlying, other._underlying, out._underlying)
-    return out
\ No newline at end of file
+    return out
diff --git a/python/infinicore/ops/select_scatter.py b/python/infinicore/ops/select_scatter.py
index 4a449eab3..87b5b420a 100644
--- a/python/infinicore/ops/select_scatter.py
+++ b/python/infinicore/ops/select_scatter.py
@@ -7,4 +7,6 @@ def select_scatter(input: Tensor, src: Tensor, dim: int, index: int) -> Tensor:
     if infinicore.use_ntops and input.device.type in ("cuda", "musa"):
         return infinicore.ntops.torch.select_scatter(input, src, dim, index)
 
-    return Tensor(_infinicore.select_scatter(input._underlying, src._underlying, dim, index))
\ No newline at end of file
+    return Tensor(
+        _infinicore.select_scatter(input._underlying, src._underlying, dim, index)
+    )