
Commit 985f73e

feat(linux): Update ML library versions and documentation

This commit updates the ML library versions and documentation:

- Update ARM Compute Library from 24.12 to 52.7.0
- Update Arm NN from 24.11 to 26.01
- Update NNStreamer from 2.4.2 to 2.6.0
- Update ONNX Runtime from 1.20.1 to 1.23.2
- Update TensorFlow Lite from 2.18.0 to 2.20.0
- Refresh all test outputs and benchmark results
- Add ML components to AM62DX documentation TOC
- Update component table with latest library information

Signed-off-by: Pratham Deshmukh <p-deshmukh@ti.com>

1 parent 6448b30, commit 985f73e

File tree

8 files changed

+151
-100
lines changed


configs/AM62DX/AM62DX_linux_toc.txt

Lines changed: 6 additions & 0 deletions

@@ -62,6 +62,12 @@ linux/Foundational_Components/Kernel/Kernel_Drivers/UART
 linux/Foundational_Components/Kernel/Kernel_Drivers/UBIFS
 linux/Foundational_Components/Kernel/Kernel_Drivers/VTM
 linux/Foundational_Components/Kernel/Kernel_Drivers/Watchdog
+linux/Foundational_Components_Machine_Learning
+linux/Foundational_Components/Machine_Learning/arm_compute_library
+linux/Foundational_Components/Machine_Learning/armnn
+linux/Foundational_Components/Machine_Learning/nnstreamer
+linux/Foundational_Components/Machine_Learning/onnxrt
+linux/Foundational_Components/Machine_Learning/tflite
 
 #linux/Foundational_Components_Power_Management
(binary file, 91.6 KB; image preview not rendered)

source/linux/Foundational_Components/Machine_Learning/arm_compute_library.rst

Lines changed: 22 additions & 20 deletions

@@ -10,7 +10,7 @@ Exact list of functions can be found at https://www.arm.com/products/development
 Supported versions
 ------------------
 
-- ARM Compute Library 24.12
+- ARM Compute Library 52.7.0
 
 Arm Compute Library Testing
 ---------------------------
@@ -19,10 +19,10 @@ Arm Compute Libraries, tests, and sample executables are included in the SDK fil
 
 .. code-block:: console
 
-root@am62xx-evm:~# LD_LIBRARY_PATH=/usr/lib/tests/ /usr/lib/tests/arm_compute_validation
-Version = 32bcced2af7feea6969dd1d22e58d0718dc488e3
-CommandLine = /usr/lib/tests/arm_compute_validation
-Seed = 3778037091
+root@am62xx-evm:~# LD_LIBRARY_PATH=/usr/bin/arm-compute-library-52.7.0/tests/ /usr/bin/arm-compute-library-52.7.0/tests/arm_compute_validation
+Version = c9a1fff898abd5109b759e8e16616519dc758fdd
+CommandLine = /usr/bin/arm-compute-library-52.7.0/tests/arm_compute_validation
+Seed = 165977448
 cpu_has_sve = false
 cpu_has_sve2 = false
 cpu_has_svef32mm = false
@@ -34,22 +34,23 @@ Arm Compute Libraries, tests, and sample executables are included in the SDK fil
 cpu_has_bf16 = false
 cpu_has_dotprod = false
 cpu_has_i8mm = false
+cpu_has_fhm = false
 CPU0 = A53
 CPU1 = A53
 CPU2 = A53
 CPU3 = A53
 Iterations = 1
 Threads = 1
 Dataset mode = PRECOMMIT
-Running [0] 'UNIT/CPPScheduler/RethrowException'
-Wall clock/Wall clock time: AVG=3466.0000 us
+Running [0] 'UNIT/DataTypeUtils/CheckDataTypeIsPrinted@DataType=QSYMM8'
+Wall clock/Wall clock time: AVG=3.0000 us
 
 
 .. code-block:: console
 
-root@am62xx-evm:~# /usr/bin/arm-compute-library-24.12/examples/graph_alexnet
+root@am62xx-evm:~# /usr/bin/arm-compute-library-52.7.0/examples/graph_alexnet
 
-/usr/bin/arm-compute-library-24.12/examples/graph_alexnet
+/usr/bin/arm-compute-library-52.7.0/examples/graph_alexnet
 
 Threads : 1
 Target : Neon
@@ -58,8 +59,8 @@ Arm Compute Libraries, tests, and sample executables are included in the SDK fil
 Tuner enabled? : false
 Cache enabled? : false
 Tuner mode : Normal
-Tuner file :
-MLGO file :
+Tuner file :
+MLGO file :
 Fast math enabled? : false
 
 Test passed
@@ -69,16 +70,17 @@ Sample NN related executables (using Arm Compute Library only):
 
 .. code-block:: console
 
-root@am62xx-evm:~# ls /usr/bin/arm-compute-library-24.12/examples/graph_*
-graph_alexnet graph_inception_v4 graph_resnext50 graph_vgg19
-graph_deepspeech_v0_4_1 graph_lenet graph_shufflenet graph_vgg_vdsr
-graph_edsr graph_mobilenet graph_squeezenet graph_yolov3
-graph_googlenet graph_mobilenet_v2 graph_squeezenet_v1_1
-graph_inception_resnet_v1 graph_resnet12 graph_srcnn955
-graph_inception_resnet_v2 graph_resnet50 graph_ssd_mobilenet
-graph_inception_v3 graph_resnet_v2_50 graph_vgg16
+root@am62xx-evm:~# ls /usr/bin/arm-compute-library-52.7.0/examples/graph_*
+graph_alexnet graph_lenet graph_squeezenet
+graph_deepspeech_v0_4_1 graph_mobilenet graph_squeezenet_v1_1
+graph_edsr graph_mobilenet_v2 graph_srcnn955
+graph_googlenet graph_resnet12 graph_ssd_mobilenet
+graph_inception_resnet_v1 graph_resnet50 graph_vgg16
+graph_inception_resnet_v2 graph_resnet_v2_50 graph_vgg19
+graph_inception_v3 graph_resnext50 graph_vgg_vdsr
+graph_inception_v4 graph_shufflenet graph_yolov3
 
 .. code-block:: console
 
-root@am62xx-evm:~# ls /usr/bin/arm-compute-library-24.12/examples/neon_*
+root@am62xx-evm:~# ls /usr/bin/arm-compute-library-52.7.0/examples/neon_*
 neon_cnn neon_copy_objects neon_gemm_qasymm8 neon_gemm_s8_f32 neon_permute neon_scale neon_sgemm
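For readers scripting the validation run shown in this diff, here is a small hedged sketch that constructs the `LD_LIBRARY_PATH` environment and command line for the versioned install layout. The helper name `validation_command` is hypothetical; only the paths come from the console capture above.

```python
import os

# Versioned install layout taken from the console output in this diff;
# treat the version string as an assumption for other SDK releases.
ACL_VERSION = "52.7.0"

def validation_command(version: str = ACL_VERSION):
    """Build (env, argv) for running the Arm Compute validation suite
    from its versioned install directory, mirroring the command above."""
    tests_dir = f"/usr/bin/arm-compute-library-{version}/tests"
    env = dict(os.environ, LD_LIBRARY_PATH=tests_dir)
    argv = [f"{tests_dir}/arm_compute_validation"]
    return env, argv

env, argv = validation_command()
# On target, this pair could be handed to subprocess.run(argv, env=env).
print(argv[0])
```

On a host machine the paths will not exist, so this only constructs the invocation; run it on the EVM to actually execute the suite.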

source/linux/Foundational_Components/Machine_Learning/armnn.rst

Lines changed: 1 addition & 1 deletion

@@ -23,4 +23,4 @@ in conjunction with the TIDL TensorFlow Lite Delegate.
 Supported versions
 ------------------
 
-- Arm NN 24.11
+- Arm NN 26.01

source/linux/Foundational_Components/Machine_Learning/nnstreamer.rst

Lines changed: 2 additions & 2 deletions

@@ -12,15 +12,15 @@ https://nnstreamer.ai/
 Supported versions
 ------------------
 
-- NNStreamer 2.4.2
+- NNStreamer 2.6.0
 
 Testing NNStreamer
 ------------------
 
 .. code-block:: console
 
 root@am62xx-evm:~# nnstreamer-check
-NNStreamer version: 2.4.2
+NNStreamer version: 2.6.0
 loaded : TRUE
 path : /usr/lib/gstreamer-1.0/libnnstreamer.so
 ...
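For automated sanity checks of an image, the `nnstreamer-check` output can be parsed with a short script. This is a sketch; the function name and field choices are mine, and the sample text is the console output from this diff.

```python
import re

def parse_nnstreamer_check(output: str) -> dict:
    """Parse key fields from `nnstreamer-check` console output."""
    info = {}
    m = re.search(r"NNStreamer version:\s*([\d.]+)", output)
    if m:
        info["version"] = m.group(1)
    m = re.search(r"loaded\s*:\s*(TRUE|FALSE)", output)
    if m:
        info["loaded"] = m.group(1) == "TRUE"
    m = re.search(r"path\s*:\s*(\S+)", output)
    if m:
        info["path"] = m.group(1)
    return info

# Sample copied from the console capture in this diff.
sample = """\
NNStreamer version: 2.6.0
loaded : TRUE
path : /usr/lib/gstreamer-1.0/libnnstreamer.so
"""
print(parse_nnstreamer_check(sample))
```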

source/linux/Foundational_Components/Machine_Learning/onnxrt.rst

Lines changed: 27 additions & 27 deletions

@@ -18,7 +18,7 @@ https://onnxruntime.ai/
 Supported version
 -----------------
 
-- ONNX Runtime 1.20.1
+- ONNX Runtime 1.23.2
 
 ONNX Runtime test applications
 ------------------------------
@@ -34,7 +34,7 @@ Running benchmark_model
 usage: perf_test [options...] model_path [result_file]
 Options:
 -m [test_mode]: Specifies the test mode. Value could be 'duration' or 'times'.
-Provide 'duration' to run the test for a fix duration, and 'times' to repeated for a certain times.
+Provide 'duration' to run the test for a fix duration, and 'times' to repeated for a certain times.
 -M: Disable memory pattern.
 -A: Disable memory arena
 -I: Generate tensor input binding (Free dimensions are treated as 1.)
@@ -55,19 +55,19 @@ Running benchmark_model
 -o [optimization level]: Default is 99 (all). Valid values are 0 (disable), 1 (basic), 2 (extended), 99 (all).
 Please see onnxruntime_c_api.h (enum GraphOptimizationLevel) for the full list of all optimization levels.
 -u [optimized_model_path]: Specify the optimized model path for saving.
--d [CUDA only][cudnn_conv_algorithm]: Specify CUDNN convolution algorithms: 0(benchmark), 1(heuristic), 2(default).
--q [CUDA only] use separate stream for copy.
+-d [CUDA only][cudnn_conv_algorithm]: Specify CUDNN convolution algorithms: 0(benchmark), 1(heuristic), 2(default).
+-q [CUDA only] use separate stream for copy.
 -z: Set denormal as zero. When turning on this option reduces latency dramatically, a model may have denormals.
--C: Specify session configuration entries as key-value pairs: -C "<key1>|<value1> <key2>|<value2>"
-Refer to onnxruntime_session_options_config_keys.h for valid keys and values.
-[Example] -C "session.disable_cpu_ep_fallback|1 ep.context_enable|1"
--i: Specify EP specific runtime options as key value pairs. Different runtime options available are:
+-C: Specify session configuration entries as key-value pairs: -C "<key1>|<value1> <key2>|<value2>"
+Refer to onnxruntime_session_options_config_keys.h for valid keys and values.
+[Example] -C "session.disable_cpu_ep_fallback|1 ep.context_enable|1"
+-i: Specify EP specific runtime options as key value pairs. Different runtime options available are:
 [Usage]: -e <provider_name> -i '<key1>|<value1> <key2>|<value2>'
 
-[ACL only] [enable_fast_math]: Options: 'true', 'false', default: 'false',
+[ACL only] [enable_fast_math]: Options: 'true', 'false', default: 'false',
 
 -T [Set intra op thread affinities]: Specify intra op thread affinity string
-[Example]: -T 1,2;3,4;5,6 or -T 1-2;3-4;5-6
+[Example]: -T 1,2;3,4;5,6 or -T 1-2;3-4;5-6
 Use semicolon to separate configuration between threads.
 E.g. 1,2;3,4;5,6 specifies affinities for three threads, the first thread will be attached to the first and second logical processor.
 The number of affinities must be equal to intra_op_num_threads - 1
@@ -84,22 +84,22 @@ Example of running *onnxruntime_perf_test* on target using the pre-installed mob
 .. code-block:: console
 
 # /usr/bin/onnxruntime-tests/onnxruntime_perf_test -I -m times -r 8 -e acl -P /usr/bin/onnxruntime-tests/testdata/mobilenet_v3_small_excerpt.onnx
-Session creation time cost: 0.0273071 s
-First inference time cost: 20 ms
-Total inference time cost: 0.14188 s
+Session creation time cost: 0.139671 s
+First inference time cost: 15 ms
+Total inference time cost: 0.126396 s
 Total inference requests: 8
-Average inference time cost: 17.735 ms
-Total inference run time: 0.141991 s
-Number of inferences per second: 56.3415
-Avg CPU usage: 98 %
-Peak working set size: 35299328 bytes
-Avg CPU usage:98
-Peak working set size:35299328
+Average inference time cost: 15.7995 ms
+Total inference run time: 0.126518 s
+Number of inferences per second: 63.232
+Avg CPU usage: 100 %
+Peak working set size: 37994496 bytes
+Avg CPU usage:100
+Peak working set size:37994496
 Runs:8
-Min Latency: 0.0159831 s
-Max Latency: 0.0232702 s
-P50 Latency: 0.0167086 s
-P90 Latency: 0.0232702 s
-P95 Latency: 0.0232702 s
-P99 Latency: 0.0232702 s
-P999 Latency: 0.0232702 s
+Min Latency: 0.00955697 s
+Max Latency: 0.0239688 s
+P50 Latency: 0.0156388 s
+P90 Latency: 0.0239688 s
+P95 Latency: 0.0239688 s
+P99 Latency: 0.0239688 s
+P999 Latency: 0.0239688 s
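To compare runs such as the before/after numbers in this diff, the statistics can be pulled out of the `onnxruntime_perf_test` output with a short script. This is a sketch under the assumption that the format matches the console capture above; `parse_perf_test` is a hypothetical helper, not part of ONNX Runtime.

```python
import re

def parse_perf_test(output: str) -> dict:
    """Extract latency statistics from onnxruntime_perf_test output."""
    stats = {}
    m = re.search(r"Average inference time cost:\s*([\d.]+)\s*ms", output)
    if m:
        stats["avg_ms"] = float(m.group(1))
    m = re.search(r"Number of inferences per second:\s*([\d.]+)", output)
    if m:
        stats["ips"] = float(m.group(1))
    for pct in ("P50", "P90", "P95", "P99"):
        m = re.search(rf"{pct} Latency:\s*([\d.]+)\s*s", output)
        if m:
            stats[pct.lower()] = float(m.group(1))
    return stats

# Sample lines copied from the updated console capture in this diff.
sample = """\
Average inference time cost: 15.7995 ms
Number of inferences per second: 63.232
P50 Latency: 0.0156388 s
P90 Latency: 0.0239688 s
P95 Latency: 0.0239688 s
P99 Latency: 0.0239688 s
"""
print(parse_perf_test(sample))
```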

source/linux/Foundational_Components/Machine_Learning/tflite.rst

Lines changed: 66 additions & 28 deletions

@@ -18,7 +18,7 @@ It supports on-device inference with low latency and a compact binary size. You
 Features
 ********
 
-- TensorFlow Lite v2.18.0 via Yocto - `meta-arago-extras/recipes-framework/tensorflow-lite/tensorflow-lite_2.18.0.bb <https://web.git.yoctoproject.org/meta-arago/tree/meta-arago-extras/recipes-framework/tensorflow-lite/tensorflow-lite_2.18.0.bb?h=11.00.09>`__
+- TensorFlow Lite v2.20.0 via Yocto - `meta-arago-extras/recipes-framework/tensorflow-lite/tensorflow-lite_2.20.0.bb <https://web.git.yoctoproject.org/meta-arago/tree/meta-arago-extras/recipes-framework/tensorflow-lite/tensorflow-lite_2.18.0.bb?h=11.00.09>`__
 - Multithreaded computation with acceleration using Arm Neon SIMD instructions on Cortex-A cores
 - C++ Library and Python interpreter (supported Python version 3)
 - TensorFlow Lite Model benchmark Tool (i.e. :command:`benchmark_model`)
@@ -89,23 +89,21 @@ The output of the benchmarking application should be similar to:
 root@am62xx-evm:~# /opt/tensorflow-lite/tools/benchmark_model --graph=/usr/share/oob-demo-assets/models/ssd_mobilenet_v2_coco.tflite --num_threads=4 --use_xnnpack=false
 INFO: STARTING!
 INFO: Log parameter values verbosely: [0]
-INFO: Num threads: [4]
 INFO: Graph: [/usr/share/oob-demo-assets/models/ssd_mobilenet_v2_coco.tflite]
 INFO: Signature to run: []
-INFO: #threads used for CPU inference: [4]
 INFO: Use xnnpack: [0]
 INFO: Loaded model /usr/share/oob-demo-assets/models/ssd_mobilenet_v2_coco.tflite
 INFO: The input model file size (MB): 67.3128
-INFO: Initialized session in 6.418ms.
+INFO: Initialized session in 5.579ms.
 INFO: Running benchmark for at least 1 iterations and at least 0.5 seconds but terminate if exceeding 150 seconds.
-INFO: count=1 curr=1041765
+INFO: count=1 curr=1357602 p5=1357602 median=1357602 p95=1357602
 
 INFO: Running benchmark for at least 50 iterations and at least 1 seconds but terminate if exceeding 150 seconds.
-INFO: count=50 first=977738 curr=964908 min=911877 max=1112273 avg=971535 std=39112
+INFO: count=50 first=1249964 curr=1240143 min=1238588 max=1252566 avg=1.24027e+06 std=2565 p5=1238753 median=1239807 p95=1247415
 
-INFO: Inference timings in us: Init: 6418, First inference: 1041765, Warmup (avg): 1.04176e+06, Inference (avg): 971535
+INFO: Inference timings in us: Init: 5579, First inference: 1357602, Warmup (avg): 1.3576e+06, Inference (avg): 1.24027e+06
 INFO: Note: as the benchmark tool itself affects memory footprint, the following is only APPROXIMATE to the actual memory footprint of the model at runtime. Take the information at your discretion.
-INFO: Memory footprint delta from the start of the tool (MB): init=6.14844 overall=109.848
+INFO: Memory footprint delta from the start of the tool (MB): init=6.36328 overall=109.832
 
 Where,

@@ -130,26 +128,23 @@ The output of the benchmarking application should be similar to,
 root@am62xx-evm:~# /opt/tensorflow-lite/tools/benchmark_model --graph=/usr/share/oob-demo-assets/models/ssd_mobilenet_v2_coco.tflite --num_threads=4 --use_xnnpack=true
 INFO: STARTING!
 INFO: Log parameter values verbosely: [0]
-INFO: Num threads: [4]
 INFO: Graph: [/usr/share/oob-demo-assets/models/ssd_mobilenet_v2_coco.tflite]
 INFO: Signature to run: []
-INFO: #threads used for CPU inference: [4]
 INFO: Use xnnpack: [1]
 INFO: Loaded model /usr/share/oob-demo-assets/models/ssd_mobilenet_v2_coco.tflite
 INFO: Created TensorFlow Lite XNNPACK delegate for CPU.
 INFO: XNNPACK delegate created.
 INFO: Explicitly applied XNNPACK delegate, and the model graph will be partially executed by the delegate w/ 1 delegate kernels.
 INFO: The input model file size (MB): 67.3128
-INFO: Initialized session in 592.232ms.
+INFO: Initialized session in 614.333ms.
 INFO: Running benchmark for at least 1 iterations and at least 0.5 seconds but terminate if exceeding 150 seconds.
-INFO: count=1 curr=633430
-
+INFO: count=1 curr=905463 p5=905463 median=905463 p95=905463
 INFO: Running benchmark for at least 50 iterations and at least 1 seconds but terminate if exceeding 150 seconds.
-INFO: count=50 first=605745 curr=618849 min=568228 max=722188 avg=602943 std=27690
-
-INFO: Inference timings in us: Init: 592232, First inference: 633430, Warmup (avg): 633430, Inference (avg): 602943
+INFO: count=50 first=900416 curr=898333 min=898007 max=906121 avg=899641 std=1549 p5=898333 median=899281 p95=904305
+INFO: Inference timings in us: Init: 614333, First inference: 905463, Warmup (avg): 905463, Inference (avg): 899641
 INFO: Note: as the benchmark tool itself affects memory footprint, the following is only APPROXIMATE to the actual memory footprint of the model at runtime. Take the information at your discretion.
-INFO: Memory footprint delta from the start of the tool (MB): init=133.086 overall=149.531
+INFO: Memory footprint delta from the start of the tool (MB): init=146.363 overall=150.141
+
 
 Where,
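The `count=50` summary lines above report times in microseconds. As a hedged convenience (the helper name is mine, and the regex assumes the key=value format shown in these captures), the line can be parsed and converted to seconds:

```python
import re

def parse_benchmark_line(line: str) -> dict:
    """Parse a benchmark_model summary line; values are microseconds."""
    return {k: float(v) for k, v in re.findall(r"(\w+)=([\d.eE+]+)", line)}

# Sample taken from the XNNPACK run in this diff.
line = ("count=50 first=900416 curr=898333 min=898007 max=906121 "
        "avg=899641 std=1549 p5=898333 median=899281 p95=904305")
stats = parse_benchmark_line(line)
print(stats["avg"] / 1e6)  # average inference time in seconds
```

The same parser handles the scientific-notation values (e.g. `avg=1.24027e+06`) in the CPU-only run.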

@@ -166,14 +161,14 @@ The following performance numbers are captured with :command:`benchmark_model` o
 :header: "SOC", "Delegates", "Inference Time (sec)", "Initialization Time (ms)", "Overall Memory Footprint (MB)"
 :widths: 10, 10, 20, 20, 20
 
-"AM62X", "CPU only", "0.977168", "6.129", "110.07"
-"", "XNNPACK", "0.613474", "593.558", "149.699"
-"AM62PX", "CPU only", "0.419261", "4.79", "108.707"
-"", "XNNPACK", "0.274756", "1208.04", "149.395"
-"AM64X", "CPU only", "1.10675", "144.535", "109.562"
-"", "XNNPACK", "0.702809", "601.33", "149.602"
-"AM62L", "CPU only", "1.04867", "6.088", "110.129"
-"", "XNNPACK", "0.661133", "466.216", "149.703"
+"AM62X", "CPU only", "1.24027", "5.579", "109.832"
+"", "XNNPACK", "0.899641", "614.333", "150.141"
+"AM62PX", "CPU only", "1.23341", "252.390", "111.121"
+"", "XNNPACK", "0.875280", "597.639", "150.52"
+"AM64X", "CPU only", "1.26429", "135.579", "110.188"
+"", "XNNPACK", "0.740743", "885.636", "150.484"
+"AM62L", "CPU only", "1.3708", "807.076", "111.152"
+"", "XNNPACK", "0.930577", "769.145", "150.496"
 
 Based on the above data, using the XNNPACK delegate significantly improves inference times across all SoCs, though it generally increases initialization time and overall memory footprint.
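The claim that XNNPACK improves inference times on every SoC can be checked directly from the refreshed table. This snippet recomputes the speedups; the values are copied from the table in this diff, while the dictionary layout is mine.

```python
# Inference times in seconds from the updated benchmark table above.
inference_s = {
    "AM62X":  {"cpu": 1.24027, "xnnpack": 0.899641},
    "AM62PX": {"cpu": 1.23341, "xnnpack": 0.875280},
    "AM64X":  {"cpu": 1.26429, "xnnpack": 0.740743},
    "AM62L":  {"cpu": 1.3708,  "xnnpack": 0.930577},
}

# Speedup factor of the XNNPACK delegate over CPU-only inference.
speedup = {soc: t["cpu"] / t["xnnpack"] for soc, t in inference_s.items()}
for soc, s in sorted(speedup.items(), key=lambda kv: -kv[1]):
    print(f"{soc}: {s:.2f}x")
```

With these numbers, every SoC sees a speedup above 1.3x, with AM64X benefiting the most.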

@@ -185,10 +180,12 @@ Based on the above data, using the XNNPACK delegate significantly improves infer
 Example Applications
 ********************
 
-|__SDK_FULL_NAME__| has integrated opensource components like NNStreamer which can be used for neural network inferencing using the sample tflite models under :file:`/usr/share/oob-demo-assets/models/`
-Checkout the Object Detection usecase under :ref:`TI Apps Launcher - User Guide <TI-Apps-Launcher-User-Guide-label>`
+.. ifconfig:: CONFIG_part_variant in ('AM62X', 'AM62LX', 'AM62PX')
 
-Alternatively, if a display is connected, you can run the Object Detection pipeline using this command,
+|__SDK_FULL_NAME__| has integrated opensource components like NNStreamer which can be used for neural network inferencing using the sample tflite models under :file:`/usr/share/oob-demo-assets/models/`
+Checkout the Object Detection usecase under :ref:`TI Apps Launcher - User Guide <TI-Apps-Launcher-User-Guide-label>`
+
+Alternatively, if a display is connected, you can run the Object Detection pipeline using this command,
 
 .. ifconfig:: CONFIG_part_variant in ('AM62X', 'AM62LX')

@@ -248,6 +245,47 @@ Alternatively, if a display is connected, you can run the Object Detection pipel
 The above GStreamer pipeline reads an H.264 video file, decodes it, and processes it for object detection using a TensorFlow Lite model, displaying bounding boxes around detected objects. The processed video is then composited and rendered on the screen using the ``kmssink`` element.
 
+.. ifconfig:: CONFIG_part_variant in ('AM62DX')
+
+|__SDK_FULL_NAME__| has integrated opensource components like NNStreamer which can be used for neural network inferencing using the sample TensorFlow Lite models under :file:`/usr/share/oob-demo-assets/models/`
+
+If an audio input device is connected, you can run the Audio Classification pipeline using this command:
+
+.. code-block:: console
+
+gst-launch-1.0 \
+alsasrc ! \
+audioconvert ! \
+audioresample ! \
+audio/x-raw,format=S16LE,channels=1,rate=16000,layout=interleaved ! \
+tensor_converter frames-per-tensor=3900 ! \
+tensor_aggregator \
+frames-in=3900 \
+frames-out=15600 \
+frames-flush=3900 \
+frames-dim=1 ! \
+tensor_transform \
+mode=arithmetic \
+option=typecast:float32,add:0.5,div:32767.5 ! \
+tensor_transform \
+mode=transpose \
+option=1:0:2:3 ! \
+queue \
+leaky=2 \
+max-size-buffers=10 ! \
+tensor_filter \
+framework=tensorflow2-lite \
+model=/usr/share/oob-demo-assets/models/yamnet_audio_classification.tflite \
+custom=Delegate:XNNPACK,NumThreads:2 ! \
+tensor_decoder \
+mode=image_labeling \
+option1=/usr/share/oob-demo-assets/labels/yamnet_label_list.txt ! \
+filesink \
+buffer-mode=2 \
+location=/dev/stdout
+
+The above GStreamer pipeline captures real-time audio from an ALSA source, converts it to the required format, and processes it for audio event classification using the YAMNet TensorFlow Lite model. The audio data is aggregated into tensors, normalized for machine learning input, and classified to identify various audio events and sounds. The classification results are decoded to human-readable labels and output to stdout.
+
 .. attention::
 
 The Example Applications section is not applicable for AM64x
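The `tensor_aggregator` numbers in the new AM62DX audio pipeline encode a sliding-window framing for YAMNet. As a sketch, the constants below are copied from the pipeline, while the derived quantities are my arithmetic rather than part of the diff:

```python
RATE = 16000          # Hz, from the caps filter in the pipeline above
FRAMES_IN = 3900      # tensor_converter frames-per-tensor / frames-in
FRAMES_OUT = 15600    # tensor_aggregator frames-out
FRAMES_FLUSH = 3900   # tensor_aggregator frames-flush

window_s = FRAMES_OUT / RATE              # analysis window handed to the model
hop_s = FRAMES_FLUSH / RATE               # stride between successive windows
overlap = 1 - FRAMES_FLUSH / FRAMES_OUT   # fraction of each window reused

print(f"window={window_s}s hop={hop_s}s overlap={overlap:.0%}")
```

So each inference sees 0.975 s of audio, a new classification is produced every 0.24375 s, and consecutive windows overlap by 75%.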
