
Commit 285c644

docs: Update ML library versions and documentation
This commit updates the ML library versions and documentation:

- Update ARM Compute Library from 24.12 to 52.7.0
- Update Arm NN from 24.11 to 26.01
- Update NNStreamer from 2.4.2 to 2.6.0
- Update ONNX Runtime from 1.20.1 to 1.23.2
- Update TensorFlow Lite from 2.18.0 to 2.20.0
- Refresh all test outputs and benchmark results
- Add ML components to AM62DX documentation TOC
- Update component table with latest library information

Signed-off-by: Pratham Deshmukh <p-deshmukh@ti.com>
1 parent 6448b30 commit 285c644

9 files changed

Lines changed: 102 additions & 94 deletions

File tree

configs/AM62DX/AM62DX_linux_toc.txt

Lines changed: 6 additions & 0 deletions
@@ -62,6 +62,12 @@ linux/Foundational_Components/Kernel/Kernel_Drivers/UART
 linux/Foundational_Components/Kernel/Kernel_Drivers/UBIFS
 linux/Foundational_Components/Kernel/Kernel_Drivers/VTM
 linux/Foundational_Components/Kernel/Kernel_Drivers/Watchdog
+linux/Foundational_Components_Machine_Learning
+linux/Foundational_Components/Machine_Learning/arm_compute_library
+linux/Foundational_Components/Machine_Learning/armnn
+linux/Foundational_Components/Machine_Learning/nnstreamer
+linux/Foundational_Components/Machine_Learning/onnxrt
+linux/Foundational_Components/Machine_Learning/tflite
 
 #linux/Foundational_Components_Power_Management
 
Binary file changed (71.7 KB, -40.3 KB); binary file not shown.

source/linux/Foundational_Components/Machine_Learning/arm_compute_library.rst

Lines changed: 22 additions & 20 deletions
@@ -10,7 +10,7 @@ Exact list of functions can be found at https://www.arm.com/products/development
 Supported versions
 ------------------
 
-- ARM Compute Library 24.12
+- ARM Compute Library 52.7.0
 
 Arm Compute Library Testing
 ---------------------------
@@ -19,10 +19,10 @@ Arm Compute Libraries, tests, and sample executables are included in the SDK fil
 
 .. code-block:: console
 
-   root@am62xx-evm:~# LD_LIBRARY_PATH=/usr/lib/tests/ /usr/lib/tests/arm_compute_validation
-   Version = 32bcced2af7feea6969dd1d22e58d0718dc488e3
-   CommandLine = /usr/lib/tests/arm_compute_validation
-   Seed = 3778037091
+   root@am62xx-evm:~# LD_LIBRARY_PATH=/usr/bin/arm-compute-library-52.7.0/tests/ /usr/bin/arm-compute-library-52.7.0/tests/arm_compute_validation
+   Version = c9a1fff898abd5109b759e8e16616519dc758fdd
+   CommandLine = /usr/bin/arm-compute-library-52.7.0/tests/arm_compute_validation
+   Seed = 165977448
    cpu_has_sve = false
    cpu_has_sve2 = false
    cpu_has_svef32mm = false
@@ -34,22 +34,23 @@ Arm Compute Libraries, tests, and sample executables are included in the SDK fil
    cpu_has_bf16 = false
    cpu_has_dotprod = false
    cpu_has_i8mm = false
+   cpu_has_fhm = false
    CPU0 = A53
    CPU1 = A53
    CPU2 = A53
    CPU3 = A53
    Iterations = 1
    Threads = 1
    Dataset mode = PRECOMMIT
-   Running [0] 'UNIT/CPPScheduler/RethrowException'
-   Wall clock/Wall clock time: AVG=3466.0000 us
+   Running [0] 'UNIT/DataTypeUtils/CheckDataTypeIsPrinted@DataType=QSYMM8'
+   Wall clock/Wall clock time: AVG=3.0000 us
 
 
 .. code-block:: console
 
-   root@am62xx-evm:~# /usr/bin/arm-compute-library-24.12/examples/graph_alexnet
+   root@am62xx-evm:~# /usr/bin/arm-compute-library-52.7.0/examples/graph_alexnet
 
-   /usr/bin/arm-compute-library-24.12/examples/graph_alexnet
+   /usr/bin/arm-compute-library-52.7.0/examples/graph_alexnet
 
    Threads : 1
    Target : Neon
@@ -58,8 +59,8 @@ Arm Compute Libraries, tests, and sample executables are included in the SDK fil
    Tuner enabled? : false
    Cache enabled? : false
    Tuner mode : Normal
-   Tuner file :
-   MLGO file :
+   Tuner file :
+   MLGO file :
    Fast math enabled? : false
 
    Test passed
@@ -69,16 +70,17 @@ Sample NN related executables (using Arm Compute Library only):
 
 .. code-block:: console
 
-   root@am62xx-evm:~# ls /usr/bin/arm-compute-library-24.12/examples/graph_*
-   graph_alexnet graph_inception_v4 graph_resnext50 graph_vgg19
-   graph_deepspeech_v0_4_1 graph_lenet graph_shufflenet graph_vgg_vdsr
-   graph_edsr graph_mobilenet graph_squeezenet graph_yolov3
-   graph_googlenet graph_mobilenet_v2 graph_squeezenet_v1_1
-   graph_inception_resnet_v1 graph_resnet12 graph_srcnn955
-   graph_inception_resnet_v2 graph_resnet50 graph_ssd_mobilenet
-   graph_inception_v3 graph_resnet_v2_50 graph_vgg16
+   root@am62xx-evm:~# ls /usr/bin/arm-compute-library-52.7.0/examples/graph_*
+   graph_alexnet graph_lenet graph_squeezenet
+   graph_deepspeech_v0_4_1 graph_mobilenet graph_squeezenet_v1_1
+   graph_edsr graph_mobilenet_v2 graph_srcnn955
+   graph_googlenet graph_resnet12 graph_ssd_mobilenet
+   graph_inception_resnet_v1 graph_resnet50 graph_vgg16
+   graph_inception_resnet_v2 graph_resnet_v2_50 graph_vgg19
+   graph_inception_v3 graph_resnext50 graph_vgg_vdsr
+   graph_inception_v4 graph_shufflenet graph_yolov3
 
 .. code-block:: console
 
-   root@am62xx-evm:~# ls /usr/bin/arm-compute-library-24.12/examples/neon_*
+   root@am62xx-evm:~# ls /usr/bin/arm-compute-library-52.7.0/examples/neon_*
    neon_cnn neon_copy_objects neon_gemm_qasymm8 neon_gemm_s8_f32 neon_permute neon_scale neon_sgemm

source/linux/Foundational_Components/Machine_Learning/armnn.rst

Lines changed: 1 addition & 1 deletion
@@ -23,4 +23,4 @@ in conjunction with the TIDL TensorFlow Lite Delegate.
 Supported versions
 ------------------
 
-- Arm NN 24.11
+- Arm NN 26.01

source/linux/Foundational_Components/Machine_Learning/nnstreamer.rst

Lines changed: 2 additions & 2 deletions
@@ -12,15 +12,15 @@ https://nnstreamer.ai/
 Supported versions
 ------------------
 
-- NNStreamer 2.4.2
+- NNStreamer 2.6.0
 
 Testing NNStreamer
 ------------------
 
 .. code-block:: console
 
    root@am62xx-evm:~# nnstreamer-check
-   NNStreamer version: 2.4.2
+   NNStreamer version: 2.6.0
    loaded : TRUE
    path : /usr/lib/gstreamer-1.0/libnnstreamer.so
    ...

source/linux/Foundational_Components/Machine_Learning/onnxrt.rst

Lines changed: 27 additions & 27 deletions
@@ -18,7 +18,7 @@ https://onnxruntime.ai/
 Supported version
 -----------------
 
-- ONNX Runtime 1.20.1
+- ONNX Runtime 1.23.2
 
 ONNX Runtime test applications
 ------------------------------
@@ -34,7 +34,7 @@ Running benchmark_model
    usage: perf_test [options...] model_path [result_file]
    Options:
    -m [test_mode]: Specifies the test mode. Value could be 'duration' or 'times'.
-   Provide 'duration' to run the test for a fix duration, and 'times' to repeated for a certain times.
+   Provide 'duration' to run the test for a fix duration, and 'times' to repeated for a certain times.
    -M: Disable memory pattern.
    -A: Disable memory arena
    -I: Generate tensor input binding (Free dimensions are treated as 1.)
@@ -55,19 +55,19 @@ Running benchmark_model
    -o [optimization level]: Default is 99 (all). Valid values are 0 (disable), 1 (basic), 2 (extended), 99 (all).
    Please see onnxruntime_c_api.h (enum GraphOptimizationLevel) for the full list of all optimization levels.
    -u [optimized_model_path]: Specify the optimized model path for saving.
-   -d [CUDA only][cudnn_conv_algorithm]: Specify CUDNN convolution algorithms: 0(benchmark), 1(heuristic), 2(default).
-   -q [CUDA only] use separate stream for copy.
+   -d [CUDA only][cudnn_conv_algorithm]: Specify CUDNN convolution algorithms: 0(benchmark), 1(heuristic), 2(default).
+   -q [CUDA only] use separate stream for copy.
    -z: Set denormal as zero. When turning on this option reduces latency dramatically, a model may have denormals.
-   -C: Specify session configuration entries as key-value pairs: -C "<key1>|<value1> <key2>|<value2>"
-   Refer to onnxruntime_session_options_config_keys.h for valid keys and values.
-   [Example] -C "session.disable_cpu_ep_fallback|1 ep.context_enable|1"
-   -i: Specify EP specific runtime options as key value pairs. Different runtime options available are:
+   -C: Specify session configuration entries as key-value pairs: -C "<key1>|<value1> <key2>|<value2>"
+   Refer to onnxruntime_session_options_config_keys.h for valid keys and values.
+   [Example] -C "session.disable_cpu_ep_fallback|1 ep.context_enable|1"
+   -i: Specify EP specific runtime options as key value pairs. Different runtime options available are:
    [Usage]: -e <provider_name> -i '<key1>|<value1> <key2>|<value2>'
 
-   [ACL only] [enable_fast_math]: Options: 'true', 'false', default: 'false',
+   [ACL only] [enable_fast_math]: Options: 'true', 'false', default: 'false',
 
    -T [Set intra op thread affinities]: Specify intra op thread affinity string
-   [Example]: -T 1,2;3,4;5,6 or -T 1-2;3-4;5-6
+   [Example]: -T 1,2;3,4;5,6 or -T 1-2;3-4;5-6
    Use semicolon to separate configuration between threads.
    E.g. 1,2;3,4;5,6 specifies affinities for three threads, the first thread will be attached to the first and second logical processor.
    The number of affinities must be equal to intra_op_num_threads - 1
@@ -84,22 +84,22 @@ Example of running *onnxruntime_perf_test* on target using the pre-installed mob
 .. code-block:: console
 
    # /usr/bin/onnxruntime-tests/onnxruntime_perf_test -I -m times -r 8 -e acl -P /usr/bin/onnxruntime-tests/testdata/mobilenet_v3_small_excerpt.onnx
-   Session creation time cost: 0.0273071 s
-   First inference time cost: 20 ms
-   Total inference time cost: 0.14188 s
+   Session creation time cost: 0.139671 s
+   First inference time cost: 15 ms
+   Total inference time cost: 0.126396 s
    Total inference requests: 8
-   Average inference time cost: 17.735 ms
-   Total inference run time: 0.141991 s
-   Number of inferences per second: 56.3415
-   Avg CPU usage: 98 %
-   Peak working set size: 35299328 bytes
-   Avg CPU usage:98
-   Peak working set size:35299328
+   Average inference time cost: 15.7995 ms
+   Total inference run time: 0.126518 s
+   Number of inferences per second: 63.232
+   Avg CPU usage: 100 %
+   Peak working set size: 37994496 bytes
+   Avg CPU usage:100
+   Peak working set size:37994496
    Runs:8
-   Min Latency: 0.0159831 s
-   Max Latency: 0.0232702 s
-   P50 Latency: 0.0167086 s
-   P90 Latency: 0.0232702 s
-   P95 Latency: 0.0232702 s
-   P99 Latency: 0.0232702 s
-   P999 Latency: 0.0232702 s
+   Min Latency: 0.00955697 s
+   Max Latency: 0.0239688 s
+   P50 Latency: 0.0156388 s
+   P90 Latency: 0.0239688 s
+   P95 Latency: 0.0239688 s
+   P99 Latency: 0.0239688 s
+   P999 Latency: 0.0239688 s
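The refreshed *onnxruntime_perf_test* summary is internally consistent, which is a useful sanity check when swapping in new benchmark output. A minimal sketch (the constants below are simply the totals copied from the run above) recomputes the derived fields the tool prints:

```python
# Recompute the summary lines of the onnxruntime_perf_test log above.
# The input values are copied from that run; the arithmetic is generic.
runs = 8
total_inference_time_s = 0.126396   # "Total inference time cost"
total_run_time_s = 0.126518         # "Total inference run time"

# Average inference time cost (ms) = total inference time / runs
avg_ms = total_inference_time_s / runs * 1000.0

# Number of inferences per second = runs / total run time
throughput = runs / total_run_time_s

print(f"Average inference time cost: {avg_ms:.4f} ms")
print(f"Number of inferences per second: {throughput:.3f}")
```

Both computed values match the log (15.7995 ms and 63.232 inferences/s), confirming the pasted numbers belong to the same run.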

source/linux/Foundational_Components/Machine_Learning/tflite.rst

Lines changed: 20 additions & 25 deletions
@@ -18,7 +18,7 @@ It supports on-device inference with low latency and a compact binary size. You
 Features
 ********
 
-- TensorFlow Lite v2.18.0 via Yocto - `meta-arago-extras/recipes-framework/tensorflow-lite/tensorflow-lite_2.18.0.bb <https://web.git.yoctoproject.org/meta-arago/tree/meta-arago-extras/recipes-framework/tensorflow-lite/tensorflow-lite_2.18.0.bb?h=11.00.09>`__
+- TensorFlow Lite v2.20.0 via Yocto - `meta-arago-extras/recipes-framework/tensorflow-lite/tensorflow-lite_2.20.0.bb <https://web.git.yoctoproject.org/meta-arago/tree/meta-arago-extras/recipes-framework/tensorflow-lite/tensorflow-lite_2.18.0.bb?h=11.00.09>`__
 - Multithreaded computation with acceleration using Arm Neon SIMD instructions on Cortex-A cores
 - C++ Library and Python interpreter (supported Python version 3)
 - TensorFlow Lite Model benchmark Tool (i.e. :command:`benchmark_model`)
@@ -89,23 +89,21 @@ The output of the benchmarking application should be similar to:
    root@am62xx-evm:~# /opt/tensorflow-lite/tools/benchmark_model --graph=/usr/share/oob-demo-assets/models/ssd_mobilenet_v2_coco.tflite --num_threads=4 --use_xnnpack=false
    INFO: STARTING!
    INFO: Log parameter values verbosely: [0]
-   INFO: Num threads: [4]
    INFO: Graph: [/usr/share/oob-demo-assets/models/ssd_mobilenet_v2_coco.tflite]
    INFO: Signature to run: []
-   INFO: #threads used for CPU inference: [4]
    INFO: Use xnnpack: [0]
    INFO: Loaded model /usr/share/oob-demo-assets/models/ssd_mobilenet_v2_coco.tflite
    INFO: The input model file size (MB): 67.3128
-   INFO: Initialized session in 6.418ms.
+   INFO: Initialized session in 5.579ms.
    INFO: Running benchmark for at least 1 iterations and at least 0.5 seconds but terminate if exceeding 150 seconds.
-   INFO: count=1 curr=1041765
+   INFO: count=1 curr=1357602 p5=1357602 median=1357602 p95=1357602
 
    INFO: Running benchmark for at least 50 iterations and at least 1 seconds but terminate if exceeding 150 seconds.
-   INFO: count=50 first=977738 curr=964908 min=911877 max=1112273 avg=971535 std=39112
+   INFO: count=50 first=1249964 curr=1240143 min=1238588 max=1252566 avg=1.24027e+06 std=2565 p5=1238753 median=1239807 p95=1247415
 
-   INFO: Inference timings in us: Init: 6418, First inference: 1041765, Warmup (avg): 1.04176e+06, Inference (avg): 971535
+   INFO: Inference timings in us: Init: 5579, First inference: 1357602, Warmup (avg): 1.3576e+06, Inference (avg): 1.24027e+06
    INFO: Note: as the benchmark tool itself affects memory footprint, the following is only APPROXIMATE to the actual memory footprint of the model at runtime. Take the information at your discretion.
-   INFO: Memory footprint delta from the start of the tool (MB): init=6.14844 overall=109.848
+   INFO: Memory footprint delta from the start of the tool (MB): init=6.36328 overall=109.832
 
 Where,
 
@@ -130,26 +128,23 @@ The output of the benchmarking application should be similar to,
    root@am62xx-evm:~# /opt/tensorflow-lite/tools/benchmark_model --graph=/usr/share/oob-demo-assets/models/ssd_mobilenet_v2_coco.tflite --num_threads=4 --use_xnnpack=true
    INFO: STARTING!
    INFO: Log parameter values verbosely: [0]
-   INFO: Num threads: [4]
    INFO: Graph: [/usr/share/oob-demo-assets/models/ssd_mobilenet_v2_coco.tflite]
    INFO: Signature to run: []
-   INFO: #threads used for CPU inference: [4]
    INFO: Use xnnpack: [1]
    INFO: Loaded model /usr/share/oob-demo-assets/models/ssd_mobilenet_v2_coco.tflite
    INFO: Created TensorFlow Lite XNNPACK delegate for CPU.
    INFO: XNNPACK delegate created.
    INFO: Explicitly applied XNNPACK delegate, and the model graph will be partially executed by the delegate w/ 1 delegate kernels.
    INFO: The input model file size (MB): 67.3128
-   INFO: Initialized session in 592.232ms.
+   INFO: Initialized session in 614.333ms.
    INFO: Running benchmark for at least 1 iterations and at least 0.5 seconds but terminate if exceeding 150 seconds.
-   INFO: count=1 curr=633430
-
+   INFO: count=1 curr=905463 p5=905463 median=905463 p95=905463
    INFO: Running benchmark for at least 50 iterations and at least 1 seconds but terminate if exceeding 150 seconds.
-   INFO: count=50 first=605745 curr=618849 min=568228 max=722188 avg=602943 std=27690
-
-   INFO: Inference timings in us: Init: 592232, First inference: 633430, Warmup (avg): 633430, Inference (avg): 602943
+   INFO: count=50 first=900416 curr=898333 min=898007 max=906121 avg=899641 std=1549 p5=898333 median=899281 p95=904305
+   INFO: Inference timings in us: Init: 614333, First inference: 905463, Warmup (avg): 905463, Inference (avg): 899641
    INFO: Note: as the benchmark tool itself affects memory footprint, the following is only APPROXIMATE to the actual memory footprint of the model at runtime. Take the information at your discretion.
-   INFO: Memory footprint delta from the start of the tool (MB): init=133.086 overall=149.531
+   INFO: Memory footprint delta from the start of the tool (MB): init=146.363 overall=150.141
+
 
 Where,
 
@@ -166,14 +161,14 @@ The following performance numbers are captured with :command:`benchmark_model` o
    :header: "SOC", "Delegates", "Inference Time (sec)", "Initialization Time (ms)", "Overall Memory Footprint (MB)"
    :widths: 10, 10, 20, 20, 20
 
-   "AM62X", "CPU only", "0.977168", "6.129", "110.07"
-   "", "XNNPACK", "0.613474", "593.558", "149.699"
-   "AM62PX", "CPU only", "0.419261", "4.79", "108.707"
-   "", "XNNPACK", "0.274756", "1208.04", "149.395"
-   "AM64X", "CPU only", "1.10675", "144.535", "109.562"
-   "", "XNNPACK", "0.702809", "601.33", "149.602"
-   "AM62L", "CPU only", "1.04867", "6.088", "110.129"
-   "", "XNNPACK", "0.661133", "466.216", "149.703"
+   "AM62X", "CPU only", "1.24027", "5.579", "109.832"
+   "", "XNNPACK", "0.899641", "614.333", "150.141"
+   "AM62PX", "CPU only", "1.23341", "252.390", "111.121"
+   "", "XNNPACK", "0.875280", "597.639", "150.52"
+   "AM64X", "CPU only", "1.26429", "135.579", "110.188"
+   "", "XNNPACK", "0.740743", "885.636", "150.484"
+   "AM62L", "CPU only", "1.3708", "807.076", "111.152"
+   "", "XNNPACK", "0.930577", "769.145", "150.496"
 
 Based on the above data, using the XNNPACK delegate significantly improves inference times across all SoCs, though it generally increases initialization time and overall memory footprint.
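As a quick check of that conclusion, the per-SoC speedup from enabling XNNPACK can be derived directly from the inference-time column of the updated table (a small sketch; the values below are copied from the refreshed rows):

```python
# XNNPACK speedup per SoC, from the refreshed benchmark table.
# Pairs are (CPU-only, XNNPACK) average inference times in seconds.
times = {
    "AM62X":  (1.24027, 0.899641),
    "AM62PX": (1.23341, 0.875280),
    "AM64X":  (1.26429, 0.740743),
    "AM62L":  (1.3708,  0.930577),
}

for soc, (cpu_s, xnn_s) in times.items():
    speedup = cpu_s / xnn_s
    print(f"{soc}: {speedup:.2f}x faster with XNNPACK")
```

On this data the speedup ranges from roughly 1.4x (AM62X, AM62PX, AM62L) to about 1.7x (AM64X), consistent with the statement above.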