Skip to content

Commit 68a8e2d

Browse files
nvidia-gpu: add support for PCIe port telemetry
Add xyz.openbmc_project.Inventory.Connector.Port Interface for each PCIe port of a ConnectX device. PDI patches to extend the xyz.openbmc_project.Inventory.Connector.Port Interface - https://gerrit.openbmc.org/c/openbmc/phosphor-dbus-interfaces/+/84653 https://gerrit.openbmc.org/c/openbmc/phosphor-dbus-interfaces/+/84652 Tested: Build an image for nvl32-obmc machine with the following patch cherry picked. https://gerrit.openbmc.org/c/openbmc/openbmc/+/85490 The patch cherry-picks the following patches that are currently under review. ``` 1. device tree https://lore.kernel.org/all/aRbLqH8pLWCQryhu@molberding.nvidia.com/ 2. mctpd patches CodeConstruct/mctp#85 3. u-boot changes https://lore.kernel.org/openbmc/20251121-msx4-v1-0-fc0118b666c1@nvidia.com/T/#t 4. kernel changes as specified in the openbmc patch (for espi) 5. entity-manager changes https://gerrit.openbmc.org/c/openbmc/entity-manager/+/85455 6. platform-init changes https://gerrit.openbmc.org/c/openbmc/platform-init/+/85456 7. spi changes https://lore.kernel.org/all/20251121-w25q01jv_fixup-v1-1-3d175050db73@nvidia.com/ ``` ``` root@nvl32-obmc:~# busctl tree xyz.openbmc_project.GpuSensor `- /xyz `- /xyz/openbmc_project |- /xyz/openbmc_project/inventory | `- /xyz/openbmc_project/inventory/pcie_devices | |- /xyz/openbmc_project/inventory/pcie_devices/Nvidia_ConnectX_0 | | |- /xyz/openbmc_project/inventory/pcie_devices/Nvidia_ConnectX_0/DOWN_0 | | |- /xyz/openbmc_project/inventory/pcie_devices/Nvidia_ConnectX_0/DOWN_1 | | `- /xyz/openbmc_project/inventory/pcie_devices/Nvidia_ConnectX_0/UP_0 | |- /xyz/openbmc_project/inventory/pcie_devices/Nvidia_ConnectX_1 | | |- /xyz/openbmc_project/inventory/pcie_devices/Nvidia_ConnectX_1/DOWN_0 | | |- /xyz/openbmc_project/inventory/pcie_devices/Nvidia_ConnectX_1/DOWN_1 | | `- /xyz/openbmc_project/inventory/pcie_devices/Nvidia_ConnectX_1/UP_0 | |- /xyz/openbmc_project/inventory/pcie_devices/Nvidia_ConnectX_2 | | |- 
/xyz/openbmc_project/inventory/pcie_devices/Nvidia_ConnectX_2/DOWN_0 | | |- /xyz/openbmc_project/inventory/pcie_devices/Nvidia_ConnectX_2/DOWN_1 | | `- /xyz/openbmc_project/inventory/pcie_devices/Nvidia_ConnectX_2/UP_0 | `- /xyz/openbmc_project/inventory/pcie_devices/Nvidia_ConnectX_3 | |- /xyz/openbmc_project/inventory/pcie_devices/Nvidia_ConnectX_3/DOWN_0 | |- /xyz/openbmc_project/inventory/pcie_devices/Nvidia_ConnectX_3/DOWN_1 | `- /xyz/openbmc_project/inventory/pcie_devices/Nvidia_ConnectX_3/UP_0 `- /xyz/openbmc_project/sensors root@nvl32-obmc:~# busctl -l introspect xyz.openbmc_project.GpuSensor /xyz/openbmc_project/inventory/pcie_devices/Nvidia_ConnectX_1/DOWN_0 NAME TYPE SIGNATURE RESULT/VALUE FLAGS org.freedesktop.DBus.Introspectable interface - - - .Introspect method - s - org.freedesktop.DBus.Peer interface - - - .GetMachineId method - s - .Ping method - - - org.freedesktop.DBus.Properties interface - - - .Get method ss v - .GetAll method s a{sv} - .Set method ssv - - .PropertiesChanged signal sa{sv}as - - xyz.openbmc_project.Association.Definitions interface - - - .Associations property a(sss) 1 "connected_to" "connecting" "/xyz/openbmc_project/inventory/pcie_devices/Nvidia_ConnectX_1" emits-change xyz.openbmc_project.Inventory.Connector.Port interface - - - .PortProtocol property s "xyz.openbmc_project.Inventory.Connector.Port.PortProtocol.PCIe" emits-change .PortType property s "xyz.openbmc_project.Inventory.Connector.Port.PortType.DownstreamPort" emits-change .Speed property t 34359738368 emits-change .Width property u 16 emits-change ``` Change-Id: I2845f090ac92c8ff6a742ec83c23073e6ea4e1b6 Signed-off-by: Harshit Aghera <haghera@nvidia.com>
1 parent b4f775c commit 68a8e2d

8 files changed

Lines changed: 802 additions & 4 deletions

src/nvidia-gpu/NvidiaGpuMctpVdm.cpp

Lines changed: 97 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
#include <cerrno>
1313
#include <cstddef>
1414
#include <cstdint>
15+
#include <cstring>
1516
#include <span>
1617
#include <vector>
1718

@@ -564,12 +565,105 @@ int decodeQueryScalarGroupTelemetryV2Response(
564565
telemetryValues.resize(numTelemetryValues);
565566
}
566567

567-
const auto* telemetryDataPtr = reinterpret_cast<const uint32_t*>(
568-
buf.data() + sizeof(ocp::accelerator_management::CommonResponse));
568+
const auto* telemetryDataPtr =
569+
buf.data() + sizeof(ocp::accelerator_management::CommonResponse);
569570

570571
for (size_t i = 0; i < numTelemetryValues; i++)
571572
{
572-
telemetryValues[i] = le32toh(telemetryDataPtr[i]);
573+
std::memcpy(&telemetryValues[i],
574+
telemetryDataPtr + i * sizeof(uint32_t), sizeof(uint32_t));
575+
576+
telemetryValues[i] = le32toh(telemetryValues[i]);
577+
}
578+
579+
return 0;
580+
}
581+
582+
int encodeListPciePortsRequest(uint8_t instanceId, std::span<uint8_t> buf)
583+
{
584+
if (buf.size() < sizeof(ocp::accelerator_management::CommonRequest))
585+
{
586+
return EINVAL;
587+
}
588+
589+
auto* msg = reinterpret_cast<ocp::accelerator_management::CommonRequest*>(
590+
buf.data());
591+
592+
ocp::accelerator_management::BindingPciVidInfo header{};
593+
header.ocp_accelerator_management_msg_type =
594+
static_cast<uint8_t>(ocp::accelerator_management::MessageType::REQUEST);
595+
header.instance_id = instanceId &
596+
ocp::accelerator_management::instanceIdBitMask;
597+
header.msg_type = static_cast<uint8_t>(MessageType::PCIE_LINK);
598+
599+
auto rc = packHeader(header, msg->msgHdr.hdr);
600+
601+
if (rc != 0)
602+
{
603+
return rc;
604+
}
605+
606+
msg->command = static_cast<uint8_t>(PcieLinkCommands::ListPCIePorts);
607+
msg->data_size = 0;
608+
609+
return 0;
610+
}
611+
612+
int decodeListPciePortsResponse(
613+
std::span<const uint8_t> buf,
614+
ocp::accelerator_management::CompletionCode& cc, uint16_t& reasonCode,
615+
uint16_t& numUpstreamPorts, std::vector<uint8_t>& numDownstreamPorts)
616+
{
617+
auto rc =
618+
ocp::accelerator_management::decodeReasonCodeAndCC(buf, cc, reasonCode);
619+
620+
if (rc != 0 || cc != ocp::accelerator_management::CompletionCode::SUCCESS)
621+
{
622+
return rc;
623+
}
624+
625+
if (buf.size() < sizeof(ListPCIePortsResponse))
626+
{
627+
return EINVAL;
628+
}
629+
630+
const auto* response =
631+
reinterpret_cast<const ListPCIePortsResponse*>(buf.data());
632+
633+
const uint16_t dataSize = le16toh(response->hdr.data_size);
634+
635+
if (dataSize < sizeof(uint16_t))
636+
{
637+
return EINVAL;
638+
}
639+
640+
uint16_t upstreamPorts = le16toh(response->numUpstreamPorts);
641+
642+
numUpstreamPorts = 0;
643+
numDownstreamPorts.clear();
644+
numDownstreamPorts.reserve(upstreamPorts);
645+
646+
size_t offset = sizeof(ListPCIePortsResponse);
647+
648+
for (size_t i = 0; i < upstreamPorts; i++)
649+
{
650+
if (offset + sizeof(ListPCIePortsDownstreamPortsData) > buf.size())
651+
{
652+
return EINVAL;
653+
}
654+
655+
const auto* downstreamPortData =
656+
reinterpret_cast<const ListPCIePortsDownstreamPortsData*>(
657+
buf.data() + offset);
658+
659+
// Count only external upstream ports
660+
if (downstreamPortData->isInternal == 0)
661+
{
662+
++numUpstreamPorts;
663+
numDownstreamPorts.push_back(downstreamPortData->count);
664+
}
665+
666+
offset += sizeof(ListPCIePortsDownstreamPortsData);
573667
}
574668

575669
return 0;

src/nvidia-gpu/NvidiaGpuMctpVdm.hpp

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,7 @@ enum class PlatformEnvironmentalCommands : uint8_t
4848

4949
/** @brief Command codes written to the `command` field of PCIe-link requests. */
enum class PcieLinkCommands : uint8_t
{
    /** Enumerate the PCIe ports of the device. */
    ListPCIePorts = 0x07,
    /** Query a group of scalar telemetry values (V2 response format). */
    QueryScalarGroupTelemetryV2 = 0x24,
};
5354

@@ -176,6 +177,18 @@ struct GetVoltageResponse
176177
uint32_t voltage;
177178
} __attribute__((packed));
178179

180+
struct ListPCIePortsResponse
181+
{
182+
ocp::accelerator_management::CommonResponse hdr;
183+
uint16_t numUpstreamPorts;
184+
} __attribute__((packed));
185+
186+
/**
 * @brief One per-upstream-port record in a ListPCIePorts response.
 *
 * Records are laid out back to back after ListPCIePortsResponse and the
 * decoder advances through the buffer in sizeof() steps.
 */
struct ListPCIePortsDownstreamPortsData
{
    /** Zero for an external upstream port; the decoder skips non-zero ones. */
    uint8_t isInternal;
    /** Number of downstream ports reported for this upstream port. */
    uint8_t count;
} __attribute__((packed));

// The decoder overlays this struct on raw response bytes, so a change in its
// size would silently misalign every record after the first.
static_assert(sizeof(ListPCIePortsDownstreamPortsData) == 2,
              "ListPCIePortsDownstreamPortsData must match the 2-byte wire "
              "record");
191+
179192
struct GetInventoryInformationRequest
180193
{
181194
ocp::accelerator_management::CommonRequest hdr;
@@ -255,4 +268,11 @@ int decodeQueryScalarGroupTelemetryV2Response(
255268
ocp::accelerator_management::CompletionCode& cc, uint16_t& reasonCode,
256269
size_t& numTelemetryValues, std::vector<uint32_t>& telemetryValues);
257270

271+
int encodeListPciePortsRequest(uint8_t instanceId, std::span<uint8_t> buf);
272+
273+
int decodeListPciePortsResponse(
274+
std::span<const uint8_t> buf,
275+
ocp::accelerator_management::CompletionCode& cc, uint16_t& reasonCode,
276+
uint16_t& numUpstreamPorts, std::vector<uint8_t>& numDownstreamPorts);
277+
258278
} // namespace gpu

src/nvidia-gpu/NvidiaPcieDevice.cpp

Lines changed: 94 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,19 +6,25 @@
66
#include "NvidiaPcieDevice.hpp"
77

88
#include "NvidiaDeviceDiscovery.hpp"
9+
#include "NvidiaGpuMctpVdm.hpp"
910
#include "NvidiaPcieInterface.hpp"
11+
#include "NvidiaPciePort.hpp"
1012
#include "Utils.hpp"
1113

1214
#include <MctpRequester.hpp>
15+
#include <OcpMctpVdm.hpp>
1316
#include <boost/asio/io_context.hpp>
1417
#include <phosphor-logging/lg2.hpp>
1518
#include <sdbusplus/asio/connection.hpp>
1619
#include <sdbusplus/asio/object_server.hpp>
1720

1821
#include <chrono>
1922
#include <cstdint>
23+
#include <format>
2024
#include <memory>
25+
#include <span>
2126
#include <string>
27+
#include <system_error>
2228

2329
PcieDevice::PcieDevice(const SensorConfigs& configs, const std::string& name,
2430
const std::string& path,
@@ -34,6 +40,65 @@ PcieDevice::PcieDevice(const SensorConfigs& configs, const std::string& name,
3440

3541
void PcieDevice::init()
3642
{
43+
getPciePortCounts();
44+
}
45+
46+
void PcieDevice::getPciePortCounts()
47+
{
48+
const int rc = gpu::encodeListPciePortsRequest(0, getPciePortCountsRequest);
49+
50+
if (rc != 0)
51+
{
52+
lg2::error(
53+
"Error updating PCIe Port Counts: encode failed, rc={RC}, EID={EID}",
54+
"RC", rc, "EID", eid);
55+
return;
56+
}
57+
58+
mctpRequester.sendRecvMsg(
59+
eid, getPciePortCountsRequest,
60+
[weak{weak_from_this()}](const std::error_code& ec,
61+
std::span<const uint8_t> buffer) {
62+
std::shared_ptr<PcieDevice> self = weak.lock();
63+
if (!self)
64+
{
65+
lg2::error("Invalid reference to PcieDevice");
66+
return;
67+
}
68+
self->processPciePortCountsResponse(ec, buffer);
69+
});
70+
}
71+
72+
void PcieDevice::processPciePortCountsResponse(
73+
const std::error_code& ec, std::span<const uint8_t> response)
74+
{
75+
if (ec)
76+
{
77+
lg2::error(
78+
"Error processing PCIe Port Counts response: sending message over MCTP failed, rc={RC}, EID={EID}",
79+
"RC", ec.message(), "EID", eid);
80+
return;
81+
}
82+
83+
ocp::accelerator_management::CompletionCode cc{};
84+
uint16_t reasonCode = 0;
85+
86+
const int rc = gpu::decodeListPciePortsResponse(
87+
response, cc, reasonCode, pcieDeviceInfo.numUpstreamPorts,
88+
pcieDeviceInfo.numDownstreamPorts);
89+
90+
if (rc != 0 || cc != ocp::accelerator_management::CompletionCode::SUCCESS)
91+
{
92+
lg2::error(
93+
"Error processing PCIe Port Counts response: decode failed, rc={RC}, cc={CC}, reasonCode={RESC}, EID={EID}",
94+
"RC", rc, "CC", static_cast<uint8_t>(cc), "RESC", reasonCode, "EID",
95+
eid);
96+
return;
97+
}
98+
99+
lg2::info("PCIe Device with eid {EID} has {UP} upstream ports.", "EID", eid,
100+
"UP", pcieDeviceInfo.numUpstreamPorts);
101+
37102
makeSensors();
38103
}
39104

@@ -42,6 +107,30 @@ void PcieDevice::makeSensors()
42107
pcieInterface = std::make_shared<NvidiaPcieInterface>(
43108
conn, mctpRequester, name, path, eid, objectServer);
44109

110+
uint64_t downstreamPortIndex = 0;
111+
112+
for (uint64_t i = 0; i < pcieDeviceInfo.numUpstreamPorts; ++i)
113+
{
114+
const std::string portName = std::format("UP_{}", i);
115+
116+
pciePorts.emplace_back(std::make_shared<NvidiaPciePortInfo>(
117+
conn, mctpRequester, portName, name, path, eid,
118+
gpu::PciePortType::UPSTREAM, i, i, objectServer));
119+
120+
for (uint64_t j = 0; j < pcieDeviceInfo.numDownstreamPorts[i]; ++j)
121+
{
122+
const std::string portName =
123+
std::format("DOWN_{}", downstreamPortIndex);
124+
125+
pciePorts.emplace_back(std::make_shared<NvidiaPciePortInfo>(
126+
conn, mctpRequester, portName, name, path, eid,
127+
gpu::PciePortType::DOWNSTREAM, i, downstreamPortIndex,
128+
objectServer));
129+
130+
++downstreamPortIndex;
131+
}
132+
}
133+
45134
lg2::info("Added PCIe {NAME} Sensors with chassis path: {PATH}.", "NAME",
46135
name, "PATH", path);
47136

@@ -52,6 +141,11 @@ void PcieDevice::read()
52141
{
53142
pcieInterface->update();
54143

144+
for (auto& port : pciePorts)
145+
{
146+
port->update();
147+
}
148+
55149
waitTimer.expires_after(std::chrono::milliseconds(sensorPollMs));
56150
waitTimer.async_wait([this](const boost::system::error_code& ec) {
57151
if (ec)

src/nvidia-gpu/NvidiaPcieDevice.hpp

Lines changed: 21 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
#include "NvidiaDeviceDiscovery.hpp"
1010
#include "NvidiaPcieInterface.hpp"
1111

12+
#include <NvidiaPciePort.hpp>
1213
#include <boost/asio/io_context.hpp>
1314
#include <boost/asio/steady_timer.hpp>
1415
#include <sdbusplus/asio/connection.hpp>
@@ -18,11 +19,18 @@
1819
#include <cstdint>
1920
#include <memory>
2021
#include <string>
22+
#include <vector>
2123

2224
constexpr const char* pcieDevicePathPrefix =
2325
"/xyz/openbmc_project/inventory/pcie_devices/";
2426

25-
class PcieDevice
27+
/**
 * @brief PCIe port counts discovered for one device.
 *
 * numDownstreamPorts is indexed by upstream port, so it is expected to hold
 * exactly numUpstreamPorts entries.
 */
struct PcieDeviceInfo
{
    /** Number of upstream ports for which port objects are created. */
    uint16_t numUpstreamPorts{};
    /** Downstream-port count for each upstream port, in upstream-port order. */
    std::vector<uint8_t> numDownstreamPorts;
};
32+
33+
class PcieDevice : public std::enable_shared_from_this<PcieDevice>
2634
{
2735
public:
2836
PcieDevice(const SensorConfigs& configs, const std::string& name,
@@ -44,6 +52,13 @@ class PcieDevice
4452

4553
void read();
4654

55+
void getPciePortCounts();
56+
57+
void processPciePortCountsResponse(const std::error_code& ec,
58+
std::span<const uint8_t> response);
59+
60+
PcieDeviceInfo pcieDeviceInfo;
61+
4762
uint8_t eid{};
4863

4964
std::chrono::milliseconds sensorPollMs;
@@ -62,5 +77,10 @@ class PcieDevice
6277

6378
std::string path;
6479

80+
std::array<uint8_t, sizeof(ocp::accelerator_management::CommonRequest)>
81+
getPciePortCountsRequest{};
82+
6583
std::shared_ptr<NvidiaPcieInterface> pcieInterface;
84+
85+
std::vector<std::shared_ptr<NvidiaPciePortInfo>> pciePorts;
6686
};

0 commit comments

Comments
 (0)