From 1a87d69479d124b2bf7246ac49aa86bd7fe4a690 Mon Sep 17 00:00:00 2001 From: Long Li Date: Thu, 19 Jun 2025 01:02:03 +0000 Subject: [PATCH 1/6] net: mana: Create separate EQs for each vPort To prepare for assigning vPorts to dedicated MSI-X vectors, remove EQ sharing among the vPorts and create dedicated EQs for each vPort. Move the EQ definition from struct mana_context to struct mana_port_context and update related support functions. Export mana_create_eq() and mana_destroy_eq() for use by the MANA RDMA driver. --- drivers/infiniband/hw/mana/main.c | 14 ++- drivers/infiniband/hw/mana/qp.c | 4 +- drivers/net/ethernet/microsoft/mana/mana_en.c | 111 ++++++++++-------- include/net/mana/mana.h | 7 +- 4 files changed, 83 insertions(+), 53 deletions(-) diff --git a/drivers/infiniband/hw/mana/main.c b/drivers/infiniband/hw/mana/main.c index fac159f7128d96..cfa954460585de 100644 --- a/drivers/infiniband/hw/mana/main.c +++ b/drivers/infiniband/hw/mana/main.c @@ -20,8 +20,10 @@ void mana_ib_uncfg_vport(struct mana_ib_dev *dev, struct mana_ib_pd *pd, pd->vport_use_count--; WARN_ON(pd->vport_use_count < 0); - if (!pd->vport_use_count) + if (!pd->vport_use_count) { + mana_destroy_eq(mpc); mana_uncfg_vport(mpc); + } mutex_unlock(&pd->vport_mutex); } @@ -55,15 +57,21 @@ int mana_ib_cfg_vport(struct mana_ib_dev *dev, u32 port, struct mana_ib_pd *pd, return err; } - mutex_unlock(&pd->vport_mutex); pd->tx_shortform_allowed = mpc->tx_shortform_allowed; pd->tx_vp_offset = mpc->tx_vp_offset; + err = mana_create_eq(mpc); + if (err) { + mana_uncfg_vport(mpc); + pd->vport_use_count--; + } + + mutex_unlock(&pd->vport_mutex); ibdev_dbg(&dev->ib_dev, "vport handle %llx pdid %x doorbell_id %x\n", mpc->port_handle, pd->pdn, doorbell_id); - return 0; + return err; } int mana_ib_alloc_pd(struct ib_pd *ibpd, struct ib_udata *udata) diff --git a/drivers/infiniband/hw/mana/qp.c b/drivers/infiniband/hw/mana/qp.c index 48c1f4977f218e..d71c301b29c22d 100644 --- a/drivers/infiniband/hw/mana/qp.c +++ b/drivers/infiniband/hw/mana/qp.c @@ -189,7 +189,7 @@ static int mana_ib_create_qp_rss(struct ib_qp *ibqp, struct ib_pd *pd, cq_spec.gdma_region = cq->queue.gdma_region; cq_spec.queue_size = cq->cqe * COMP_ENTRY_SIZE; cq_spec.modr_ctx_id = 0; - eq = &mpc->ac->eqs[cq->comp_vector]; + eq = &mpc->eqs[cq->comp_vector % mpc->num_queues]; cq_spec.attached_eq = eq->eq->id; ret = mana_create_wq_obj(mpc, mpc->port_handle, GDMA_RQ, @@ -341,7 +341,7 @@ static int mana_ib_create_qp_raw(struct ib_qp *ibqp, struct ib_pd *ibpd, cq_spec.queue_size = send_cq->cqe * COMP_ENTRY_SIZE; cq_spec.modr_ctx_id = 0; eq_vec = send_cq->comp_vector; - eq = &mpc->ac->eqs[eq_vec]; + eq = &mpc->eqs[eq_vec % mpc->num_queues]; cq_spec.attached_eq = eq->eq->id; err = mana_create_wq_obj(mpc, mpc->port_handle, GDMA_SQ, &wq_spec, diff --git a/drivers/net/ethernet/microsoft/mana/mana_en.c b/drivers/net/ethernet/microsoft/mana/mana_en.c index 9b5a72ada5c445..566e45a66adf26 100644 --- a/drivers/net/ethernet/microsoft/mana/mana_en.c +++ b/drivers/net/ethernet/microsoft/mana/mana_en.c @@ -1590,79 +1590,83 @@ void mana_destroy_wq_obj(struct mana_port_context *apc, u32 wq_type, } EXPORT_SYMBOL_NS(mana_destroy_wq_obj, "NET_MANA"); -static void mana_destroy_eq(struct mana_context *ac) +void mana_destroy_eq(struct mana_port_context *apc) { + struct mana_context *ac = apc->ac; struct gdma_context *gc = ac->gdma_dev->gdma_context; struct gdma_queue *eq; int i; - if (!ac->eqs) + if (!apc->eqs) return; - debugfs_remove_recursive(ac->mana_eqs_debugfs); - ac->mana_eqs_debugfs = NULL; 
+ debugfs_remove_recursive(apc->mana_eqs_debugfs); + apc->mana_eqs_debugfs = NULL; - for (i = 0; i < gc->max_num_queues; i++) { - eq = ac->eqs[i].eq; + for (i = 0; i < apc->num_queues; i++) { + eq = apc->eqs[i].eq; if (!eq) continue; mana_gd_destroy_queue(gc, eq); } - kfree(ac->eqs); - ac->eqs = NULL; + kfree(apc->eqs); + apc->eqs = NULL; } +EXPORT_SYMBOL_NS(mana_destroy_eq, "NET_MANA"); -static void mana_create_eq_debugfs(struct mana_context *ac, int i) +static void mana_create_eq_debugfs(struct mana_port_context *apc, int i) { - struct mana_eq eq = ac->eqs[i]; + struct mana_eq eq = apc->eqs[i]; char eqnum[32]; sprintf(eqnum, "eq%d", i); - eq.mana_eq_debugfs = debugfs_create_dir(eqnum, ac->mana_eqs_debugfs); + eq.mana_eq_debugfs = debugfs_create_dir(eqnum, apc->mana_eqs_debugfs); debugfs_create_u32("head", 0400, eq.mana_eq_debugfs, &eq.eq->head); debugfs_create_u32("tail", 0400, eq.mana_eq_debugfs, &eq.eq->tail); debugfs_create_file("eq_dump", 0400, eq.mana_eq_debugfs, eq.eq, &mana_dbg_q_fops); } -static int mana_create_eq(struct mana_context *ac) +int mana_create_eq(struct mana_port_context *apc) { - struct gdma_dev *gd = ac->gdma_dev; + struct gdma_dev *gd = apc->ac->gdma_dev; struct gdma_context *gc = gd->gdma_context; struct gdma_queue_spec spec = {}; int err; int i; - ac->eqs = kcalloc(gc->max_num_queues, sizeof(struct mana_eq), - GFP_KERNEL); - if (!ac->eqs) + WARN_ON(apc->eqs); + apc->eqs = kcalloc(apc->num_queues, sizeof(struct mana_eq), + GFP_KERNEL); + if (!apc->eqs) return -ENOMEM; spec.type = GDMA_EQ; spec.monitor_avl_buf = false; spec.queue_size = EQ_SIZE; spec.eq.callback = NULL; - spec.eq.context = ac->eqs; + spec.eq.context = apc->eqs; spec.eq.log2_throttle_limit = LOG2_EQ_THROTTLE; - ac->mana_eqs_debugfs = debugfs_create_dir("EQs", gc->mana_pci_debugfs); + apc->mana_eqs_debugfs = debugfs_create_dir("EQs", apc->mana_port_debugfs); - for (i = 0; i < gc->max_num_queues; i++) { + for (i = 0; i < apc->num_queues; i++) { spec.eq.msix_index = (i + 1) % gc->num_msix_usable; - err = mana_gd_create_mana_eq(gd, &spec, &ac->eqs[i].eq); + err = mana_gd_create_mana_eq(gd, &spec, &apc->eqs[i].eq); if (err) { dev_err(gc->dev, "Failed to create EQ %d : %d\n", i, err); goto out; } - mana_create_eq_debugfs(ac, i); + mana_create_eq_debugfs(apc, i); } return 0; out: - mana_destroy_eq(ac); + mana_destroy_eq(apc); return err; } +EXPORT_SYMBOL_NS(mana_create_eq, "NET_MANA"); static int mana_fence_rq(struct mana_port_context *apc, struct mana_rxq *rxq) { @@ -2381,7 +2385,7 @@ static int mana_create_txq(struct mana_port_context *apc, spec.monitor_avl_buf = false; spec.queue_size = cq_size; spec.cq.callback = mana_schedule_napi; - spec.cq.parent_eq = ac->eqs[i].eq; + spec.cq.parent_eq = apc->eqs[i].eq; spec.cq.context = cq; err = mana_gd_create_mana_wq_cq(gd, &spec, &cq->gdma_cq); if (err) @@ -2775,13 +2779,12 @@ static void mana_create_rxq_debugfs(struct mana_port_context *apc, int idx) static int mana_add_rx_queues(struct mana_port_context *apc, struct net_device *ndev) { - struct mana_context *ac = apc->ac; struct mana_rxq *rxq; int err = 0; int i; for (i = 0; i < apc->num_queues; i++) { - rxq = mana_create_rxq(apc, i, &ac->eqs[i], ndev); + rxq = mana_create_rxq(apc, i, &apc->eqs[i], ndev); if (!rxq) { err = -ENOMEM; netdev_err(ndev, "Failed to create rxq %d : %d\n", i, err); @@ -2800,9 +2803,8 @@ static int mana_add_rx_queues(struct mana_port_context *apc, return err; } -static void mana_destroy_vport(struct mana_port_context *apc) +static void mana_destroy_rxqs(struct mana_port_context *apc) { 
- struct gdma_dev *gd = apc->ac->gdma_dev; struct mana_rxq *rxq; u32 rxq_idx; @@ -2814,8 +2816,12 @@ static void mana_destroy_vport(struct mana_port_context *apc) mana_destroy_rxq(apc, rxq, true); apc->rxqs[rxq_idx] = NULL; } +} + +static void mana_destroy_vport(struct mana_port_context *apc) +{ + struct gdma_dev *gd = apc->ac->gdma_dev; - mana_destroy_txq(apc); mana_uncfg_vport(apc); if (gd->gdma_context->is_pf && !apc->ac->bm_hostmode) @@ -2836,11 +2842,7 @@ static int mana_create_vport(struct mana_port_context *apc, return err; } - err = mana_cfg_vport(apc, gd->pdid, gd->doorbell); - if (err) - return err; - - return mana_create_txq(apc, net); + return mana_cfg_vport(apc, gd->pdid, gd->doorbell); } static int mana_rss_table_alloc(struct mana_port_context *apc) @@ -3117,21 +3119,36 @@ int mana_alloc_queues(struct net_device *ndev) err = mana_create_vport(apc, ndev); if (err) { - netdev_err(ndev, "Failed to create vPort %u : %d\n", apc->port_idx, err); + netdev_err(ndev, "Failed to create vPort %u : %d\n", + apc->port_idx, err); return err; } + err = mana_create_eq(apc); + if (err) { + netdev_err(ndev, "Failed to create EQ on vPort %u: %d\n", + apc->port_idx, err); + goto destroy_vport; + } + + err = mana_create_txq(apc, ndev); + if (err) { + netdev_err(ndev, "Failed to create TXQ on vPort %u: %d\n", + apc->port_idx, err); + goto destroy_eq; + } + err = netif_set_real_num_tx_queues(ndev, apc->num_queues); if (err) { netdev_err(ndev, "netif_set_real_num_tx_queues () failed for ndev with num_queues %u : %d\n", apc->num_queues, err); - goto destroy_vport; + goto destroy_txq; } err = mana_add_rx_queues(apc, ndev); if (err) - goto destroy_vport; + goto destroy_rxq; apc->rss_state = apc->num_queues > 1 ? TRI_STATE_TRUE : TRI_STATE_FALSE; @@ -3140,7 +3157,7 @@ int mana_alloc_queues(struct net_device *ndev) netdev_err(ndev, "netif_set_real_num_rx_queues () failed for ndev with num_queues %u : %d\n", apc->num_queues, err); - goto destroy_vport; + goto destroy_rxq; } mana_rss_table_init(apc); @@ -3148,19 +3165,25 @@ int mana_alloc_queues(struct net_device *ndev) err = mana_config_rss(apc, TRI_STATE_TRUE, true, true); if (err) { netdev_err(ndev, "Failed to configure RSS table: %d\n", err); - goto destroy_vport; + goto destroy_rxq; } if (gd->gdma_context->is_pf && !apc->ac->bm_hostmode) { err = mana_pf_register_filter(apc); if (err) - goto destroy_vport; + goto destroy_rxq; } mana_chn_setxdp(apc, mana_xdp_get(apc)); return 0; +destroy_rxq: + mana_destroy_rxqs(apc); +destroy_txq: + mana_destroy_txq(apc); +destroy_eq: + mana_destroy_eq(apc); destroy_vport: mana_destroy_vport(apc); return err; @@ -3263,6 +3286,9 @@ static int mana_dealloc_queues(struct net_device *ndev) netdev_err(ndev, "Failed to disable vPort: %d\n", err); /* Even in err case, still need to cleanup the vPort */ + mana_destroy_rxqs(apc); + mana_destroy_txq(apc); + mana_destroy_eq(apc); mana_destroy_vport(apc); return 0; @@ -3570,12 +3596,6 @@ int mana_probe(struct gdma_dev *gd, bool resuming) gd->driver_data = ac; } - err = mana_create_eq(ac); - if (err) { - dev_err(dev, "Failed to create EQs: %d\n", err); - goto out; - } - err = mana_query_device_cfg(ac, MANA_MAJOR_VERSION, MANA_MINOR_VERSION, MANA_MICRO_VERSION, &num_ports, &bm_hostmode); if (err) @@ -3714,7 +3734,6 @@ void mana_remove(struct gdma_dev *gd, bool suspending) free_netdev(ndev); } - mana_destroy_eq(ac); out: if (ac->per_port_queue_reset_wq) { destroy_workqueue(ac->per_port_queue_reset_wq); diff --git a/include/net/mana/mana.h b/include/net/mana/mana.h index 
a078af283bddbb..787e637059df97 100644 --- a/include/net/mana/mana.h +++ b/include/net/mana/mana.h @@ -478,8 +478,6 @@ struct mana_context { u8 bm_hostmode; struct mana_ethtool_hc_stats hc_stats; - struct mana_eq *eqs; - struct dentry *mana_eqs_debugfs; struct workqueue_struct *per_port_queue_reset_wq; /* Workqueue for querying hardware stats */ struct delayed_work gf_stats_work; @@ -499,6 +497,9 @@ struct mana_port_context { u8 mac_addr[ETH_ALEN]; + struct mana_eq *eqs; + struct dentry *mana_eqs_debugfs; + enum TRI_STATE rss_state; mana_handle_t default_rxobj; @@ -1023,6 +1024,8 @@ void mana_destroy_wq_obj(struct mana_port_context *apc, u32 wq_type, int mana_cfg_vport(struct mana_port_context *apc, u32 protection_dom_id, u32 doorbell_pg_id); void mana_uncfg_vport(struct mana_port_context *apc); +int mana_create_eq(struct mana_port_context *apc); +void mana_destroy_eq(struct mana_port_context *apc); struct net_device *mana_get_primary_netdev(struct mana_context *ac, u32 port_index, From 17ba6db46c56e06b31eedd781cf7e4d3d66d5e76 Mon Sep 17 00:00:00 2001 From: Long Li Date: Thu, 19 Jun 2025 02:05:21 +0000 Subject: [PATCH 2/6] net: mana: Query device capabilities and configure MSI-X sharing for EQs When querying the device, adjust the max number of queues to allow dedicated MSI-X vectors for each vPort. The number of queues per vPort is clamped to no less than 16. MSI-X sharing among vPorts is disabled by default and is only enabled when there are not enough MSI-X vectors for dedicated allocation. Rename mana_query_device_cfg() to mana_gd_query_device_cfg() as it is used at GDMA device probe time for querying device capabilities. --- .../net/ethernet/microsoft/mana/gdma_main.c | 66 ++++++++++++++++--- drivers/net/ethernet/microsoft/mana/mana_en.c | 36 +++++----- include/net/mana/gdma.h | 10 +++ 3 files changed, 89 insertions(+), 23 deletions(-) diff --git a/drivers/net/ethernet/microsoft/mana/gdma_main.c b/drivers/net/ethernet/microsoft/mana/gdma_main.c index 0055c231acf6d5..62e3a2eb68e0f8 100644 --- a/drivers/net/ethernet/microsoft/mana/gdma_main.c +++ b/drivers/net/ethernet/microsoft/mana/gdma_main.c @@ -107,6 +107,9 @@ static int mana_gd_query_max_resources(struct pci_dev *pdev) struct gdma_context *gc = pci_get_drvdata(pdev); struct gdma_query_max_resources_resp resp = {}; struct gdma_general_req req = {}; + unsigned int max_num_queues; + u8 bm_hostmode; + u16 num_ports; int err; mana_gd_init_req_hdr(&req.hdr, GDMA_QUERY_MAX_RESOURCES, @@ -152,6 +155,40 @@ static int mana_gd_query_max_resources(struct pci_dev *pdev) if (gc->max_num_queues > gc->num_msix_usable - 1) gc->max_num_queues = gc->num_msix_usable - 1; + err = mana_gd_query_device_cfg(gc, MANA_MAJOR_VERSION, MANA_MINOR_VERSION, + MANA_MICRO_VERSION, &num_ports, &bm_hostmode); + if (err) + return err; + + if (!num_ports) + return -EINVAL; + + /* + * Adjust gc->max_num_queues returned from the SOC to allow dedicated MSIx + * for each vPort. 
Reduce max_num_queues to no less than 16 if necessary + */ + max_num_queues = (gc->num_msix_usable - 1) / num_ports; + max_num_queues = roundup_pow_of_two(max(max_num_queues, 1U)); + if (max_num_queues < 16) + max_num_queues = 16; + + /* + * Use dedicated MSIx for EQs whenever possible, use MSIx sharing for + * Ethernet EQs when (max_num_queues * num_ports > num_msix_usable - 1) + */ + max_num_queues = min(gc->max_num_queues, max_num_queues); + if (max_num_queues * num_ports > gc->num_msix_usable - 1) + gc->msi_sharing = true; + + /* If MSI is shared, use max allowed value */ + if (gc->msi_sharing) + gc->max_num_queues_vport = min(gc->num_msix_usable - 1, gc->max_num_queues); + else + gc->max_num_queues_vport = max_num_queues; + + dev_info(gc->dev, "MSI sharing mode %d max queues %d\n", + gc->msi_sharing, gc->max_num_queues); + return 0; } @@ -1802,6 +1839,7 @@ static int mana_gd_setup_hwc_irqs(struct pci_dev *pdev) /* Need 1 interrupt for HWC */ max_irqs = min(num_online_cpus(), MANA_MAX_NUM_QUEUES) + 1; min_irqs = 2; + gc->msi_sharing = true; } nvec = pci_alloc_irq_vectors(pdev, min_irqs, max_irqs, PCI_IRQ_MSIX); @@ -1880,6 +1918,8 @@ static void mana_gd_remove_irqs(struct pci_dev *pdev) pci_free_irq_vectors(pdev); + bitmap_free(gc->msi_bitmap); + gc->msi_bitmap = NULL; gc->max_num_msix = 0; gc->num_msix_usable = 0; } @@ -1911,20 +1951,30 @@ static int mana_gd_setup(struct pci_dev *pdev) if (err) goto destroy_hwc; - err = mana_gd_query_max_resources(pdev); + err = mana_gd_detect_devices(pdev); if (err) goto destroy_hwc; - err = mana_gd_setup_remaining_irqs(pdev); - if (err) { - dev_err(gc->dev, "Failed to setup remaining IRQs: %d", err); - goto destroy_hwc; - } - - err = mana_gd_detect_devices(pdev); + err = mana_gd_query_max_resources(pdev); if (err) goto destroy_hwc; + if (!gc->msi_sharing) { + gc->msi_bitmap = bitmap_zalloc(gc->num_msix_usable, GFP_KERNEL); + if (!gc->msi_bitmap) { + err = -ENOMEM; + goto destroy_hwc; + } + /* Set bit for HWC */ + set_bit(0, gc->msi_bitmap); + } else { + err = mana_gd_setup_remaining_irqs(pdev); + if (err) { + dev_err(gc->dev, "Failed to setup remaining IRQs: %d", err); + goto destroy_hwc; + } + } + dev_dbg(&pdev->dev, "mana gdma setup successful\n"); return 0; diff --git a/drivers/net/ethernet/microsoft/mana/mana_en.c b/drivers/net/ethernet/microsoft/mana/mana_en.c index 566e45a66adf26..1e65670feb177c 100644 --- a/drivers/net/ethernet/microsoft/mana/mana_en.c +++ b/drivers/net/ethernet/microsoft/mana/mana_en.c @@ -1002,10 +1002,9 @@ static int mana_init_port_context(struct mana_port_context *apc) return !apc->rxqs ? 
-ENOMEM : 0; } -static int mana_send_request(struct mana_context *ac, void *in_buf, - u32 in_len, void *out_buf, u32 out_len) +static int gdma_mana_send_request(struct gdma_context *gc, void *in_buf, + u32 in_len, void *out_buf, u32 out_len) { - struct gdma_context *gc = ac->gdma_dev->gdma_context; struct gdma_resp_hdr *resp = out_buf; struct gdma_req_hdr *req = in_buf; struct device *dev = gc->dev; @@ -1039,6 +1038,14 @@ static int mana_send_request(struct mana_context *ac, void *in_buf, return 0; } +static int mana_send_request(struct mana_context *ac, void *in_buf, + u32 in_len, void *out_buf, u32 out_len) +{ + struct gdma_context *gc = ac->gdma_dev->gdma_context; + + return gdma_mana_send_request(gc, in_buf, in_len, out_buf, out_len); +} + static int mana_verify_resp_hdr(const struct gdma_resp_hdr *resp_hdr, const enum mana_command_code expected_code, const u32 min_size) @@ -1172,11 +1179,10 @@ static void mana_pf_deregister_filter(struct mana_port_context *apc) err, resp.hdr.status); } -static int mana_query_device_cfg(struct mana_context *ac, u32 proto_major_ver, - u32 proto_minor_ver, u32 proto_micro_ver, - u16 *max_num_vports, u8 *bm_hostmode) +int mana_gd_query_device_cfg(struct gdma_context *gc, u32 proto_major_ver, + u32 proto_minor_ver, u32 proto_micro_ver, + u16 *max_num_vports, u8 *bm_hostmode) { - struct gdma_context *gc = ac->gdma_dev->gdma_context; struct mana_query_device_cfg_resp resp = {}; struct mana_query_device_cfg_req req = {}; struct device *dev = gc->dev; @@ -1191,7 +1197,7 @@ static int mana_query_device_cfg(struct mana_context *ac, u32 proto_major_ver, req.proto_minor_ver = proto_minor_ver; req.proto_micro_ver = proto_micro_ver; - err = mana_send_request(ac, &req, sizeof(req), &resp, sizeof(resp)); + err = gdma_mana_send_request(gc, &req, sizeof(req), &resp, sizeof(resp)); if (err) { dev_err(dev, "Failed to query config: %d", err); return err; @@ -1219,8 +1225,6 @@ static int mana_query_device_cfg(struct mana_context *ac, u32 proto_major_ver, else *bm_hostmode = 0; - debugfs_create_u16("adapter-MTU", 0400, gc->mana_pci_debugfs, &gc->adapter_mtu); - return 0; } @@ -3334,7 +3338,7 @@ static int mana_probe_port(struct mana_context *ac, int port_idx, int err; ndev = alloc_etherdev_mq(sizeof(struct mana_port_context), - gc->max_num_queues); + gc->max_num_queues_vport); if (!ndev) return -ENOMEM; @@ -3343,8 +3347,8 @@ static int mana_probe_port(struct mana_context *ac, int port_idx, apc = netdev_priv(ndev); apc->ac = ac; apc->ndev = ndev; - apc->max_queues = gc->max_num_queues; - apc->num_queues = gc->max_num_queues; + apc->max_queues = gc->max_num_queues_vport; + apc->num_queues = gc->max_num_queues_vport; apc->tx_queue_size = DEF_TX_BUFFERS_PER_QUEUE; apc->rx_queue_size = DEF_RX_BUFFERS_PER_QUEUE; apc->port_handle = INVALID_MANA_HANDLE; @@ -3596,13 +3600,15 @@ int mana_probe(struct gdma_dev *gd, bool resuming) gd->driver_data = ac; } - err = mana_query_device_cfg(ac, MANA_MAJOR_VERSION, MANA_MINOR_VERSION, - MANA_MICRO_VERSION, &num_ports, &bm_hostmode); + err = mana_gd_query_device_cfg(gc, MANA_MAJOR_VERSION, MANA_MINOR_VERSION, + MANA_MICRO_VERSION, &num_ports, &bm_hostmode); if (err) goto out; ac->bm_hostmode = bm_hostmode; + debugfs_create_u16("adapter-MTU", 0400, gc->mana_pci_debugfs, &gc->adapter_mtu); + if (!resuming) { ac->num_ports = num_ports; diff --git a/include/net/mana/gdma.h b/include/net/mana/gdma.h index 766f4fb25e266d..c515909c94987d 100644 --- a/include/net/mana/gdma.h +++ b/include/net/mana/gdma.h @@ -394,6 +394,7 @@ struct gdma_context { /* 
Per-vPort max number of queues */
 	unsigned int max_num_queues;
+	unsigned int max_num_queues_vport;
 	unsigned int max_num_msix;
 	unsigned int num_msix_usable;
 	struct xarray irq_contexts;
@@ -438,6 +439,12 @@ struct gdma_context {
 	struct workqueue_struct *service_wq;
 	unsigned long flags;
+
+	/* Indicate if this device is sharing MSI for EQs on MANA */
+	bool msi_sharing;
+
+	/* Bitmap tracks where MSI is allocated when it is not shared for EQs */
+	unsigned long *msi_bitmap;
 };
 
 static inline bool mana_gd_is_mana(struct gdma_dev *gd)
@@ -999,4 +1006,7 @@ int mana_gd_resume(struct pci_dev *pdev);
 
 bool mana_need_log(struct gdma_context *gc, int err);
 
+int mana_gd_query_device_cfg(struct gdma_context *gc, u32 proto_major_ver,
+			     u32 proto_minor_ver, u32 proto_micro_ver,
+			     u16 *max_num_vports, u8 *bm_hostmode);
 #endif /* _GDMA_H */

From fb1f64c7d7b1e283f96c45fff1863be4ccb1d60f Mon Sep 17 00:00:00 2001
From: Long Li
Date: Thu, 19 Jun 2025 03:35:52 +0000
Subject: [PATCH 3/6] net: mana: Introduce GIC context with refcounting for interrupt management

To allow Ethernet EQs to use dedicated or shared MSI-X vectors and RDMA EQs
to share the same MSI-X, introduce a GIC (GDMA IRQ Context) with reference
counting. This allows the driver to create an interrupt context on an
assigned or unassigned MSI-X vector and share it across multiple EQ
consumers.
---
 .../net/ethernet/microsoft/mana/gdma_main.c | 158 ++++++++++++++++++
 include/net/mana/gdma.h                     |  10 ++
 2 files changed, 168 insertions(+)

diff --git a/drivers/net/ethernet/microsoft/mana/gdma_main.c b/drivers/net/ethernet/microsoft/mana/gdma_main.c
index 62e3a2eb68e0f8..8c66e314f03c75 100644
--- a/drivers/net/ethernet/microsoft/mana/gdma_main.c
+++ b/drivers/net/ethernet/microsoft/mana/gdma_main.c
@@ -1558,6 +1558,163 @@ static irqreturn_t mana_gd_intr(int irq, void *arg)
 	return IRQ_HANDLED;
 }
 
+void mana_gd_put_gic(struct gdma_context *gc, bool use_msi_bitmap, int msi)
+{
+	struct pci_dev *dev = to_pci_dev(gc->dev);
+	struct msi_map irq_map;
+	struct gdma_irq_context *gic;
+	int irq;
+
+	mutex_lock(&gc->gic_mutex);
+
+	gic = xa_load(&gc->irq_contexts, msi);
+	if (WARN_ON(!gic)) {
+		mutex_unlock(&gc->gic_mutex);
+		return;
+	}
+
+	if (use_msi_bitmap)
+		gic->bitmap_refs--;
+
+	if (use_msi_bitmap && gic->bitmap_refs == 0)
+		clear_bit(msi, gc->msi_bitmap);
+
+	if (!refcount_dec_and_test(&gic->refcount))
+		goto out;
+
+	irq = pci_irq_vector(dev, msi);
+
+	irq_update_affinity_hint(irq, NULL);
+	free_irq(irq, gic);
+
+	if (pci_msix_can_alloc_dyn(dev)) {
+		irq_map.virq = irq;
+		irq_map.index = msi;
+		pci_msix_free_irq(dev, irq_map);
+	}
+
+	xa_erase(&gc->irq_contexts, msi);
+	kfree(gic);
+
+out:
+	mutex_unlock(&gc->gic_mutex);
+}
+EXPORT_SYMBOL_NS(mana_gd_put_gic, "NET_MANA");
+
+/*
+ * Get a GIC (GDMA IRQ Context) on an MSI vector.
+ * An MSI can be shared between different EQs; this function supports setting
+ * up separate MSIs using a bitmap, or directly using the MSI index.
+ *
+ * @use_msi_bitmap:
+ * True if the MSI is assigned by this function from available slots in the bitmap.
+ * False if the MSI is passed in via *msi_requested.
+ */
+struct gdma_irq_context *mana_gd_get_gic(struct gdma_context *gc,
+					 bool use_msi_bitmap,
+					 int *msi_requested)
+{
+	struct gdma_irq_context *gic;
+	struct pci_dev *dev = to_pci_dev(gc->dev);
+	struct msi_map irq_map = { };
+	int irq;
+	int msi;
+	int err;
+
+	mutex_lock(&gc->gic_mutex);
+
+	if (use_msi_bitmap) {
+		msi = find_first_zero_bit(gc->msi_bitmap, gc->num_msix_usable);
+		if (msi >= gc->num_msix_usable) {
+			dev_err(gc->dev, "No free MSI vectors available\n");
+			gic = NULL;
+			goto out;
+		}
+		*msi_requested = msi;
+	} else {
+		msi = *msi_requested;
+	}
+
+	gic = xa_load(&gc->irq_contexts, msi);
+	if (gic) {
+		refcount_inc(&gic->refcount);
+		if (use_msi_bitmap) {
+			gic->bitmap_refs++;
+			set_bit(msi, gc->msi_bitmap);
+		}
+		goto out;
+	}
+
+	irq = pci_irq_vector(dev, msi);
+	if (irq == -EINVAL) {
+		irq_map = pci_msix_alloc_irq_at(dev, msi, NULL);
+		if (!irq_map.virq) {
+			err = irq_map.index;
+			dev_err(gc->dev,
+				"Failed to alloc irq_map msi %d err %d\n",
+				msi, err);
+			gic = NULL;
+			goto out;
+		}
+		irq = irq_map.virq;
+		msi = irq_map.index;
+	}
+
+	gic = kzalloc(sizeof(*gic), GFP_KERNEL);
+	if (!gic) {
+		if (irq_map.virq)
+			pci_msix_free_irq(dev, irq_map);
+		goto out;
+	}
+
+	gic->handler = mana_gd_process_eq_events;
+	gic->msi = msi;
+	gic->irq = irq;
+	INIT_LIST_HEAD(&gic->eq_list);
+	spin_lock_init(&gic->lock);
+
+	if (!gic->msi)
+		snprintf(gic->name, MANA_IRQ_NAME_SZ, "mana_hwc@pci:%s",
+			 pci_name(dev));
+	else
+		snprintf(gic->name, MANA_IRQ_NAME_SZ, "mana_msi%d@pci:%s",
+			 gic->msi, pci_name(dev));
+
+	err = request_irq(irq, mana_gd_intr, 0, gic->name, gic);
+	if (err) {
+		dev_err(gc->dev, "Failed to request irq %d %s\n",
+			irq, gic->name);
+		kfree(gic);
+		gic = NULL;
+		if (irq_map.virq)
+			pci_msix_free_irq(dev, irq_map);
+		goto out;
+	}
+
+	refcount_set(&gic->refcount, 1);
+	gic->bitmap_refs = use_msi_bitmap ? 1 : 0;
+
+	err = xa_err(xa_store(&gc->irq_contexts, msi, gic, GFP_KERNEL));
+	if (err) {
+		dev_err(gc->dev, "Failed to store irq context for msi %d: %d\n",
+			msi, err);
+		free_irq(irq, gic);
+		kfree(gic);
+		gic = NULL;
+		if (irq_map.virq)
+			pci_msix_free_irq(dev, irq_map);
+		goto out;
+	}
+
+	if (use_msi_bitmap)
+		set_bit(msi, gc->msi_bitmap);
+
+out:
+	mutex_unlock(&gc->gic_mutex);
+	return gic;
+}
+EXPORT_SYMBOL_NS(mana_gd_get_gic, "NET_MANA");
+
 int mana_gd_alloc_res_map(u32 res_avail, struct gdma_resource *r)
 {
 	r->map = bitmap_zalloc(res_avail, GFP_KERNEL);
@@ -2040,6 +2197,7 @@ static int mana_gd_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
 		goto release_region;
 
 	mutex_init(&gc->eq_test_event_mutex);
+	mutex_init(&gc->gic_mutex);
 	pci_set_drvdata(pdev, gc);
 	gc->bar0_pa = pci_resource_start(pdev, 0);
 
diff --git a/include/net/mana/gdma.h b/include/net/mana/gdma.h
index c515909c94987d..9c4945b304004e 100644
--- a/include/net/mana/gdma.h
+++ b/include/net/mana/gdma.h
@@ -382,6 +382,10 @@ struct gdma_irq_context {
 	spinlock_t lock;
 	struct list_head eq_list;
 	char name[MANA_IRQ_NAME_SZ];
+	unsigned int msi;
+	unsigned int irq;
+	refcount_t refcount;
+	unsigned int bitmap_refs;
 };
 
 enum gdma_context_flags {
@@ -440,6 +444,9 @@ struct gdma_context {
 
 	unsigned long flags;
+
+	/* Protect access to GIC context */
+	struct mutex gic_mutex;
+
 	/* Indicate if this device is sharing MSI for EQs on MANA */
 	bool msi_sharing;
 
@@ -1006,6 +1013,9 @@ int mana_gd_resume(struct pci_dev *pdev);
 
 bool mana_need_log(struct gdma_context *gc, int err);
 
+struct gdma_irq_context *mana_gd_get_gic(struct gdma_context *gc, bool use_msi_bitmap,
+					 int *msi_requested);
+void mana_gd_put_gic(struct gdma_context *gc, bool use_msi_bitmap, int msi);
 int mana_gd_query_device_cfg(struct gdma_context *gc, u32 proto_major_ver,
 			     u32 proto_minor_ver, u32 proto_micro_ver,
 			     u16 *max_num_vports, u8 *bm_hostmode);
 #endif /* _GDMA_H */

From e1b03a3b0702182752d0710b0e4eccf624dd8145 Mon Sep 17 00:00:00 2001
From: Long Li
Date: Thu, 19 Jun 2025 03:43:54 +0000
Subject: [PATCH 4/6] net: mana: Use GIC functions to allocate global EQs

Replace the GDMA global interrupt setup code with the new GIC allocation and
release functions for managing interrupt contexts.
--- .../net/ethernet/microsoft/mana/gdma_main.c | 83 +++---------------- 1 file changed, 10 insertions(+), 73 deletions(-) diff --git a/drivers/net/ethernet/microsoft/mana/gdma_main.c b/drivers/net/ethernet/microsoft/mana/gdma_main.c index 8c66e314f03c75..9f5381821def68 100644 --- a/drivers/net/ethernet/microsoft/mana/gdma_main.c +++ b/drivers/net/ethernet/microsoft/mana/gdma_main.c @@ -1830,30 +1830,13 @@ static int mana_gd_setup_dyn_irqs(struct pci_dev *pdev, int nvec) * further used in irq_setup() */ for (i = 1; i <= nvec; i++) { - gic = kzalloc(sizeof(*gic), GFP_KERNEL); + gic = mana_gd_get_gic(gc, false, &i); if (!gic) { err = -ENOMEM; goto free_irq; } - gic->handler = mana_gd_process_eq_events; - INIT_LIST_HEAD(&gic->eq_list); - spin_lock_init(&gic->lock); - - snprintf(gic->name, MANA_IRQ_NAME_SZ, "mana_q%d@pci:%s", - i - 1, pci_name(pdev)); - - /* one pci vector is already allocated for HWC */ - irqs[i - 1] = pci_irq_vector(pdev, i); - if (irqs[i - 1] < 0) { - err = irqs[i - 1]; - goto free_current_gic; - } - - err = request_irq(irqs[i - 1], mana_gd_intr, 0, gic->name, gic); - if (err) - goto free_current_gic; - xa_store(&gc->irq_contexts, i, gic, GFP_KERNEL); + irqs[i - 1] = gic->irq; } /* @@ -1875,19 +1858,11 @@ static int mana_gd_setup_dyn_irqs(struct pci_dev *pdev, int nvec) kfree(irqs); return 0; -free_current_gic: - kfree(gic); free_irq: for (i -= 1; i > 0; i--) { irq = pci_irq_vector(pdev, i); - gic = xa_load(&gc->irq_contexts, i); - if (WARN_ON(!gic)) - continue; - irq_update_affinity_hint(irq, NULL); - free_irq(irq, gic); - xa_erase(&gc->irq_contexts, i); - kfree(gic); + mana_gd_put_gic(gc, false, i); } kfree(irqs); return err; @@ -1908,34 +1883,13 @@ static int mana_gd_setup_irqs(struct pci_dev *pdev, int nvec) start_irqs = irqs; for (i = 0; i < nvec; i++) { - gic = kzalloc(sizeof(*gic), GFP_KERNEL); + gic = mana_gd_get_gic(gc, false, &i); if (!gic) { err = -ENOMEM; goto free_irq; } - gic->handler = mana_gd_process_eq_events; - INIT_LIST_HEAD(&gic->eq_list); - spin_lock_init(&gic->lock); - - if (!i) - snprintf(gic->name, MANA_IRQ_NAME_SZ, "mana_hwc@pci:%s", - pci_name(pdev)); - else - snprintf(gic->name, MANA_IRQ_NAME_SZ, "mana_q%d@pci:%s", - i - 1, pci_name(pdev)); - - irqs[i] = pci_irq_vector(pdev, i); - if (irqs[i] < 0) { - err = irqs[i]; - goto free_current_gic; - } - - err = request_irq(irqs[i], mana_gd_intr, 0, gic->name, gic); - if (err) - goto free_current_gic; - - xa_store(&gc->irq_contexts, i, gic, GFP_KERNEL); + irqs[i] = gic->irq; } /* If number of IRQ is one extra than number of online CPUs, @@ -1964,19 +1918,11 @@ static int mana_gd_setup_irqs(struct pci_dev *pdev, int nvec) kfree(start_irqs); return 0; -free_current_gic: - kfree(gic); free_irq: for (i -= 1; i >= 0; i--) { irq = pci_irq_vector(pdev, i); - gic = xa_load(&gc->irq_contexts, i); - if (WARN_ON(!gic)) - continue; - irq_update_affinity_hint(irq, NULL); - free_irq(irq, gic); - xa_erase(&gc->irq_contexts, i); - kfree(gic); + mana_gd_put_gic(gc, false, i); } kfree(start_irqs); @@ -2051,26 +1997,17 @@ static int mana_gd_setup_remaining_irqs(struct pci_dev *pdev) static void mana_gd_remove_irqs(struct pci_dev *pdev) { struct gdma_context *gc = pci_get_drvdata(pdev); - struct gdma_irq_context *gic; int irq, i; if (gc->max_num_msix < 1) return; - for (i = 0; i < gc->max_num_msix; i++) { - irq = pci_irq_vector(pdev, i); - if (irq < 0) - continue; - - gic = xa_load(&gc->irq_contexts, i); - if (WARN_ON(!gic)) - continue; - + for (i = 0; i < (gc->msi_sharing ? 
gc->max_num_msix : 1); i++) { /* Need to clear the hint before free_irq */ + irq = pci_irq_vector(pdev, i); irq_update_affinity_hint(irq, NULL); - free_irq(irq, gic); - xa_erase(&gc->irq_contexts, i); - kfree(gic); + + mana_gd_put_gic(gc, false, i); } pci_free_irq_vectors(pdev); From ce9bb95eaefed57ae5fa4c5312a085aa04110f82 Mon Sep 17 00:00:00 2001 From: Long Li Date: Thu, 19 Jun 2025 03:50:48 +0000 Subject: [PATCH 5/6] net: mana: Allocate interrupt context for each EQ when creating vPort Use GIC functions to create a dedicated interrupt context or acquire a shared interrupt context for each EQ when setting up a vPort. --- drivers/net/ethernet/microsoft/mana/gdma_main.c | 2 +- drivers/net/ethernet/microsoft/mana/mana_en.c | 17 ++++++++++++++++- include/net/mana/gdma.h | 1 + 3 files changed, 18 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/microsoft/mana/gdma_main.c b/drivers/net/ethernet/microsoft/mana/gdma_main.c index 9f5381821def68..3252d969f69df7 100644 --- a/drivers/net/ethernet/microsoft/mana/gdma_main.c +++ b/drivers/net/ethernet/microsoft/mana/gdma_main.c @@ -808,7 +808,6 @@ static void mana_gd_deregister_irq(struct gdma_queue *queue) } spin_unlock_irqrestore(&gic->lock, flags); - queue->eq.msix_index = INVALID_PCI_MSIX_INDEX; synchronize_rcu(); } @@ -923,6 +922,7 @@ static int mana_gd_create_eq(struct gdma_dev *gd, out: dev_err(dev, "Failed to create EQ: %d\n", err); mana_gd_destroy_eq(gc, false, queue); + queue->eq.msix_index = INVALID_PCI_MSIX_INDEX; return err; } diff --git a/drivers/net/ethernet/microsoft/mana/mana_en.c b/drivers/net/ethernet/microsoft/mana/mana_en.c index 1e65670feb177c..c0bd520dd54df5 100644 --- a/drivers/net/ethernet/microsoft/mana/mana_en.c +++ b/drivers/net/ethernet/microsoft/mana/mana_en.c @@ -1600,6 +1600,7 @@ void mana_destroy_eq(struct mana_port_context *apc) struct gdma_context *gc = ac->gdma_dev->gdma_context; struct gdma_queue *eq; int i; + unsigned int msi; if (!apc->eqs) return; @@ -1612,7 +1613,9 @@ void mana_destroy_eq(struct mana_port_context *apc) if (!eq) continue; + msi = eq->eq.msix_index; mana_gd_destroy_queue(gc, eq); + mana_gd_put_gic(gc, !gc->msi_sharing, msi); } kfree(apc->eqs); @@ -1629,6 +1632,7 @@ static void mana_create_eq_debugfs(struct mana_port_context *apc, int i) eq.mana_eq_debugfs = debugfs_create_dir(eqnum, apc->mana_eqs_debugfs); debugfs_create_u32("head", 0400, eq.mana_eq_debugfs, &eq.eq->head); debugfs_create_u32("tail", 0400, eq.mana_eq_debugfs, &eq.eq->tail); + debugfs_create_u32("irq", 0400, eq.mana_eq_debugfs, &eq.eq->eq.irq); debugfs_create_file("eq_dump", 0400, eq.mana_eq_debugfs, eq.eq, &mana_dbg_q_fops); } @@ -1639,6 +1643,7 @@ int mana_create_eq(struct mana_port_context *apc) struct gdma_queue_spec spec = {}; int err; int i; + struct gdma_irq_context *gic; WARN_ON(apc->eqs); apc->eqs = kcalloc(apc->num_queues, sizeof(struct mana_eq), @@ -1656,12 +1661,22 @@ int mana_create_eq(struct mana_port_context *apc) apc->mana_eqs_debugfs = debugfs_create_dir("EQs", apc->mana_port_debugfs); for (i = 0; i < apc->num_queues; i++) { - spec.eq.msix_index = (i + 1) % gc->num_msix_usable; + if (gc->msi_sharing) + spec.eq.msix_index = (i + 1) % gc->num_msix_usable; + + gic = mana_gd_get_gic(gc, !gc->msi_sharing, &spec.eq.msix_index); + if (!gic) { + err = -ENOMEM; + goto out; + } + err = mana_gd_create_mana_eq(gd, &spec, &apc->eqs[i].eq); if (err) { dev_err(gc->dev, "Failed to create EQ %d : %d\n", i, err); + mana_gd_put_gic(gc, !gc->msi_sharing, spec.eq.msix_index); goto out; } + apc->eqs[i].eq->eq.irq = 
gic->irq; mana_create_eq_debugfs(apc, i); } diff --git a/include/net/mana/gdma.h b/include/net/mana/gdma.h index 9c4945b304004e..1f12ba8c022a95 100644 --- a/include/net/mana/gdma.h +++ b/include/net/mana/gdma.h @@ -336,6 +336,7 @@ struct gdma_queue { void *context; unsigned int msix_index; + unsigned int irq; u32 log2_throttle_limit; } eq; From d03d242b88896655b24e29d9d8129068f3bd0de9 Mon Sep 17 00:00:00 2001 From: Long Li Date: Thu, 19 Jun 2025 03:56:54 +0000 Subject: [PATCH 6/6] RDMA/mana_ib: Allocate interrupt contexts on EQs Use the GIC functions to allocate interrupt contexts for RDMA EQs. These interrupt contexts may be shared with Ethernet EQs when MSI-X vectors are limited. The driver now supports allocating dedicated MSI-X for each EQ. Indicate this capability through driver capability bits. --- drivers/infiniband/hw/mana/main.c | 33 ++++++++++++++++++++++++++----- include/net/mana/gdma.h | 5 ++++- 2 files changed, 32 insertions(+), 6 deletions(-) diff --git a/drivers/infiniband/hw/mana/main.c b/drivers/infiniband/hw/mana/main.c index cfa954460585de..029609fb91c59a 100644 --- a/drivers/infiniband/hw/mana/main.c +++ b/drivers/infiniband/hw/mana/main.c @@ -787,6 +787,7 @@ int mana_ib_create_eqs(struct mana_ib_dev *mdev) { struct gdma_context *gc = mdev_to_gc(mdev); struct gdma_queue_spec spec = {}; + struct gdma_irq_context *gic; int err, i; spec.type = GDMA_EQ; @@ -797,9 +798,15 @@ int mana_ib_create_eqs(struct mana_ib_dev *mdev) spec.eq.log2_throttle_limit = LOG2_EQ_THROTTLE; spec.eq.msix_index = 0; + gic = mana_gd_get_gic(gc, false, &spec.eq.msix_index); + if (!gic) + return -ENOMEM; + err = mana_gd_create_mana_eq(mdev->gdma_dev, &spec, &mdev->fatal_err_eq); - if (err) + if (err) { + mana_gd_put_gic(gc, false, 0); return err; + } mdev->eqs = kcalloc(mdev->ib_dev.num_comp_vectors, sizeof(struct gdma_queue *), GFP_KERNEL); @@ -810,31 +817,47 @@ int mana_ib_create_eqs(struct mana_ib_dev *mdev) spec.eq.callback = NULL; for (i = 0; i < mdev->ib_dev.num_comp_vectors; i++) { spec.eq.msix_index = (i + 1) % gc->num_msix_usable; + + gic = mana_gd_get_gic(gc, false, &spec.eq.msix_index); + if (!gic) { + err = -ENOMEM; + goto destroy_eqs; + } + err = mana_gd_create_mana_eq(mdev->gdma_dev, &spec, &mdev->eqs[i]); - if (err) + if (err) { + mana_gd_put_gic(gc, false, spec.eq.msix_index); goto destroy_eqs; + } } return 0; destroy_eqs: - while (i-- > 0) + while (i-- > 0) { mana_gd_destroy_queue(gc, mdev->eqs[i]); + mana_gd_put_gic(gc, false, (i + 1) % gc->num_msix_usable); + } kfree(mdev->eqs); destroy_fatal_eq: mana_gd_destroy_queue(gc, mdev->fatal_err_eq); + mana_gd_put_gic(gc, false, 0); return err; } void mana_ib_destroy_eqs(struct mana_ib_dev *mdev) { struct gdma_context *gc = mdev_to_gc(mdev); - int i; + int i, msi; mana_gd_destroy_queue(gc, mdev->fatal_err_eq); + mana_gd_put_gic(gc, false, 0); - for (i = 0; i < mdev->ib_dev.num_comp_vectors; i++) + for (i = 0; i < mdev->ib_dev.num_comp_vectors; i++) { mana_gd_destroy_queue(gc, mdev->eqs[i]); + msi = (i + 1) % gc->num_msix_usable; + mana_gd_put_gic(gc, false, msi); + } kfree(mdev->eqs); } diff --git a/include/net/mana/gdma.h b/include/net/mana/gdma.h index 1f12ba8c022a95..5210f81318ce90 100644 --- a/include/net/mana/gdma.h +++ b/include/net/mana/gdma.h @@ -609,6 +609,8 @@ enum { /* Driver supports dynamic MSI-X vector allocation */ #define GDMA_DRV_CAP_FLAG_1_DYNAMIC_IRQ_ALLOC_SUPPORT BIT(13) +/* Driver supports separate EQ/MSIs for each vPort */ +#define GDMA_DRV_CAP_FLAG_1_EQ_MSI_UNSHARE_MULTI_VPORT BIT(19) /* Driver can self reset on EQE 
notification */
 #define GDMA_DRV_CAP_FLAG_1_SELF_RESET_ON_EQE BIT(14)
 
@@ -643,7 +645,8 @@ enum {
 	 GDMA_DRV_CAP_FLAG_1_PERIODIC_STATS_QUERY | \
 	 GDMA_DRV_CAP_FLAG_1_SKB_LINEARIZE | \
 	 GDMA_DRV_CAP_FLAG_1_PROBE_RECOVERY | \
-	 GDMA_DRV_CAP_FLAG_1_HANDLE_STALL_SQ_RECOVERY)
+	 GDMA_DRV_CAP_FLAG_1_HANDLE_STALL_SQ_RECOVERY | \
+	 GDMA_DRV_CAP_FLAG_1_EQ_MSI_UNSHARE_MULTI_VPORT)
 
 #define GDMA_DRV_CAP_FLAGS2 0