diff --git a/Documentation/netlink/specs/netdev.yaml b/Documentation/netlink/specs/netdev.yaml
index 596c306ce52b83..ed115801a1d28e 100644
--- a/Documentation/netlink/specs/netdev.yaml
+++ b/Documentation/netlink/specs/netdev.yaml
@@ -562,6 +562,13 @@ attribute-sets:
         type: u32
         checks:
           min: 1
+      -
+        name: ddi
+        doc: Use DDI (Data Direct Interface) for this dmabuf binding if the
+             dmabuf is backed by a device with a direct PCIe connection to the
+             NIC. When set, enables direct data transfer between the NIC and
+             the device, bypassing the CPU root complex.
+        type: u8
 
 operations:
   list:
@@ -769,6 +776,7 @@ operations:
             - ifindex
             - fd
             - queues
+            - ddi
         reply:
           attributes:
             - id
@@ -794,6 +802,7 @@ operations:
           attributes:
             - ifindex
             - fd
+            - ddi
         reply:
           attributes:
             - id
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h
index ea2cd1f5d1d078..16a42553e2a5af 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h
@@ -447,6 +447,7 @@ struct mlx5e_txqsq {
 	u8                         min_inline_mode;
 	struct device             *pdev;
 	__be32                     mkey_be;
+	__be32                     crossing_mkey_be;
 	unsigned long              state;
 	unsigned int               hw_mtu;
 	struct mlx5_clock         *clock;
@@ -676,6 +677,8 @@ struct mlx5e_rq {
 			struct mlx5e_mpw_info *info;
 			mlx5e_fp_skb_from_cqe_mpwrq skb_from_cqe_mpwrq;
 			__be32                 umr_mkey_be;
+			__be32                 ksm_child_mkey_be;
+			u32                    crossing_mkey;
 			u16                    num_strides;
 			u16                    actual_wq_head;
 			u8                     log_stride_sz;
@@ -971,6 +974,8 @@ struct mlx5e_priv {
 	struct mlx5e_hv_vhca_stats_agent stats_agent;
 #endif
 	struct mlx5e_scratchpad    scratchpad;
+	u32                        ddi_mkey;   /* DDI PA mkey with ethernet PD */
+	struct device             *ddi_dev;    /* cached DDI PCI device */
 	struct mlx5e_htb          *htb;
 	struct mlx5e_mqprio_rl    *mqprio_rl;
 	struct dentry             *dfs_root;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index b6c12460b54a9e..ba369905e1187a 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -43,6 +43,7 @@
 #include <net/netdev_queues.h>
 #include <net/netdev_rx_queue.h>
 #include <net/page_pool/types.h>
+#include <linux/pci.h>
 #include <net/pkt_sched.h>
 #include <net/xdp_sock_drv.h>
 #include "eswitch.h"
@@ -79,6 +80,8 @@
 #include "lib/sd.h"
 #include "en/pcie_cong_event.h"
 
+#define MLX5_DATA_DIRECT_DEVID 0x2100
+
 static bool mlx5e_hw_gro_supported(struct mlx5_core_dev *mdev)
 {
 	if (!MLX5_CAP_GEN(mdev, shampo) ||
@@ -492,6 +495,40 @@ static int mlx5e_create_umr_mkey(struct mlx5_core_dev *mdev,
 	return err;
 }
 
+static int mlx5e_create_crossing_vhca_mkey(struct mlx5_core_dev *mdev,
+					   u32 crossed_mkey, u64 crossed_len,
+					   u32 *crossing_mkey)
+{
+	int inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
+	int access_mode = MLX5_MKC_ACCESS_MODE_CROSSING;
+	void *mkc;
+	u32 *in;
+	int err;
+
+	in = kvzalloc(inlen, GFP_KERNEL);
+	if (!in)
+		return -ENOMEM;
+
+	mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
+
+	MLX5_SET(mkc, mkc, access_mode_1_0, access_mode & 0x3);
+	MLX5_SET(mkc, mkc, access_mode_4_2, (access_mode >> 2) & 0x7);
+	MLX5_SET(mkc, mkc, crossing_target_vhca_id,
+		 MLX5_CAP_GEN(mdev, vhca_id));
+	MLX5_SET(mkc, mkc, translations_octword_size, crossed_mkey);
+	MLX5_SET(mkc, mkc, pd, mdev->mlx5e_res.hw_objs.pdn);
+	MLX5_SET(mkc, mkc, qpn, 0xffffff);
+	MLX5_SET(mkc, mkc, lr, 1);
+	MLX5_SET(mkc, mkc, lw, 1);
+	MLX5_SET64(mkc, mkc, start_addr, 0);
+	MLX5_SET64(mkc, mkc, len, crossed_len);
+
+	err = mlx5_core_create_mkey(mdev, crossing_mkey, in, inlen);
+
+	kvfree(in);
+	return err;
+}
+
 static int mlx5e_create_rq_umr_mkey(struct mlx5_core_dev *mdev, struct mlx5e_rq *rq)
 {
 	u32 xsk_chunk_size = rq->xsk_pool ? rq->xsk_pool->chunk_size : 0;
@@ -994,6 +1031,92 @@ static int mlx5e_alloc_rq(struct mlx5e_params *params,
 		}
 	}
 
+	/* For devmem + DDI: switch to KSM mode so each UMR entry carries
+	 * a child mkey. We use the ethernet DDI PA mkey (created with
+	 * data_direct=1 and ethernet PD) as the KSM child key, which
+	 * routes DMA through the DDI PCIe port.
+	 *
+	 * Only switch to KSM if this RQ's binding actually uses DDI.
+	 * Non-DDI devmem bindings stay in MTT mode.
+	 */
+	if (rq->wq_type == MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ) {
+		/* mpwqe state is only meaningful for striding RQs */
+		rq->mpwqe.ksm_child_mkey_be = 0;
+		rq->mpwqe.crossing_mkey = 0;
+	}
+	if (rq->wq_type == MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ &&
+	    rq->page_pool && rq->page_pool->mp_ops &&
+	    net_devmem_is_ddi(rq->page_pool) &&
+	    rq->priv->ddi_mkey) {
+		__be32 mtt_umr_mkey_be = rq->mpwqe.umr_mkey_be;
+		u8 page_shift = rq->mpwqe.page_shift;
+		u8 log_wqe_sz, log_stride_sz;
+
+		/* Switch to KSM mode */
+		rq->mpwqe.umr_mode = MLX5E_MPWRQ_UMR_MODE_UNALIGNED;
+
+		/* Recalculate UMR parameters for KSM mode */
+		rq->mpwqe.pages_per_wqe =
+			mlx5e_mpwrq_pages_per_wqe(mdev, page_shift,
+						  rq->mpwqe.umr_mode);
+		rq->mpwqe.umr_wqebbs =
+			mlx5e_mpwrq_umr_wqebbs(mdev, page_shift,
+					       rq->mpwqe.umr_mode);
+		rq->mpwqe.mtts_per_wqe =
+			mlx5e_mpwrq_mtts_per_wqe(mdev, page_shift,
+						 rq->mpwqe.umr_mode);
+
+		/* Recalculate stride count for new WQE size */
+		log_wqe_sz = mlx5e_mpwrq_log_wqe_sz(mdev, page_shift,
+						    rq->mpwqe.umr_mode);
+		log_stride_sz = rq->mpwqe.log_stride_sz;
+		rq->mpwqe.num_strides = BIT(log_wqe_sz - log_stride_sz);
+		rq->buff.frame0_sz = (1 << log_stride_sz);
+
+		/* Create the KSM-mode UMR mkey before releasing the MTT-mode
+		 * one, so that every exit path owns exactly one UMR mkey.
+		 */
+		err = mlx5e_create_rq_umr_mkey(mdev, rq);
+		if (err) {
+			mlx5_core_warn(mdev,
+				       "Failed to create KSM UMR mkey for DDI: %d\n",
+				       err);
+			/* Let the unwind path destroy the MTT-mode mkey once */
+			rq->mpwqe.umr_mkey_be = mtt_umr_mkey_be;
+			goto err_destroy_page_pool;
+		}
+		mlx5_core_destroy_mkey(mdev, be32_to_cpu(mtt_umr_mkey_be));
+
+		/* Store DDI mkey (ethernet PD) as KSM child key for en_rx.c */
+		rq->mpwqe.ksm_child_mkey_be =
+			cpu_to_be32(rq->priv->ddi_mkey);
+
+		/* Rebuild UMR WQE template for new mode */
+		mlx5e_build_umr_wqe(rq, rq->icosq,
+				    container_of(&rq->mpwqe.umr_wqe,
+						 struct mlx5e_umr_wqe, hdr));
+	}
+
+	/* Create crossing VHCA mkey for devmem + DDI (Data Direct) */
+	if (rq->wq_type == MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ &&
+	    net_devmem_is_ddi(rq->page_pool) &&
+	    MLX5_CAP_GEN(mdev, crossing_vhca_mkey)) {
+		u32 umr_mkey = be32_to_cpu(rq->mpwqe.umr_mkey_be);
+		u32 wq_size = mlx5_wq_ll_get_size(&rq->mpwqe.wq);
+		u64 umr_len = (u64)wq_size * rq->mpwqe.mtts_per_wqe <<
+			      rq->mpwqe.page_shift;
+
+		err = mlx5e_create_crossing_vhca_mkey(mdev, umr_mkey,
+						      umr_len,
+						      &rq->mpwqe.crossing_mkey);
+		if (err) {
+			mlx5_core_warn(mdev,
+				       "Failed to create crossing VHCA mkey: %d\n",
+				       err);
+			rq->mpwqe.crossing_mkey = 0;
+		}
+	}
+
 	for (i = 0; i < wq_sz; i++) {
 		if (rq->wq_type == MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ) {
 			struct mlx5e_rx_wqe_ll *wqe =
@@ -1007,7 +1130,9 @@ static int mlx5e_alloc_rq(struct mlx5e_params *params,
 
 			wqe->data[0].addr = cpu_to_be64(dma_offset + headroom);
 			wqe->data[0].byte_count = cpu_to_be32(byte_count);
-			wqe->data[0].lkey = rq->mpwqe.umr_mkey_be;
+			wqe->data[0].lkey = rq->mpwqe.crossing_mkey ?
+				cpu_to_be32(rq->mpwqe.crossing_mkey) :
+				rq->mpwqe.umr_mkey_be;
 		} else {
 			struct mlx5e_rx_wqe_cyc *wqe =
 				mlx5_wq_cyc_get_wqe(&rq->wqe.wq, i);
@@ -1040,6 +1165,8 @@ static int mlx5e_alloc_rq(struct mlx5e_params *params,
 err_free_mpwqe_info:
 		kvfree(rq->mpwqe.info);
 err_rq_mkey:
+		if (rq->mpwqe.crossing_mkey)
+			mlx5_core_destroy_mkey(mdev, rq->mpwqe.crossing_mkey);
 		mlx5_core_destroy_mkey(mdev, be32_to_cpu(rq->mpwqe.umr_mkey_be));
 err_rq_drop_page:
 		mlx5e_free_mpwqe_rq_drop_page(rq);
@@ -1065,6 +1192,8 @@ static void mlx5e_free_rq(struct mlx5e_rq *rq)
 	case MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ:
 		mlx5e_rq_free_shampo(rq);
 		kvfree(rq->mpwqe.info);
+		if (rq->mpwqe.crossing_mkey)
+			mlx5_core_destroy_mkey(rq->mdev, rq->mpwqe.crossing_mkey);
 		mlx5_core_destroy_mkey(rq->mdev, be32_to_cpu(rq->mpwqe.umr_mkey_be));
 		mlx5e_free_mpwqe_rq_drop_page(rq);
 		break;
@@ -1656,6 +1785,7 @@ static int mlx5e_alloc_txqsq(struct mlx5e_channel *c,
 	sq->pdev      = c->pdev;
 	sq->clock     = mdev->clock;
 	sq->mkey_be   = c->mkey_be;
+	sq->crossing_mkey_be = 0;
 	sq->netdev    = c->netdev;
 	sq->mdev      = c->mdev;
 	sq->channel   = c;
@@ -1674,10 +1804,30 @@ static int mlx5e_alloc_txqsq(struct mlx5e_channel *c,
 	sq->stop_room = param->stop_room;
 	sq->ptp_cyc2time = mlx5_sq_ts_translator(mdev);
 
+	/* Create TX crossing mkey for devmem + DDI. The crossing mkey
+	 * wraps the DDI PA mkey (created by mlx5e_create_ddi_mkey with
+	 * data_direct=1), routing TX DMA through the DDI PCIe port.
+	 */
+	if (c->priv->ddi_mkey &&
+	    MLX5_CAP_GEN(mdev, crossing_vhca_mkey)) {
+		u32 tx_crossing_mkey;
+
+		err = mlx5e_create_crossing_vhca_mkey(mdev,
+						      c->priv->ddi_mkey,
+						      U64_MAX,
+						      &tx_crossing_mkey);
+		if (err)
+			mlx5_core_warn(mdev,
+				       "Failed to create TX crossing VHCA mkey: %d\n",
+				       err);
+		else
+			sq->crossing_mkey_be = cpu_to_be32(tx_crossing_mkey);
+	}
+
 	param->wq.db_numa_node = cpu_to_node(c->cpu);
 	err = mlx5_wq_cyc_create(mdev, &param->wq, sqc_wq, wq, &sq->wq_ctrl);
 	if (err)
-		return err;
+		goto err_crossing_mkey;
 	wq->db    = &wq->db[MLX5_SND_DBR];
 
 	err = mlx5e_alloc_txqsq_db(sq, cpu_to_node(c->cpu));
@@ -1688,6 +1838,10 @@ static int mlx5e_alloc_txqsq(struct mlx5e_channel *c,
 
 err_sq_wq_destroy:
 	mlx5_wq_destroy(&sq->wq_ctrl);
+err_crossing_mkey:
+	if (sq->crossing_mkey_be)
+		mlx5_core_destroy_mkey(mdev,
+				       be32_to_cpu(sq->crossing_mkey_be));
 
 	return err;
 }
@@ -1696,6 +1850,9 @@ void mlx5e_free_txqsq(struct mlx5e_txqsq *sq)
 {
 	kvfree(sq->dim);
 	mlx5e_free_txqsq_db(sq);
+	if (sq->crossing_mkey_be)
+		mlx5_core_destroy_mkey(sq->mdev,
+				       be32_to_cpu(sq->crossing_mkey_be));
 	mlx5_wq_destroy(&sq->wq_ctrl);
 }
 
@@ -5255,6 +5412,9 @@ static int mlx5e_bridge_setlink(struct net_device *dev, struct nlmsghdr *nlh,
 }
 #endif
 
+static struct device *mlx5e_ndo_get_ddi_device(struct net_device *netdev);
+static int mlx5e_create_ddi_mkey(struct mlx5e_priv *priv);
+
 const struct net_device_ops mlx5e_netdev_ops = {
 	.ndo_open                = mlx5e_open,
 	.ndo_stop                = mlx5e_close,
@@ -5296,6 +5456,7 @@ const struct net_device_ops mlx5e_netdev_ops = {
 	.ndo_has_offload_stats   = mlx5e_has_offload_stats,
 	.ndo_get_offload_stats   = mlx5e_get_offload_stats,
 #endif
+	.ndo_get_ddi_device      = mlx5e_ndo_get_ddi_device,
 };
 
 void mlx5e_build_nic_params(struct mlx5e_priv *priv, struct mlx5e_xsk *xsk, u16 mtu)
@@ -5670,6 +5831,164 @@ static const struct netdev_queue_mgmt_ops mlx5e_queue_mgmt_ops = {
 	.ndo_queue_get_dma_dev	=	mlx5e_queue_get_dma_dev,
 };
 
+static int mlx5e_query_vuid(struct mlx5_core_dev *mdev, char *out_vuid)
+{
+	u8 out[MLX5_ST_SZ_BYTES(query_vuid_out) +
+	       MLX5_ST_SZ_BYTES(array1024_auto)] = {};
+	u8 in[MLX5_ST_SZ_BYTES(query_vuid_in)] = {};
+	char *vuid;
+	int err;
+
+	MLX5_SET(query_vuid_in, in, opcode, MLX5_CMD_OPCODE_QUERY_VUID);
+	MLX5_SET(query_vuid_in, in, vhca_id, MLX5_CAP_GEN(mdev, vhca_id));
+	MLX5_SET(query_vuid_in, in, data_direct, 1);
+	err = mlx5_cmd_exec(mdev, in, sizeof(in), out, sizeof(out));
+	if (err)
+		return err;
+
+	vuid = MLX5_ADDR_OF(query_vuid_out, out, vuid);
+	memcpy(out_vuid, vuid, MLX5_ST_SZ_BYTES(array1024_auto));
+	return 0;
+}
+
+/* Read DDI device VUID from PCI VPD and compare with the given vuid.
+ * Returns the PCI device if the VUID matches, NULL otherwise.
+ */
+static struct pci_dev *mlx5e_data_direct_match_vuid(struct pci_dev *ddi_pdev,
+						    const char *vuid)
+{
+	unsigned int vpd_size, kw_len;
+	u8 *vpd_data;
+	int start;
+
+	vpd_data = pci_vpd_alloc(ddi_pdev, &vpd_size);
+	if (IS_ERR(vpd_data))
+		return NULL;
+
+	start = pci_vpd_find_ro_info_keyword(vpd_data, vpd_size, "VU", &kw_len);
+	if (start < 0) {
+		kfree(vpd_data);
+		return NULL;
+	}
+
+	if (kw_len == strlen(vuid) && !memcmp(vpd_data + start, vuid, kw_len)) {
+		kfree(vpd_data);
+		return ddi_pdev;
+	}
+
+	kfree(vpd_data);
+	return NULL;
+}
+
+/* Initialize DDI PCI device for DMA operations */
+static int mlx5e_data_direct_init_pci(struct pci_dev *pdev)
+{
+	int err;
+
+	err = pci_enable_device(pdev);
+	if (err) {
+		dev_err(&pdev->dev, "Cannot enable DDI PCI device, err=%d\n", err);
+		return err;
+	}
+
+	pci_set_master(pdev);
+
+	err = dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(64));
+	if (err) {
+		dev_warn(&pdev->dev,
+			 "Warning: couldn't set 64-bit PCI DMA mask, err=%d\n", err);
+		err = dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(32));
+		if (err) {
+			dev_err(&pdev->dev, "Can't set PCI DMA mask, err=%d\n", err);
+			goto err_disable;
+		}
+	}
+
+	dma_set_max_seg_size(&pdev->dev, SZ_2G);
+
+	if (pci_enable_atomic_ops_to_root(pdev, PCI_EXP_DEVCAP2_ATOMIC_COMP32) &&
+	    pci_enable_atomic_ops_to_root(pdev, PCI_EXP_DEVCAP2_ATOMIC_COMP64) &&
+	    pci_enable_atomic_ops_to_root(pdev, PCI_EXP_DEVCAP2_ATOMIC_COMP128))
+		dev_dbg(&pdev->dev, "Enabling pci atomics failed\n");
+
+	dev_info(&pdev->dev, "DDI PCI device initialized successfully\n");
+
+	return 0;
+
+err_disable:
+	pci_disable_device(pdev);
+	return err;
+}
+
+/* Scan PCI bus for a DDI device (0x2100) whose VPD VUID matches this NIC */
+static struct device *mlx5e_find_ddi_dma_device(const char *vuid)
+{
+	struct pci_dev *pdev = NULL;
+
+	while ((pdev = pci_get_device(PCI_VENDOR_ID_MELLANOX,
+				      MLX5_DATA_DIRECT_DEVID, pdev))) {
+		if (mlx5e_data_direct_match_vuid(pdev, vuid)) {
+			if (mlx5e_data_direct_init_pci(pdev)) {
+				/* Drop the reference taken by pci_get_device() */
+				pci_dev_put(pdev);
+				return NULL;
+			}
+			return &pdev->dev;
+		}
+	}
+
+	return NULL;
+}
+
+static struct device *mlx5e_ndo_get_ddi_device(struct net_device *netdev)
+{
+	struct mlx5e_priv *priv = netdev_priv(netdev);
+	struct mlx5_core_dev *mdev = priv->mdev;
+	char vuid[MLX5_ST_SZ_BYTES(array1024_auto) + 1] = {};
+
+	/* Return cached DDI device if already resolved */
+	if (priv->ddi_dev)
+		goto create_mkey;
+
+	if (!MLX5_CAP_GEN(mdev, data_direct) ||
+	    !MLX5_CAP_GEN_2(mdev, query_vuid))
+		return NULL;
+
+	/* CX8 firmware rejects UMR WQEs with DDI-mapped DMA addresses
+	 * on PCI function 1 (vendor syndrome 0x52). This affects both
+	 * KSM and MTT UMR modes. Skip DDI for fn 1 until firmware fix.
+	 * IB/RDMA DDI works on fn 1 because it uses regular MRs, not UMR.
+	 */
+	if (PCI_FUNC(mdev->pdev->devfn) != 0)
+		return NULL;
+
+	if (mlx5e_query_vuid(mdev, vuid))
+		return NULL;
+
+	priv->ddi_dev = mlx5e_find_ddi_dma_device(vuid);
+	if (!priv->ddi_dev)
+		return NULL;
+
+create_mkey:
+	/* Lazily create DDI mkey on first dmabuf bind.
+	 * Can't do this in mlx5e_nic_init because the ethernet
+	 * PD (mdev->mlx5e_res.hw_objs.pdn) isn't allocated yet.
+	 */
+	if (!priv->ddi_mkey) {
+		int err = mlx5e_create_ddi_mkey(priv);
+		if (err) {
+			netdev_warn(netdev,
+				    "DDI: failed to create mkey: %d\n",
+				    err);
+			return NULL;
+		}
+		netdev_info(netdev, "DDI: using Data Direct device %s\n",
+			    dev_name(priv->ddi_dev));
+	}
+
+	return priv->ddi_dev;
+}
+
 static void mlx5e_build_nic_netdev(struct net_device *netdev)
 {
 	struct mlx5e_priv *priv = netdev_priv(netdev);
@@ -5869,6 +6188,60 @@ void mlx5e_destroy_q_counters(struct mlx5e_priv *priv)
 	}
 }
 
+/* Create a DDI PA mkey with ethernet PD for devmem DDI support.
+ * This mkey is used as:
+ *   - KSM child key in RX (devmem receive with DDI routing)
+ *   - Crossed mkey target in TX (crossing VHCA mkey for DDI send)
+ *
+ * The mkey is created with data_direct=1 which tells firmware to
+ * route DMA through the DDI PCIe port instead of the CPU root complex.
+ */
+static int mlx5e_create_ddi_mkey(struct mlx5e_priv *priv)
+{
+	struct mlx5_core_dev *mdev = priv->mdev;
+	int inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
+	void *mkc;
+	u32 *in;
+	int err;
+
+	if (!MLX5_CAP_GEN(mdev, data_direct))
+		return 0;
+
+	in = kvzalloc(inlen, GFP_KERNEL);
+	if (!in)
+		return -ENOMEM;
+
+	MLX5_SET(create_mkey_in, in, data_direct, 1);
+	mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
+	MLX5_SET(mkc, mkc, access_mode_1_0, MLX5_MKC_ACCESS_MODE_PA);
+	MLX5_SET(mkc, mkc, lw, 1);
+	MLX5_SET(mkc, mkc, lr, 1);
+	MLX5_SET(mkc, mkc, rw, 1);
+	MLX5_SET(mkc, mkc, rr, 1);
+	MLX5_SET(mkc, mkc, a, 1);
+	MLX5_SET(mkc, mkc, pd, mdev->mlx5e_res.hw_objs.pdn);
+	MLX5_SET(mkc, mkc, length64, 1);
+	MLX5_SET(mkc, mkc, qpn, 0xffffff);
+
+	err = mlx5_core_create_mkey(mdev, &priv->ddi_mkey, in, inlen);
+	kvfree(in);
+
+	if (err) {
+		mlx5_core_warn(mdev, "Failed to create DDI mkey: %d\n", err);
+		return err;
+	}
+
+	return 0;
+}
+
+static void mlx5e_destroy_ddi_mkey(struct mlx5e_priv *priv)
+{
+	if (priv->ddi_mkey) {
+		mlx5_core_destroy_mkey(priv->mdev, priv->ddi_mkey);
+		priv->ddi_mkey = 0;
+	}
+}
+
 static int mlx5e_nic_init(struct mlx5_core_dev *mdev,
 			  struct net_device *netdev)
 {
@@ -5925,6 +6298,7 @@ static int mlx5e_nic_init(struct mlx5_core_dev *mdev,
 static void mlx5e_nic_cleanup(struct mlx5e_priv *priv)
 {
 	mlx5e_health_destroy_reporters(priv);
+	mlx5e_destroy_ddi_mkey(priv);
 	mlx5e_psp_unregister(priv);
 	mlx5e_ktls_cleanup(priv);
 	mlx5e_psp_cleanup(priv);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
index 268e2088475773..e1253a64c6b3b8 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
@@ -639,15 +639,33 @@ static int mlx5e_alloc_rx_mpwqe(struct mlx5e_rq *rq, u16 ix)
 			goto err_unmap;
 
 		addr = page_pool_get_dma_addr_netmem(frag_page->netmem);
-		umr_wqe->inline_mtts[i] = (struct mlx5_mtt) {
-			.ptag = cpu_to_be64(addr | MLX5_EN_WR),
-		};
+		if (rq->mpwqe.ksm_child_mkey_be) {
+			/* KSM mode for devmem DDI: each entry carries a child
+			 * mkey that routes DMA through the DDI PCIe port.
+			 */
+			umr_wqe->inline_ksms[i] = (struct mlx5_ksm) {
+				.key = rq->mpwqe.ksm_child_mkey_be,
+				.va = cpu_to_be64(addr),
+			};
+		} else {
+			umr_wqe->inline_mtts[i] = (struct mlx5_mtt) {
+				.ptag = cpu_to_be64(addr | MLX5_EN_WR),
+			};
+		}
 	}
 
 	/* Pad if needed, in case the value set to ucseg->xlt_octowords
 	 * in mlx5e_build_umr_wqe() needed alignment.
 	 */
-	if (rq->mpwqe.pages_per_wqe & (MLX5_UMR_MTT_NUM_ENTRIES_ALIGNMENT - 1)) {
+	if (rq->mpwqe.ksm_child_mkey_be) {
+		if (rq->mpwqe.pages_per_wqe & (MLX5_UMR_KSM_NUM_ENTRIES_ALIGNMENT - 1)) {
+			int pad = ALIGN(rq->mpwqe.pages_per_wqe, MLX5_UMR_KSM_NUM_ENTRIES_ALIGNMENT) -
+				rq->mpwqe.pages_per_wqe;
+
+			memset(&umr_wqe->inline_ksms[rq->mpwqe.pages_per_wqe], 0,
+			       sizeof(*umr_wqe->inline_ksms) * pad);
+		}
+	} else if (rq->mpwqe.pages_per_wqe & (MLX5_UMR_MTT_NUM_ENTRIES_ALIGNMENT - 1)) {
 		int pad = ALIGN(rq->mpwqe.pages_per_wqe, MLX5_UMR_MTT_NUM_ENTRIES_ALIGNMENT) -
 			rq->mpwqe.pages_per_wqe;
 
@@ -662,7 +680,9 @@ static int mlx5e_alloc_rx_mpwqe(struct mlx5e_rq *rq, u16 ix)
 		cpu_to_be32((sq->pc << MLX5_WQE_CTRL_WQE_INDEX_SHIFT) |
 			    MLX5_OPCODE_UMR);
 
-	offset = (ix * rq->mpwqe.mtts_per_wqe) * sizeof(struct mlx5_mtt) / MLX5_OCTWORD;
+	offset = rq->mpwqe.ksm_child_mkey_be ?
+		(ix * rq->mpwqe.mtts_per_wqe) * sizeof(struct mlx5_ksm) / MLX5_OCTWORD :
+		(ix * rq->mpwqe.mtts_per_wqe) * sizeof(struct mlx5_mtt) / MLX5_OCTWORD;
 	umr_wqe->hdr.uctrl.xlt_offset = cpu_to_be16(offset);
 
 	sq->db.wqe_info[pi] = (struct mlx5e_icosq_wqe_info) {
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c
index 9f0272649fa1ea..65c9ce201d4233 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c
@@ -43,6 +43,7 @@
 #include "en_accel/macsec.h"
 #include "en/ptp.h"
 #include <net/ipv6.h>
+#include <net/netmem.h>
 
 static void mlx5e_dma_unmap_wqe_err(struct mlx5e_txqsq *sq, u8 num_dma)
 {
@@ -211,7 +212,10 @@ mlx5e_txwqe_build_dsegs(struct mlx5e_txqsq *sq, struct sk_buff *skb,
 			goto dma_unmap_wqe_err;
 
 		dseg->addr       = cpu_to_be64(dma_addr);
-		dseg->lkey       = sq->mkey_be;
+		dseg->lkey       = (sq->crossing_mkey_be &&
+				    net_devmem_niov_is_ddi(
+					skb_frag_net_iov(frag))) ?
+				   sq->crossing_mkey_be : sq->mkey_be;
 		dseg->byte_count = cpu_to_be32(fsz);
 
 		mlx5e_dma_push_netmem(sq, skb_frag_netmem(frag), dma_addr, fsz);
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 7ca01eb3f7d2b2..1e0f9d0be67811 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1422,6 +1422,10 @@ struct netdev_net_notifier {
  *			     struct kernel_hwtstamp_config *kernel_config,
  *			     struct netlink_ext_ack *extack);
  *	Change the hardware timestamping parameters for NIC device.
+ *
+ * struct device *(*ndo_get_ddi_device)(struct net_device *dev);
+ *	Return the Data Direct Interface (DDI) device for accelerated
+ *	DMA-BUF access, or NULL if DDI is not available.
  */
 struct net_device_ops {
 	int			(*ndo_init)(struct net_device *dev);
@@ -1677,6 +1681,8 @@ struct net_device_ops {
 	 */
 	const struct net_shaper_ops *net_shaper_ops;
 #endif
+
+	struct device *		(*ndo_get_ddi_device)(struct net_device *dev);
 };
 
 /**
@@ -2417,7 +2423,7 @@ struct net_device {
 	struct dm_hw_stat_delta __rcu *dm_private;
 #endif
 	struct device		dev;
-	const struct attribute_group *sysfs_groups[5];
+	const struct attribute_group *sysfs_groups[6];
 	const struct attribute_group *sysfs_rx_queue_group;
 
 	const struct rtnl_link_ops *rtnl_link_ops;
diff --git a/include/net/netmem.h b/include/net/netmem.h
index a96b3e5e5574c1..06f32a5775bc03 100644
--- a/include/net/netmem.h
+++ b/include/net/netmem.h
@@ -439,4 +439,12 @@ static inline void netmem_dma_unmap_page_attrs(struct device *dev,
 	dma_unmap_page_attrs(dev, addr, size, dir, attrs);
 }
 
+/* Check if the page_pool's memory provider binding uses a DDI
+ * DMA device.
+ */
+bool net_devmem_is_ddi(const struct page_pool *pool);
+
+/* Check if a net_iov's binding uses a DDI DMA device. */
+bool net_devmem_niov_is_ddi(const struct net_iov *niov);
+
 #endif /* _NET_NETMEM_H */
diff --git a/include/uapi/linux/netdev.h b/include/uapi/linux/netdev.h
index e0b579a1df4f21..0d78d71c46ed13 100644
--- a/include/uapi/linux/netdev.h
+++ b/include/uapi/linux/netdev.h
@@ -207,6 +207,7 @@ enum {
 	NETDEV_A_DMABUF_QUEUES,
 	NETDEV_A_DMABUF_FD,
 	NETDEV_A_DMABUF_ID,
+	NETDEV_A_DMABUF_DDI,
 
 	__NETDEV_A_DMABUF_MAX,
 	NETDEV_A_DMABUF_MAX = (__NETDEV_A_DMABUF_MAX - 1)
diff --git a/net/core/devmem.c b/net/core/devmem.c
index 69d79aee07ef6f..af7b12acbdc604 100644
--- a/net/core/devmem.c
+++ b/net/core/devmem.c
@@ -184,10 +184,12 @@ struct net_devmem_dmabuf_binding *
 net_devmem_bind_dmabuf(struct net_device *dev,
 		       struct device *dma_dev,
 		       enum dma_data_direction direction,
-		       unsigned int dmabuf_fd, struct netdev_nl_sock *priv,
+		       unsigned int dmabuf_fd, bool use_ddi,
+		       struct netdev_nl_sock *priv,
 		       struct netlink_ext_ack *extack)
 {
 	struct net_devmem_dmabuf_binding *binding;
+	struct device *ddi_device = NULL;
 	static u32 id_alloc_next;
 	struct scatterlist *sg;
 	struct dma_buf *dmabuf;
@@ -204,6 +206,15 @@ net_devmem_bind_dmabuf(struct net_device *dev,
 	if (IS_ERR(dmabuf))
 		return ERR_CAST(dmabuf);
 
+	if (use_ddi && dev->netdev_ops && dev->netdev_ops->ndo_get_ddi_device)
+		ddi_device = dev->netdev_ops->ndo_get_ddi_device(dev);
+
+	if (use_ddi && !ddi_device) {
+		NL_SET_ERR_MSG(extack, "DDI requested but no DDI device available");
+		err = -ENODEV;
+		goto err_put_dmabuf;
+	}
+
 	binding = kzalloc_node(sizeof(*binding), GFP_KERNEL,
 			       dev_to_node(&dev->dev));
 	if (!binding) {
@@ -224,8 +235,10 @@ net_devmem_bind_dmabuf(struct net_device *dev,
 
 	binding->dmabuf = dmabuf;
 	binding->direction = direction;
+	binding->ddi_device = ddi_device;
 
-	binding->attachment = dma_buf_attach(binding->dmabuf, dma_dev);
+	binding->attachment = dma_buf_attach(binding->dmabuf,
+					     ddi_device ?: dma_dev);
 	if (IS_ERR(binding->attachment)) {
 		err = PTR_ERR(binding->attachment);
 		NL_SET_ERR_MSG(extack, "Failed to bind dmabuf to device");
@@ -353,6 +366,27 @@ struct net_devmem_dmabuf_binding *net_devmem_lookup_dmabuf(u32 id)
 	return binding;
 }
 
+bool net_devmem_is_ddi(const struct page_pool *pool)
+{
+	const struct net_devmem_dmabuf_binding *binding;
+
+	if (!pool || !pool->mp_ops || !pool->mp_priv)
+		return false;
+
+	binding = pool->mp_priv;
+	return binding->ddi_device != NULL;
+}
+EXPORT_SYMBOL(net_devmem_is_ddi);
+
+bool net_devmem_niov_is_ddi(const struct net_iov *niov)
+{
+	if (!niov)
+		return false;
+
+	return net_devmem_iov_binding(niov)->ddi_device != NULL;
+}
+EXPORT_SYMBOL(net_devmem_niov_is_ddi);
+
 void net_devmem_get_net_iov(struct net_iov *niov)
 {
 	net_devmem_dmabuf_binding_get(net_devmem_iov_binding(niov));
diff --git a/net/core/devmem.h b/net/core/devmem.h
index 1c5c18581fcb14..7b35179cbd8c42 100644
--- a/net/core/devmem.h
+++ b/net/core/devmem.h
@@ -20,6 +20,10 @@ struct net_devmem_dmabuf_binding {
 	struct dma_buf_attachment *attachment;
 	struct sg_table *sgt;
 	struct net_device *dev;
+	/* DDI (Data Direct Interface) device for DMA-BUF attachment.
+	 * If set, dma_buf_attach() uses this device instead of dev->dev.parent.
+	 */
+	struct device *ddi_device;
 	struct gen_pool *chunk_pool;
 	/* Protect dev */
 	struct mutex lock;
@@ -87,7 +91,8 @@ struct net_devmem_dmabuf_binding *
 net_devmem_bind_dmabuf(struct net_device *dev,
 		       struct device *dma_dev,
 		       enum dma_data_direction direction,
-		       unsigned int dmabuf_fd, struct netdev_nl_sock *priv,
+		       unsigned int dmabuf_fd, bool use_ddi,
+		       struct netdev_nl_sock *priv,
 		       struct netlink_ext_ack *extack);
 struct net_devmem_dmabuf_binding *net_devmem_lookup_dmabuf(u32 id);
 void net_devmem_unbind_dmabuf(struct net_devmem_dmabuf_binding *binding);
@@ -168,7 +173,7 @@ static inline struct net_devmem_dmabuf_binding *
 net_devmem_bind_dmabuf(struct net_device *dev,
 		       struct device *dma_dev,
 		       enum dma_data_direction direction,
-		       unsigned int dmabuf_fd,
+		       unsigned int dmabuf_fd, bool use_ddi,
 		       struct netdev_nl_sock *priv,
 		       struct netlink_ext_ack *extack)
 {
diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
index 07624b682b08b2..6d51428534deac 100644
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -731,6 +731,43 @@ static const struct attribute_group netdev_phys_group = {
 	.is_visible = netdev_phys_is_visible,
 };
 
+static ssize_t ddi_device_show(struct device *dev,
+			       struct device_attribute *attr, char *buf)
+{
+	struct net_device *netdev = to_net_dev(dev);
+	struct device *ddi_dev;
+
+	ddi_dev = netdev->netdev_ops->ndo_get_ddi_device(netdev);
+	if (!ddi_dev)
+		return -ENODEV;
+
+	return sysfs_emit(buf, "%s\n", dev_name(ddi_dev));
+}
+static DEVICE_ATTR_RO(ddi_device);
+
+static struct attribute *netdev_ddi_attrs[] __ro_after_init = {
+	&dev_attr_ddi_device.attr,
+	NULL,
+};
+
+static umode_t netdev_ddi_is_visible(struct kobject *kobj,
+				     struct attribute *attr, int index)
+{
+	struct device *dev = kobj_to_dev(kobj);
+	struct net_device *netdev = to_net_dev(dev);
+
+	if (!netdev->netdev_ops->ndo_get_ddi_device ||
+	    !netdev->netdev_ops->ndo_get_ddi_device(netdev))
+		return 0;
+
+	return attr->mode;
+}
+
+static const struct attribute_group netdev_ddi_group = {
+	.attrs = netdev_ddi_attrs,
+	.is_visible = netdev_ddi_is_visible,
+};
+
 static ssize_t threaded_show(struct device *dev,
 			     struct device_attribute *attr, char *buf)
 {
@@ -2350,6 +2387,7 @@ int netdev_register_kobject(struct net_device *ndev)
 
 	*groups++ = &netstat_group;
 	*groups++ = &netdev_phys_group;
+	*groups++ = &netdev_ddi_group;
 
 	if (wireless_group_needed(ndev))
 		*groups++ = &wireless_group;
diff --git a/net/core/netdev-genl-gen.c b/net/core/netdev-genl-gen.c
index ba673e81716f6e..76805b1fd27475 100644
--- a/net/core/netdev-genl-gen.c
+++ b/net/core/netdev-genl-gen.c
@@ -86,10 +86,11 @@ static const struct nla_policy netdev_qstats_get_nl_policy[NETDEV_A_QSTATS_SCOPE
 };
 
 /* NETDEV_CMD_BIND_RX - do */
-static const struct nla_policy netdev_bind_rx_nl_policy[NETDEV_A_DMABUF_FD + 1] = {
+static const struct nla_policy netdev_bind_rx_nl_policy[NETDEV_A_DMABUF_DDI + 1] = {
 	[NETDEV_A_DMABUF_IFINDEX] = NLA_POLICY_MIN(NLA_U32, 1),
 	[NETDEV_A_DMABUF_FD] = { .type = NLA_U32, },
 	[NETDEV_A_DMABUF_QUEUES] = NLA_POLICY_NESTED(netdev_queue_id_nl_policy),
+	[NETDEV_A_DMABUF_DDI] = { .type = NLA_U8, },
 };
 
 /* NETDEV_CMD_NAPI_SET - do */
@@ -102,9 +103,10 @@ static const struct nla_policy netdev_napi_set_nl_policy[NETDEV_A_NAPI_THREADED
 };
 
 /* NETDEV_CMD_BIND_TX - do */
-static const struct nla_policy netdev_bind_tx_nl_policy[NETDEV_A_DMABUF_FD + 1] = {
+static const struct nla_policy netdev_bind_tx_nl_policy[NETDEV_A_DMABUF_DDI + 1] = {
 	[NETDEV_A_DMABUF_IFINDEX] = NLA_POLICY_MIN(NLA_U32, 1),
 	[NETDEV_A_DMABUF_FD] = { .type = NLA_U32, },
+	[NETDEV_A_DMABUF_DDI] = { .type = NLA_U8, },
 };
 
 /* Ops table for netdev */
@@ -188,7 +190,7 @@ static const struct genl_split_ops netdev_nl_ops[] = {
 		.cmd		= NETDEV_CMD_BIND_RX,
 		.doit		= netdev_nl_bind_rx_doit,
 		.policy		= netdev_bind_rx_nl_policy,
-		.maxattr	= NETDEV_A_DMABUF_FD,
+		.maxattr	= NETDEV_A_DMABUF_DDI,
 		.flags		= GENL_ADMIN_PERM | GENL_CMD_CAP_DO,
 	},
 	{
@@ -202,7 +204,7 @@ static const struct genl_split_ops netdev_nl_ops[] = {
 		.cmd		= NETDEV_CMD_BIND_TX,
 		.doit		= netdev_nl_bind_tx_doit,
 		.policy		= netdev_bind_tx_nl_policy,
-		.maxattr	= NETDEV_A_DMABUF_FD,
+		.maxattr	= NETDEV_A_DMABUF_DDI,
 		.flags		= GENL_CMD_CAP_DO,
 	},
 };
diff --git a/net/core/netdev-genl.c b/net/core/netdev-genl.c
index 470fabbeacd9bd..375f5628b318a0 100644
--- a/net/core/netdev-genl.c
+++ b/net/core/netdev-genl.c
@@ -951,6 +951,8 @@ int netdev_nl_bind_rx_doit(struct sk_buff *skb, struct genl_info *info)
 
 	ifindex = nla_get_u32(info->attrs[NETDEV_A_DEV_IFINDEX]);
 	dmabuf_fd = nla_get_u32(info->attrs[NETDEV_A_DMABUF_FD]);
+	bool use_ddi = info->attrs[NETDEV_A_DMABUF_DDI] &&
+		       nla_get_u8(info->attrs[NETDEV_A_DMABUF_DDI]);
 
 	priv = genl_sk_priv_get(&netdev_nl_family, NETLINK_CB(skb).sk);
 	if (IS_ERR(priv))
@@ -1002,7 +1004,7 @@ int netdev_nl_bind_rx_doit(struct sk_buff *skb, struct genl_info *info)
 	}
 
 	binding = net_devmem_bind_dmabuf(netdev, dma_dev, DMA_FROM_DEVICE,
-					 dmabuf_fd, priv, info->extack);
+					 dmabuf_fd, use_ddi, priv, info->extack);
 	if (IS_ERR(binding)) {
 		err = PTR_ERR(binding);
 		goto err_rxq_bitmap;
@@ -1060,6 +1062,8 @@ int netdev_nl_bind_tx_doit(struct sk_buff *skb, struct genl_info *info)
 
 	ifindex = nla_get_u32(info->attrs[NETDEV_A_DEV_IFINDEX]);
 	dmabuf_fd = nla_get_u32(info->attrs[NETDEV_A_DMABUF_FD]);
+	bool use_ddi = info->attrs[NETDEV_A_DMABUF_DDI] &&
+		       nla_get_u8(info->attrs[NETDEV_A_DMABUF_DDI]);
 
 	priv = genl_sk_priv_get(&netdev_nl_family, NETLINK_CB(skb).sk);
 	if (IS_ERR(priv))
@@ -1097,7 +1101,7 @@ int netdev_nl_bind_tx_doit(struct sk_buff *skb, struct genl_info *info)
 
 	dma_dev = netdev_queue_get_dma_dev(netdev, 0);
 	binding = net_devmem_bind_dmabuf(netdev, dma_dev, DMA_TO_DEVICE,
-					 dmabuf_fd, priv, info->extack);
+					 dmabuf_fd, use_ddi, priv, info->extack);
 	if (IS_ERR(binding)) {
 		err = PTR_ERR(binding);
 		goto err_unlock_netdev;