mcm proxy: push WR from MIC to host with scif mmap memory instead of scif_send.
author Amir Hanania <amir.hanania@intel.com>
Thu, 26 May 2016 21:28:32 +0000 (14:28 -0700)
committer Arlin Davis <arlin.r.davis@intel.com>
Thu, 26 May 2016 21:28:32 +0000 (14:28 -0700)
Map host memory into the MIC and use it as a ring buffer to send
post-send work requests from the MIC to the host. This replaces the
scif_send/scif_recv path and its receive-data FD event mechanism.
Because no FD event is used to wake up the host proxy service,
the host must run in polling mode to use this option.

How to run the host in polling mode:

By default, the proxy now runs in polling mode.
You can verify that this is the case in the mpxyd.log file.
Otherwise, edit the mpxyd.conf file and set mcm_affinity to 2.

This optimization improves small message latencies on MFO
devices by as much as 50%.

Signed-off-by: Amir Hanania <amir.hanania@intel.com>
Signed-off-by: Arlin Davis <arlin.r.davis@intel.com>
dapl/openib_common/dapl_mic_common.h
dapl/openib_common/util.c
dapl/openib_mcm/dapl_ib_util.h
dapl/openib_mcm/mix.c
dapl/svc/mix.c
dapl/svc/mpxyd.c
dapl/svc/mpxyd.h
doc/mpxyd.conf

index 0231013..4c80b41 100755 (executable)
@@ -67,6 +67,7 @@
 #define ALIGN_PAGE(o)  ((o + 4096 - 1) & ~(4096-1))
 #define ALIGN_UP_PPAGE(o) ((((uintptr_t)o) + 4096 - 1)& ~(4096-1))
 #define ALIGN_DOWN_PPAGE(o) ((((uintptr_t)o)) & ~(4096-1))
+/* Pad bytes needed to grow o to ALIGN_64(o); ALIGN_64 is defined elsewhere
+ * (presumably rounds up to a 64-byte multiple - TODO confirm) */
+#define ALIGN_64_PAD(o) (ALIGN_64(o) - (o))
 
 static inline char * mcm_qp_state_str(IN int st)
 {
@@ -317,6 +318,8 @@ typedef enum dat_mix_ops
        MIX_PZ_FREE,
        MIX_QUERY_DEVICE,
        MIX_QUERY_PORT,
+       MIX_MMAP_ALLOC,
+       MIX_MMAP_FREE,
        MIX_LAST_OP,    /* Keep last */
 
 } dat_mix_ops_t;
@@ -358,6 +361,8 @@ static inline char * mix_op_str(IN int op)
                "PZ_FREE",
                "QUERY_DEVICE",
                "QUERY_PORT",
+               "MMAP_ALLOC",
+               "MMAP_FREE",
        };
        return ((op < 2 || op >= MIX_LAST_OP) ? "Invalid OP?" : mix_ops[op]);
 }
@@ -428,6 +433,8 @@ typedef struct dat_mix_dev_attr
 
 }  __attribute__((packed)) dat_mix_dev_attr_t;
 
+/* dat_mix_prov_attr.cap bit: proxy supports post_send WRs via scif mmap */
+#define DAT_MIX_MMAP_CAP ( 1 << 0 )
+
 /**** MIX attributes, 120 bytes *****/
 typedef struct dat_mix_prov_attr
 {
@@ -450,8 +457,8 @@ typedef struct dat_mix_prov_attr
        uint8_t                 gid_idx;
        uint32_t                cpu_model;
        uint32_t                cpu_family;
-       uint8_t                 resv[31];
-
+       uint8_t                 cap;
+       uint8_t                 resv[30];
 }  __attribute__((packed)) dat_mix_prov_attr_t;
 
 /***** MIX open, device address info returned */
@@ -749,6 +756,24 @@ typedef struct dat_mix_sr
 
 }  __attribute__((packed)) dat_mix_sr_t;
 
+/*
+ * DAT_MIX_MMAP_CAP support: fast post_send via scif MMAP
+ *
+ * Message body for MIX_MMAP_ALLOC and MIX_MMAP_FREE. In an ALLOC request
+ * addr carries the sender's scif registered offset; in the ALLOC reply it
+ * carries the responder's offset, or SCIF_REGISTER_FAILED.
+ */
+typedef struct dat_mix_mmap_addr
+{
+       dat_mix_hdr_t           hdr;
+       off_t                   addr;   /* scif registered offset being exchanged */
+}  __attribute__((packed)) dat_mix_mmap_addr_t;
+
+/* Number of dat_mix_mmap_wr_t entries in the mmap'ed WR ring */
+#define DAT_MIX_MMAP_WR_MAX 8
+
+/*
+ * One entry of the mmap'ed post_send WR ring: the send request, room for
+ * its inline data, and a flag word the writer sets to 1 after the entry
+ * has been copied and the reader clears to 0 once it has been processed.
+ */
+typedef struct dat_mix_mmap_wr
+{
+       dat_mix_sr_t msg;                               /* post_send request */
+       uint8_t inline_data[DAT_MIX_INLINE_MAX];        /* inline payload, directly after msg */
+       volatile uint32_t flags;                        /* non-zero: entry is ready to consume */
+       /* tail pad so the entry size equals ALIGN_64() of the fields above */
+       uint8_t tpad[ALIGN_64_PAD(sizeof(dat_mix_sr_t) + DAT_MIX_INLINE_MAX + sizeof(uint32_t))];
+
+}  __attribute__((packed)) dat_mix_mmap_wr_t;
+
 typedef union dat_mix_msg
 {
        dat_mix_open_t          op;
@@ -763,8 +788,9 @@ typedef union dat_mix_msg
        dat_mix_wc_t            wc;
        dat_mix_wr_t            wr;
        dat_mix_dto_comp_t      dto;
-       dat_mix_sr_t            sr;
-
+       dat_mix_sr_t            sr;
+       dat_mix_mmap_addr_t     mm_addr;
+       dat_mix_mmap_wr_t       mm_wr;
 } DAT_MIX_MSG;
 
 #define DAT_MIX_MSG_MAX  sizeof(DAT_MIX_MSG)
index 557c7a7..b24b26b 100644 (file)
@@ -415,6 +415,14 @@ DAT_RETURN dapls_ib_query_hca(IN DAPL_HCA * hca_ptr,
                        }
                        sprintf(tp->fam_str, "%d", tp->pr_attr.cpu_family);
                        sprintf(tp->mod_str, "%d", tp->pr_attr.cpu_model);
+
+                       /* scif_mmap post_sends MIC->HST if supported */
+                       if (tp->pr_attr.cap & DAT_MIX_MMAP_CAP) {
+                               if (dapli_mix_mmap_alloc(tp)) {
+                                       dapl_log(DAPL_DBG_TYPE_WARN,
+                                                "mmap ERR: run compat mode\n");
+                               }
+                       }
                }
 #else
                tp->na.mode = "DIRECT";
index e2a00ae..edfac7d 100644 (file)
@@ -140,6 +140,15 @@ typedef struct _ib_hca_transport
        ib_named_attr_t         na;
        dat_mix_prov_attr_t     pr_attr;        /* attributes from proxy */
 
+       /* Direct memory mapping for post_send WR entries, MIC to HOST */
+       dat_mix_mmap_wr_t       *mm_s_addr;             /* sbuf for post_send WR to host with MMAP */
+       int                     mm_s_head;              /* next location in PEER array to write WR */
+       dat_mix_mmap_wr_t       *mm_s_peer_addr;        /* writes goto remote MMAP address on host */
+       off_t                   mm_s_peer_addr_off;     /* PEER scif registered memory for sbuf WR array */
+       int                     *mm_s_place_holder;     /* local not used local memory for mmap */
+       volatile int            *mm_r_addr;             /* host mmap this rbuf mem and updates tail */
+       off_t                   mm_r_addr_off;          /* SCIF registration for rbuf, host writes */
+
 } ib_hca_transport_t;
 
 /* prototypes */
@@ -169,6 +178,8 @@ void mcm_destroy_pi_cq(struct dcm_ib_qp *m_qp);
 /* MIC eXchange (MIX) operations, mix.c */
 int  dapli_mix_mode(ib_hca_transport_t *tp, char *name);
 int  dapli_mix_open(ib_hca_transport_t *tp, char *name, int port, int query);
+int  dapli_mix_mmap_free(ib_hca_transport_t *tp, uint8_t stat);
+int  dapli_mix_mmap_alloc(ib_hca_transport_t *tp);
 void dapli_mix_close(ib_hca_transport_t *tp);
 int  dapli_mix_get_attr(ib_hca_transport_t *tp, dat_mix_prov_attr_t *pr_attr);
 int  dapli_mix_query_device(ib_hca_transport_t *tp, struct ibv_device_attr *dev_attr);
index bfbb889..e946292 100644 (file)
@@ -70,6 +70,273 @@ int dapli_mix_mode(ib_hca_transport_t *tp, char *name)
        return 0;
 }
 
+
+/*
+ * MIX_MMAP_FREE
+ *
+ * Send a MIX_MMAP_FREE request on tp->scif_ep and block until the peer
+ * replies.
+ *
+ * tp:   transport; nothing is sent when tp->mm_s_peer_addr_off is
+ *       SCIF_REGISTER_FAILED or tp->scif_ep is 0
+ * stat: value placed in hdr.status of the request
+ *
+ * Returns 0 when there is nothing to do or the reply is a successful
+ * MIX_MMAP_FREE response, -1 on a send, receive or reply error.
+ */
+int dapli_mix_mmap_free(ib_hca_transport_t *tp, uint8_t stat)
+{
+       dat_mix_mmap_addr_t msg;
+       int len, ret;
+
+       dapl_log(DAPL_DBG_TYPE_EXTENSION," MIX_MMAP_FREE\n");
+
+       /* no offset was exchanged with the peer, or no endpoint to use */
+       if (tp->mm_s_peer_addr_off == SCIF_REGISTER_FAILED || !tp->scif_ep)
+               return 0;
+
+       /* clear the whole message: addr is not used by this request and
+        * must not carry uninitialized stack bytes to the peer */
+       memset(&msg, 0, sizeof(msg));
+       msg.hdr.ver = DAT_MIX_VER;
+       msg.hdr.op = MIX_MMAP_FREE;
+       msg.hdr.status = stat;
+       msg.hdr.flags = MIX_OP_REQ;
+       msg.hdr.req_id = dapl_os_getpid();
+
+       len = sizeof(dat_mix_mmap_addr_t);
+       ret = scif_send(tp->scif_ep, &msg, len, SCIF_SEND_BLOCK);
+       if (ret != len) {
+               dapl_log(DAPL_DBG_TYPE_ERR,
+                        " scif_send ERR %s ret %d, exp %d, err %s\n",
+                        mix_op_str(msg.hdr.op), ret, len,
+                        strerror(errno));
+               return -1;
+       }
+       dapl_log(DAPL_DBG_TYPE_EXTENSION,
+                " %s ep %d, req_id 0x%x\n",
+                mix_op_str(msg.hdr.op), tp->scif_ep, msg.hdr.req_id);
+
+       /* wait for the other side to set "no access" to our local memory */
+       ret = scif_recv(tp->scif_ep, &msg, len, SCIF_RECV_BLOCK);
+       if (ret != len) {
+               dapl_log(DAPL_DBG_TYPE_ERR,
+                        " scif_recv ERR %s ret %d, exp %d, err %s\n",
+                        mix_op_str(msg.hdr.op), ret,
+                        len, strerror(errno));
+               return -1;
+       }
+
+       if (msg.hdr.op != MIX_MMAP_FREE ||
+           msg.hdr.flags != MIX_OP_RSP ||
+           msg.hdr.status != MIX_SUCCESS) {
+               dapl_log(DAPL_DBG_TYPE_ERR,
+                        " reply ERR %s, flags 0x%x, stat 0x%x\n",
+                        mix_op_str(msg.hdr.op),
+                        msg.hdr.flags, msg.hdr.status);
+               return -1;
+       }
+       return 0;
+}
+
+
+/*
+ * MIX_MMAP_ALLOC
+ *
+ * Exchange scif offsets with the peer and map the peer's WR array.
+ * The request carries tp->mm_r_addr_off; the offset returned in the reply
+ * is stored in tp->mm_s_peer_addr_off and mapped with scif_mmap(), leaving
+ * the mapping in tp->mm_s_peer_addr.
+ *
+ * Returns 0 on success. Returns -1 on failure; except for the early
+ * "mmap_init err" return, failure leaves tp->mm_s_peer_addr_off set to
+ * SCIF_REGISTER_FAILED and tp->mm_s_peer_addr set to NULL.
+ */
+int dapli_mix_mmap_alloc(ib_hca_transport_t *tp)
+{
+       dat_mix_mmap_addr_t msg;
+       size_t map_len = ALIGN_PAGE(DAT_MIX_MMAP_WR_MAX * sizeof(dat_mix_mmap_wr_t));
+       int len, ret;
+
+       dapl_log(DAPL_DBG_TYPE_EXTENSION," MIX_MMAP_ALLOC\n");
+
+       if(!tp->mm_s_addr) {
+               dapl_log(DAPL_DBG_TYPE_WARN,
+                        " WARN: mmap_init err - don't send mmap info\n");
+               return -1;
+       }
+
+       msg.hdr.ver = DAT_MIX_VER;
+       msg.hdr.op = MIX_MMAP_ALLOC;
+       msg.hdr.status = 0;
+       msg.hdr.flags = MIX_OP_REQ;
+       msg.hdr.req_id = dapl_os_getpid();
+       msg.addr = tp->mm_r_addr_off;
+
+       len = sizeof(dat_mix_mmap_addr_t);
+       ret = scif_send(tp->scif_ep, &msg, len, SCIF_SEND_BLOCK);
+       if (ret != len) {
+               dapl_log(DAPL_DBG_TYPE_ERR,
+                        " ERR: %s send on %d, ret %d, exp %d, error %s\n",
+                        mix_op_str(msg.hdr.op),tp->scif_ep, ret,
+                        len, strerror(errno));
+               goto remote_err;
+       }
+
+       dapl_log(DAPL_DBG_TYPE_EXTENSION,
+                " Sent %s request on SCIF EP %d, req_id 0x%x\n",
+                mix_op_str(msg.hdr.op), tp->scif_ep, ntohl(msg.hdr.req_id));
+
+       /* MIX_SEND_OP_ADDR_EXG: reply includes peer scif address for SEND OP buffer */
+       ret = scif_recv(tp->scif_ep, &msg, len, SCIF_RECV_BLOCK);
+       if (ret != len) {
+               dapl_log(1, " ERR: send_op_addr_exg ep %d, ret %d, exp %d, error %s\n",
+                           tp->scif_ep, ret, len, strerror(errno));
+               goto remote_err;
+       }
+
+       if (msg.addr == SCIF_REGISTER_FAILED || msg.hdr.op != MIX_MMAP_ALLOC ||
+               msg.hdr.flags != MIX_OP_RSP || msg.hdr.status != MIX_SUCCESS) {
+               dapl_log(1, " ERR: send op exg: op %s, flags 0x%x, stat 0x%x peer addr 0x%llx\n",
+                            mix_op_str(msg.hdr.op), msg.hdr.flags, msg.hdr.status, msg.addr);
+               goto remote_err;
+       }
+
+       tp->mm_s_peer_addr_off = msg.addr; /* scif_off from proxy host, WR array */
+
+       dapl_log(DAPL_DBG_TYPE_EXTENSION,
+                " Recv'd %s reply on SCIF EP %d, dev_id %d is 0x%llx\n",
+                mix_op_str(msg.hdr.op), tp->scif_ep, msg.hdr.req_id, msg.addr);
+
+       dapl_log(DAPL_DBG_TYPE_EXTENSION,
+                " s_off 0x%llx, r_off 0x%llx, peer_head = 0x%x\n",
+                tp->mm_s_peer_addr_off, tp->mm_r_addr_off, *tp->mm_r_addr);
+
+       /* mmap host memory, dat_mix_mmap_wr_t WR array, to write as local memory */
+       ret = posix_memalign((void **)&tp->mm_s_place_holder, 4096, map_len);
+       if (ret) {
+               /* posix_memalign reports the error in its return value, not
+                * errno, and leaves the output pointer unspecified on failure */
+               tp->mm_s_place_holder = NULL;
+               dapl_log(DAPL_DBG_TYPE_ERR,
+                        " ERR: send op exg: alloc mmap_place_holder. %s\n",
+                        strerror(ret));
+               goto local_err;
+       }
+
+       /* pass the placeholder buffer itself (as the host side does), not the
+        * address of the pointer field; flags are 0, so no fixed mapping is
+        * requested */
+       tp->mm_s_peer_addr = (dat_mix_mmap_wr_t *)
+               scif_mmap(tp->mm_s_place_holder,
+                         map_len,
+                         SCIF_PROT_READ | SCIF_PROT_WRITE,
+                         0, tp->scif_ep,
+                         tp->mm_s_peer_addr_off);
+
+       if (tp->mm_s_peer_addr == SCIF_MMAP_FAILED) {
+               dapl_log(DAPL_DBG_TYPE_ERR, " ERR: send op exg: Failed to mmap peer memory\n");
+               goto local_err;
+       }
+
+       dapl_log(DAPL_DBG_TYPE_EXTENSION,
+                " mm_s_place_holder %p, mm_s_peer_addr %p\n",
+                tp->mm_s_place_holder, tp->mm_s_peer_addr);
+
+
+       return 0;
+
+local_err:
+       /* clear the pointer so the close path does not free it a second time */
+       free(tp->mm_s_place_holder);
+       tp->mm_s_place_holder = NULL;
+
+       dapli_mix_mmap_free(tp, MIX_ENOMEM); /* Send abort to host */
+
+remote_err:
+       tp->mm_s_peer_addr_off = SCIF_REGISTER_FAILED;
+       tp->mm_s_peer_addr = NULL;
+
+       return -1;
+}
+
+
+/*
+ * Allocate and register buffers needed for scif_mmap and fast post_send WR's
+ *
+ * Allocates the page-aligned, zeroed local WR array (tp->mm_s_addr) and the
+ * one-page rbuf (tp->mm_r_addr), then scif_register()s the rbuf on
+ * tp->scif_ep and stores its offset in tp->mm_r_addr_off.
+ *
+ * Returns 0 on success. Returns -1 on failure with both buffers freed,
+ * their pointers NULL and mm_r_addr_off == SCIF_REGISTER_FAILED.
+ */
+static int mix_mmap_init(ib_hca_transport_t *tp)
+{
+       int ret, len;
+
+       dapl_log(DAPL_DBG_TYPE_EXTENSION," mix_mmap_init\n");
+
+       tp->mm_s_peer_addr_off = SCIF_REGISTER_FAILED;
+       tp->mm_s_peer_addr = NULL;
+       /* mark "not registered" up front so every failure path is consistent */
+       tp->mm_r_addr_off = SCIF_REGISTER_FAILED;
+
+       len = ALIGN_PAGE(DAT_MIX_MMAP_WR_MAX * sizeof(dat_mix_mmap_wr_t));
+       ret = posix_memalign((void **)&tp->mm_s_addr, 4096, len);
+       if (ret) {
+               /* posix_memalign returns the error number; errno is not set */
+               dapl_log(DAPL_DBG_TYPE_WARN,
+                        "mmap_init: ERR sbuf alloc - %s\n", strerror(ret));
+               tp->mm_s_addr = NULL;
+               goto err;
+       }
+       memset(tp->mm_s_addr, 0, len);
+
+       dapl_log(DAPL_DBG_TYPE_EXTENSION,
+                " mmap_init: sbuf %p ln %d\n", tp->mm_s_addr, len);
+
+       len = ALIGN_PAGE(sizeof(uint32_t));
+       ret = posix_memalign((void **)&tp->mm_r_addr, 4096, len);
+       if (ret) {
+               dapl_log(DAPL_DBG_TYPE_WARN,
+                        "mmap_init: ERR rbuf alloc - %s\n", strerror(ret));
+               /* output pointer is unspecified on failure; do not leave it
+                * for the close path to free */
+               tp->mm_r_addr = NULL;
+               goto err1;
+       }
+       memset((void *)tp->mm_r_addr, 0, len);
+
+       dapl_log(DAPL_DBG_TYPE_EXTENSION,
+                " mmap_init: rbuf %p ln %d\n", tp->mm_r_addr, len);
+
+       tp->mm_r_addr_off =
+               scif_register(tp->scif_ep, (void *)tp->mm_r_addr, len,
+                             (off_t)0, SCIF_PROT_READ | SCIF_PROT_WRITE, 0);
+
+       if (tp->mm_r_addr_off == SCIF_REGISTER_FAILED) {
+               dapl_log(DAPL_DBG_TYPE_WARN,
+                        "mmap_init: ERR scif_reg - %s\n", strerror(errno));
+               goto err2;
+       }
+
+       dapl_log(DAPL_DBG_TYPE_EXTENSION,
+                " mmap_init: success - rbuf scif registered off = 0x%llx\n",
+                tp->mm_r_addr_off);
+
+       tp->mm_s_head = 0;
+       return 0;
+
+err2:
+       free((void *)tp->mm_r_addr);
+       tp->mm_r_addr = NULL;
+
+err1:
+       free((void*)tp->mm_s_addr);
+       tp->mm_s_addr = NULL;
+
+err:
+       return -1;
+}
+
+/*
+ * Free the post_send WR data structures needed for direct scif mmap
+ *
+ * Order: drop our mapping of the peer WR array, ask the peer (via
+ * dapli_mix_mmap_free) to drop its mapping of our rbuf, and only then
+ * unregister and free the local buffers.
+ */
+static void mix_mmap_free(ib_hca_transport_t *tp)
+{
+       dapl_log(DAPL_DBG_TYPE_EXTENSION," Clean send OP\n");
+
+       if (tp->mm_s_peer_addr) {
+               scif_munmap((void *)tp->mm_s_peer_addr,
+                               ALIGN_PAGE(DAT_MIX_MMAP_WR_MAX * sizeof(dat_mix_mmap_wr_t)));
+               tp->mm_s_peer_addr = NULL;
+       }
+
+       free(tp->mm_s_place_holder);
+       tp->mm_s_place_holder = NULL;
+
+       /* unmap host before free local memory.
+        * dapli_mix_mmap_free() only contacts the host while
+        * mm_s_peer_addr_off is still valid, so invalidate the offset after
+        * the call; invalidating it first turns the call into a no-op. */
+       dapli_mix_mmap_free(tp, MIX_SUCCESS);
+       tp->mm_s_peer_addr_off = SCIF_REGISTER_FAILED;
+
+       /* Make sure to unmap this memory at host before unregister and free */
+       if (tp->scif_ep && tp->mm_r_addr_off > 0) {
+               scif_unregister(tp->scif_ep, tp->mm_r_addr_off, ALIGN_PAGE(sizeof(uint32_t)));
+               tp->mm_r_addr_off = SCIF_REGISTER_FAILED;
+       }
+
+       free(tp->mm_s_addr);
+       tp->mm_s_addr = NULL;
+
+       free((void *)tp->mm_r_addr);
+       tp->mm_r_addr = NULL;
+}
+
 /*
  * MIX_IA_OPEN
  */
@@ -214,8 +481,9 @@ int dapli_mix_open(ib_hca_transport_t *tp, char *name, int port, int query_only)
                            tp->scif_ep, ret, len, strerror(errno));
                return -1;
        }
-       dapl_log(DAPL_DBG_TYPE_EXTENSION," Recv'd %s reply on SCIF EP %d, dev_id %d\n",
-                               mix_op_str(msg.hdr.op), tp->scif_ep, msg.hdr.req_id);
+       dapl_log(DAPL_DBG_TYPE_EXTENSION,
+                " Recv'd %s reply on SCIF EP %d, dev_id %d\n",
+                mix_op_str(msg.hdr.op), tp->scif_ep, msg.hdr.req_id);
 
        if (msg.hdr.ver != DAT_MIX_VER || msg.hdr.op != MIX_IA_OPEN ||
            msg.hdr.flags != MIX_OP_RSP || msg.hdr.status != MIX_SUCCESS) {
@@ -244,10 +512,15 @@ int dapli_mix_open(ib_hca_transport_t *tp, char *name, int port, int query_only)
        tp->ib_cm.mtu = msg.dev_attr.mtu; /* proxy sets active_MTU mode */
        tp->dev_id = msg.hdr.req_id;
 
+       /* We do not use this var in MFO, but use it as a flag to signal success */
        if (MFO_EP(&tp->addr))
-               /* We do not use this var in MFO, but use it as a flag to signal success */
                tp->ib_ctx = (struct ibv_context *)0xdeadbeef;
 
+       if (mix_mmap_init(tp)) {
+               dapl_log(DAPL_DBG_TYPE_WARN,
+                        " WARN: init mmap for send_op failed\n");
+       }
+
        dapl_log(DAPL_DBG_TYPE_EXTENSION,
                 " mix_open reply (msg %p, ln %d) EPs %d %d %d - dev_id %d lid 0x%x\n",
                 &msg, len, tp->scif_ep, tp->scif_ev_ep,
@@ -263,6 +536,8 @@ void dapli_mix_close(ib_hca_transport_t *tp)
                 " MIX_IA_CLOSE: tp %p scif EP's %d,%d,%d dev_id %d\n",
                 tp, tp->scif_ep, tp->scif_tx_ep, tp->scif_ev_ep, tp->dev_id);
 
+       mix_mmap_free(tp);
+
        if (tp->scif_ep) {
                scif_close(tp->scif_ep);
                tp->scif_ep = 0;
@@ -1022,14 +1297,16 @@ static inline int mix_proxy_data(ib_qp_handle_t m_qp, dat_mix_sr_t *msg, struct
 int dapli_mix_post_send(ib_qp_handle_t m_qp, int txlen, struct ibv_send_wr *wr, struct ibv_send_wr **bad_wr)
 {
        char cmd[DAT_MIX_MSG_MAX + DAT_MIX_INLINE_MAX];
-       dat_mix_sr_t *msg = (dat_mix_sr_t *)cmd;
+       dat_mix_sr_t *msg = (dat_mix_sr_t *) cmd;
        scif_epd_t mix_ep = m_qp->tp->scif_ep;
-       int ret, i, offset = sizeof(dat_mix_sr_t);
+       int ret, i, stall, off = sizeof(dat_mix_sr_t);
+       ib_hca_transport_t *tp = m_qp->tp;
+       dat_mix_mmap_wr_t *mm_addr;
 
-       dapl_log(DAPL_DBG_TYPE_EXTENSION,
-               " mix_post_send: msg=%p sge=%d len=%d op=%d off=%d (%p)raddr %Lx rkey 0x%x, wr_id %LX\n",
-                msg, wr->num_sge, txlen, wr->opcode, offset, &wr->wr.rdma.remote_addr,
-                wr->wr.rdma.remote_addr, wr->wr.rdma.rkey, wr->wr_id);
+       if (tp->mm_s_peer_addr_off != SCIF_REGISTER_FAILED) {
+               msg = &tp->mm_s_addr[tp->mm_s_head].msg;
+               tp->mm_s_addr[tp->mm_s_head].flags = 0;
+       }
 
        if (wr->opcode != IBV_WR_SEND &&
            wr->opcode != IBV_WR_RDMA_WRITE &&
@@ -1051,21 +1328,47 @@ int dapli_mix_post_send(ib_qp_handle_t m_qp, int txlen, struct ibv_send_wr *wr,
        } else {
                msg->hdr.flags |= MIX_OP_INLINE;
                for (i=0; i < wr->num_sge; i++) {
-                       memcpy(&cmd[offset], (void*)wr->sg_list[i].addr, wr->sg_list[i].length);
-                       offset += wr->sg_list[i].length;
+                       if(tp->mm_s_peer_addr_off != SCIF_REGISTER_FAILED) {
+                               memcpy(&((char *)msg)[off], (void*)wr->sg_list[i].addr, wr->sg_list[i].length);
+                       } else {
+                               memcpy(&cmd[off], (void*)wr->sg_list[i].addr, wr->sg_list[i].length);
+                       }
+                       off += wr->sg_list[i].length;
                }
        }
 
-       ret = scif_send(mix_ep, msg, offset, SCIF_SEND_BLOCK);
-       if (ret != offset) {
-               dapl_log(1, " ERR: %s on %d, ret %d, exp %d, error %s\n",
-                        mix_op_str(msg->hdr.op), mix_ep, ret,
-                        offset, strerror(errno));
-               return -1;
-       }
+       if (tp->mm_s_peer_addr_off != SCIF_REGISTER_FAILED) {
+               stall=0;
+               while (((tp->mm_s_head + 1) % DAT_MIX_MMAP_WR_MAX) == *tp->mm_r_addr) {
+                       if(!stall) {
+                               dapl_log(DAPL_DBG_TYPE_EXTENSION,
+                                        "post_send mmap: WR qfull. hd %d tl %d\n",
+                                        tp->mm_s_head, *tp->mm_r_addr);
+                       }
+                       stall++;
+                       usleep(1);
+               }
 
-       dapl_log(DAPL_DBG_TYPE_EXTENSION," Sent MIX_SEND on SCIF EP %d, mlen=%d\n", mix_ep, offset);
+               /* Copy WR + inline via mmap, sync data, notify peer */
+               mm_addr = tp->mm_s_peer_addr + tp->mm_s_head;
+
+               memcpy((void *)mm_addr, (void *)msg, ALIGN_64(off));
+                __sync_synchronize();
+
+                *((uint32_t *)(((char *)mm_addr) + offsetof(dat_mix_mmap_wr_t, flags))) = 1;
+                tp->mm_s_head = (tp->mm_s_head + 1) % DAT_MIX_MMAP_WR_MAX; /* next */
+
+       } else {
+               ret = scif_send(mix_ep, msg, off, SCIF_SEND_BLOCK);
+               if (ret != off) {
+                       dapl_log(1, " ERR: %s on %d, ret %d, exp %d, error %s\n",
+                                mix_op_str(msg->hdr.op), mix_ep, ret,
+                                off, strerror(errno));
+                       return -1;
+               }
+       }
        return 0;
+
 }
 
 int dapli_mix_post_recv(ib_qp_handle_t m_qp, int len, struct ibv_recv_wr *wr, struct ibv_recv_wr **bad_wr)
index 38e7599..ebb1497 100644 (file)
@@ -80,6 +80,7 @@ static void mix_get_prov_attr(mcm_scif_dev_t *smd, dat_mix_prov_attr_t *pr_attr)
        pr_attr->system_guid = system_guid;
        pr_attr->cpu_model = mcm_cpu_model;
        pr_attr->cpu_family = mcm_cpu_family;
+       pr_attr->cap |= DAT_MIX_MMAP_CAP;
 }
 
 /* close MCM device, MIC client, md->slock held */
@@ -677,6 +678,133 @@ resp:
        return (scif_send_msg(smd->scif_op_ep, (void*)pmsg, len));
 }
 
+
+/*
+ * Handle a MIX_MMAP_FREE request: read the rest of the message, unmap the
+ * peer memory held in smd->mm_s_peer_addr, free the local placeholder and
+ * send the message back as a response.
+ *
+ * A request whose hdr.status is not MIX_SUCCESS additionally resets
+ * smd->mm_s_peer_addr_off to SCIF_REGISTER_FAILED.
+ *
+ * NOTE(review): the return value is whatever the last scif call returned;
+ * whether scif_send_msg() returns the byte count or 0 on success is not
+ * visible here, so confirm the "ret != len" check below against it.
+ */
+static int mix_mmap_free(mcm_scif_dev_t *smd, dat_mix_mmap_addr_t *pmsg)
+{
+       int ret, len;
+
+       /* hdr already read, get operation data */
+       len = sizeof(dat_mix_mmap_addr_t) - sizeof(dat_mix_hdr_t);
+       ret = scif_recv(smd->scif_op_ep, ((char*)pmsg + sizeof(dat_mix_hdr_t)), len, SCIF_RECV_BLOCK);
+       if (ret != len) {
+               mlog(0, " ERR: ret %d, exp %d %s\n", ret, len, strerror(errno));
+               return ret;
+       }
+
+       if (pmsg->hdr.status != MIX_SUCCESS ) {
+               /* MIC could not init after host ACK ADDR_EXG - fall back to scif_send */
+               smd->mm_s_peer_addr_off = SCIF_REGISTER_FAILED;
+               mlog(0, " WARN: MIC failed to init SEND_OP via mmap memory\n");
+       }
+
+       /* response status: success unless the unmap below fails */
+       pmsg->hdr.status = MIX_SUCCESS;
+
+       /* MIC side wants us to stop accessing its memory */
+       if (smd->mm_s_peer_addr) {
+               ret = scif_munmap((void *)smd->mm_s_peer_addr, ALIGN_PAGE(sizeof(int)));
+               if (ret < 0) {
+                       mlog(0, " ERR: scif_munmap %s\n", strerror(errno));
+                       pmsg->hdr.status = MIX_EINVAL;
+               }
+       }
+
+       /* A valid mm_s_peer_addr_off with a NULL mm_s_peer_addr marks that we
+        * stopped using the mmap memory: we are closing down, but pending
+        * SEND_OP entries keep being processed */
+       smd->mm_s_peer_addr = NULL;
+
+       if (smd->mm_s_place_holder) {
+               free(smd->mm_s_place_holder);
+               smd->mm_s_place_holder = NULL;
+       }
+
+       pmsg->hdr.flags = MIX_OP_RSP;
+       len = sizeof(dat_mix_mmap_addr_t);
+
+       /* send back response */
+       if (smd->scif_op_ep) {
+               ret = scif_send_msg(smd->scif_op_ep, (void*)pmsg, len);
+               if (ret != len) {
+                       mlog(0, " ERR: ret %d, exp %d %s\n",
+                               ret, len, strerror(errno));
+                       return ret;
+               }
+       }
+
+       return ret;
+}
+
+
+/*
+ * Handle a MIX_MMAP_ALLOC request: read the peer's scif offset, map it
+ * into smd->mm_s_peer_addr and reply with our own offset
+ * (smd->mm_r_addr_off).
+ *
+ * The response is always sent; hdr.status is MIX_SUCCESS when the mapping
+ * is in place, MIX_ENOMEM when local memory is missing or cannot be
+ * allocated, and MIX_EINVAL for a bad peer offset or a failed scif_mmap.
+ * On any failure after the offset is stored, smd->mm_s_peer_addr_off is
+ * reset to SCIF_REGISTER_FAILED.
+ *
+ * Returns the scif_recv result on a short read, else the result of
+ * scif_send_msg() for the response.
+ */
+static int mix_mmap_alloc(mcm_scif_dev_t *smd, dat_mix_mmap_addr_t *pmsg)
+{
+       int ret, len;
+
+       /* hdr already read, get operation data */
+       len = sizeof(dat_mix_mmap_addr_t) - sizeof(dat_mix_hdr_t);
+       ret = scif_recv(smd->scif_op_ep, ((char*)pmsg + sizeof(dat_mix_hdr_t)), len, SCIF_RECV_BLOCK);
+       if (ret != len) {
+               mlog(0, " ERR: ret %d, exp %d %s\n", ret, len, strerror(errno));
+               return ret;
+       }
+
+       pmsg->hdr.status = MIX_EINVAL;
+
+       mlog(8, " mm_s_peer_addr_off from MIC 0x%llx\n", pmsg->addr);
+
+       if (pmsg->addr == SCIF_REGISTER_FAILED) {
+               mlog(0, " ERR: op send got invalid input 0x%llx\n", pmsg->addr);
+               goto resp;
+       }
+
+       if(!smd->mm_r_addr) {
+               /* init_smd_send_op_mmap failed - fall back to reg OP */
+               mlog(8, " init_smd_send_op_mmap failed - send MIC ENOMEM\n");
+               pmsg->hdr.status = MIX_ENOMEM;
+               goto resp;
+       }
+
+       smd->mm_s_peer_addr_off = pmsg->addr;
+
+       ret = posix_memalign((void **)&smd->mm_s_place_holder, 4096, ALIGN_PAGE(sizeof(int)));
+       if (ret) {
+               mlog(0, " ERR: alloc mm_s_place_holder, ret=%d\n", ret);
+               smd->mm_s_peer_addr_off = SCIF_REGISTER_FAILED;
+               smd->mm_s_place_holder = NULL;
+               /* allocation failure: report it as such, like the branch above */
+               pmsg->hdr.status = MIX_ENOMEM;
+               goto resp;
+       }
+
+       /* mmap peer buffer so we can write into it like a reg memory */
+       smd->mm_s_peer_addr = (volatile int *)
+               scif_mmap(smd->mm_s_place_holder,
+                         ALIGN_PAGE(sizeof(int)),
+                         SCIF_PROT_READ | SCIF_PROT_WRITE, 0,
+                         smd->scif_op_ep, smd->mm_s_peer_addr_off);
+
+       if (smd->mm_s_peer_addr == SCIF_MMAP_FAILED) {
+               /* log before cleanup: free() may overwrite errno */
+               mlog(0, " ERR: scif_mmap m_s_peer_addr %s\n", strerror(errno));
+               smd->mm_s_peer_addr_off = SCIF_REGISTER_FAILED;
+               smd->mm_s_peer_addr = NULL;
+               free(smd->mm_s_place_holder);
+               smd->mm_s_place_holder = NULL;
+               goto resp;
+       }
+
+       /* Send other side our scif base address of the SEND OP ARRAY */
+       pmsg->addr = smd->mm_r_addr_off;
+       pmsg->hdr.status = MIX_SUCCESS;
+
+       /* Initialize the other side, WR tail */
+       *smd->mm_s_peer_addr = 0;
+
+       mlog(8, " mmap done: mm_s_peer_off 0x%llx, mm_r_addr_off 0x%llx\n",
+               smd->mm_s_peer_addr_off, smd->mm_r_addr_off);
+resp:
+       /* send back response */
+       pmsg->hdr.flags = MIX_OP_RSP;
+       len = sizeof(dat_mix_mmap_addr_t);
+
+       return (scif_send_msg(smd->scif_op_ep, (void*)pmsg, len));
+}
+
 /* create new proxy-out PZ */
 static int mix_pz_create(mcm_scif_dev_t *smd, dat_mix_pz_t *pmsg)
 {
@@ -2251,10 +2379,6 @@ static int mix_proxy_out(mcm_scif_dev_t *smd, dat_mix_sr_t *pmsg, mcm_qp_t *m_qp
        m_wr->wr.sg_list = m_wr->sg;
        m_wr->wr.num_sge = len ? 1:0;
 
-       mlog(4, " INLINE m_wr[%d] %p raddr %p rkey 0x%x, ib_wr raddr %p rkey 0x%x %d bytes\n",
-               m_qp->wr_hd, m_wr, pmsg->wr.wr.rdma.remote_addr, pmsg->wr.wr.rdma.rkey,
-               m_wr->wr.wr.rdma.remote_addr, m_wr->wr.wr.rdma.rkey, len);
-
        /* M_WR */
        m_wr->org_id = pmsg->wr.wr_id;
        m_wr->m_idx = 0;
@@ -2338,25 +2462,22 @@ retry_mr:
                        mpxy_unlock(&smd->tblock);
                        goto bail;
                }
-               mlog(0x10, "[%d:%d:%d] %s_INLINE_post_sig: qp %p wr %p wr_id %p flgs 0x%x,"
-                       " pcnt %d sg_rate %d hd %d tl %d sz %d m_idx %x\n",
-                       m_qp->smd->md->mc->scif_id, m_qp->smd->entry.tid,
-                       m_qp->r_entry.tid,
-                       (MXF_EP(&m_qp->cm->msg.daddr1)) ? "po_pi":"po_direct",
-                       m_qp, m_wr, m_wr->wr.wr_id, m_wr->wr.send_flags,
-                       m_qp->post_cnt, mcm_rw_signal, m_qp->wr_hd, m_qp->wr_tl,
-                       m_wr->wr.sg_list->length, m_wr->m_idx);
        }
        mpxy_unlock(&smd->tblock);
 
        if (len) {
                /* copy data into proxy buffer, signal TX thread via wr_id */
-               ret = scif_recv(smd->scif_op_ep, (void*)m_wr->sg->addr, len, SCIF_RECV_BLOCK);
-               if (ret != len) {
-                       mlog(0, " ERR: scif_recv inline DATA, ret %d, exp %d\n", ret, len);
-                       ret = errno;
-                       len = 0;
-                       goto bail;
+               if (smd->mm_s_peer_addr_off != SCIF_REGISTER_FAILED && pmsg->hdr.op == MIX_SEND) {
+                       /* inline data is after the msg */
+                       memcpy((void*)m_wr->sg->addr, pmsg + 1, len);
+               } else {
+                       ret = scif_recv(smd->scif_op_ep, (void*)m_wr->sg->addr, len, SCIF_RECV_BLOCK);
+                       if (ret != len) {
+                               mlog(0, " ERR: scif_recv inline DATA, ret %d, exp %d\n", ret, len);
+                               ret = errno;
+                               len = 0;
+                               goto bail;
+                       }
                }
        }
 
@@ -2374,8 +2495,10 @@ bail:
                struct dat_mix_wc wc;
                char dbuf[DAT_MIX_INLINE_MAX];
 
-               if (len) /* drain inline data */
+               if (len && !(smd->mm_s_peer_addr_off != SCIF_REGISTER_FAILED && pmsg->hdr.op == MIX_SEND)) {
+                        /* drain inline data */
                        scif_recv(smd->scif_op_ep, dbuf, len, SCIF_RECV_BLOCK);
+               }
 
                wc.wr_id = pmsg->wr.wr_id;
                wc.byte_len = len;
@@ -2388,7 +2511,7 @@ bail:
        return ret;
 }
 
-/* Post SEND message request, IB send or rdma write, operation channel */
+/* Post SEND message request, IB send or rdma write, operation channel Via scif send*/
 static int mix_post_send(mcm_scif_dev_t *smd, dat_mix_sr_t *pmsg)
 {
        int len, ret;
@@ -2434,6 +2557,58 @@ static int mix_post_send(mcm_scif_dev_t *smd, dat_mix_sr_t *pmsg)
        return (mix_proxy_out(smd, pmsg, m_qp));
 }
 
+/*
+ * Post SEND message request, IB send or rdma write, operation channel
+ * via scif mmap memory.
+ *
+ * Consumes up to 32 entries per call from the WR array at smd->mm_r_addr,
+ * starting at smd->mm_r_head. For each entry it polls the flags word (up
+ * to 100 checks, yielding the CPU in between), hands the request to
+ * mix_proxy_out(), clears the flags, advances mm_r_head and, while
+ * smd->mm_s_peer_addr is mapped, writes the new head there for the peer.
+ *
+ * Returns 0 when no entry became ready or all entries were processed,
+ * -1 for an entry whose op is not MIX_SEND, POLLERR when the QP id is
+ * unknown, or the non-zero result of mix_proxy_out().
+ *
+ * NOTE(review): the -1 and POLLERR returns leave the entry's flags set and
+ * mm_r_head unchanged, so the same entry is seen again on the next call -
+ * confirm the caller tears the device down in that case.
+ */
+int mix_post_send_ext(mcm_scif_dev_t *smd)
+{
+       int ret, retry, max_io = 32;
+       struct mcm_qp *m_qp;
+       dat_mix_mmap_wr_t *mm_wr_entry;
+       volatile dat_mix_sr_t *pmsg;
+
+       while (max_io--) {
+               retry = 100;
+               mm_wr_entry = &smd->mm_r_addr[smd->mm_r_head];
+               pmsg = &mm_wr_entry->msg;
+
+               /* wait a little for a SEND OP msg; give up (return 0) when
+                * the retries run out with flags still clear */
+               while(retry-- && !mm_wr_entry->flags) {
+                       if(!retry)
+                               return 0;
+                       sched_yield();
+               }
+
+               if (pmsg->hdr.op != MIX_SEND) {
+                       mlog(0, " ERR: no MIX_SEND OP CODE? Got %d, exp %d\n",
+                               pmsg->hdr.op, MIX_SEND);
+                       return -1;
+               }
+
+               /* get QP by ID */
+               m_qp = mix_get_qp(smd, pmsg->qp_id);
+               if (!m_qp || !m_qp->ib_qp2) {
+                       mlog(0, " ERR: mix_get_qp id %d not found\n",
+                               pmsg->qp_id);
+                       return POLLERR; /* async err, no QP to report */
+               }
+
+               ret = mix_proxy_out(smd, (dat_mix_sr_t *)pmsg, m_qp);
+
+               /* Mark entry empty, update local head and MIC head; this is
+                * done before the ret check, so a failed entry is still consumed */
+               mm_wr_entry->flags = 0;
+               smd->mm_r_head = ((smd->mm_r_head + 1) % smd->mm_r_last);
+
+               if(smd->mm_s_peer_addr)
+                       *smd->mm_s_peer_addr = smd->mm_r_head;
+
+               if (ret) {
+                       mlog(0, " ERR: failed mix proxy out. ret %d\n", ret);
+                       return ret;
+               }
+       }
+
+       return 0;
+}
+
 /* Post RECV message request on Proxy-RX channel */
 static int mix_post_recv(mcm_scif_dev_t *smd, dat_mix_sr_t *pmsg)
 {
@@ -2582,6 +2757,12 @@ int mix_scif_recv(mcm_scif_dev_t *smd, scif_epd_t scif_ep)
        case MIX_CM_DISC:
                ret = mix_cm_disc_out(smd, (dat_mix_cm_t *)phdr, scif_ep);
                break;
+       case MIX_MMAP_ALLOC:
+               ret = mix_mmap_alloc(smd, (dat_mix_mmap_addr_t *)phdr);
+               break;
+       case MIX_MMAP_FREE:
+               ret = mix_mmap_free(smd, (dat_mix_mmap_addr_t *)phdr);
+               break;
        case MIX_CM_DREP:
        default:
                mlog(0, " ERR: smd %p unknown msg->op: %d, close dev_id %d\n",
index d6de5be..3df50e8 100644 (file)
@@ -307,6 +307,29 @@ void mcm_destroy_md(struct mcm_ib_dev *md)
        return;
 }
 
+/*
+ * Tear down the mmap based post-send work request channel of an smd.
+ *
+ * Unregisters the SCIF window covering the WR ring (only when it was
+ * registered and the op endpoint is set), frees the ring memory, unmaps
+ * the peer head/tail page and frees the local place holder buffer.
+ * Every released field is reset to its "unset" value, so calling this
+ * function again on the same smd is harmless.
+ */
+void destroy_smd_send_op_mmap(mcm_scif_dev_t *smd)
+{
+       if (smd->mm_r_addr_off != SCIF_REGISTER_FAILED && smd->scif_op_ep) {
+               scif_unregister(smd->scif_op_ep, smd->mm_r_addr_off, smd->mm_r_len);
+               smd->mm_r_addr_off = SCIF_REGISTER_FAILED;
+       }
+
+       /* free(NULL) is a no-op, no guard needed */
+       free(smd->mm_r_addr);
+       smd->mm_r_addr = NULL;
+
+       /*
+        * Plain null test: a relational comparison against a null pointer
+        * is not a valid way to check for "pointer is set".
+        */
+       if (smd->mm_s_peer_addr != NULL && smd->scif_op_ep) {
+               scif_munmap((void *)smd->mm_s_peer_addr, ALIGN_PAGE(sizeof(int)));
+               smd->mm_s_peer_addr = NULL;
+       }
+
+       /*
+        * The ring is gone: disarm the offset as well, since the op thread
+        * uses mm_s_peer_addr_off to decide whether to poll the WR ring and
+        * polling dereferences mm_r_addr.
+        */
+       smd->mm_s_peer_addr_off = SCIF_REGISTER_FAILED;
+
+       free(smd->mm_s_place_holder);
+       smd->mm_s_place_holder = NULL;
+}
+
 void mpxy_destroy_bpool(mcm_scif_dev_t *smd)
 {
        if (smd->m_offset && smd->scif_tx_ep)
@@ -444,6 +467,9 @@ void mpxy_destroy_smd(mcm_scif_dev_t *smd)
        if (smd->ref_cnt)
                mlog(0, " WARNING: ref_cnt not 0, = %d \n", smd->ref_cnt);
 
+       destroy_smd_send_op_mmap(smd);
+       mlog(8, " send op via scif wt destroyed\n");
+
        mpxy_destroy_bpool(smd);
        mlog(8, " proxy buffer pools destroyed \n");
 
@@ -468,6 +494,45 @@ void mpxy_destroy_smd(mcm_scif_dev_t *smd)
        free(smd);
 }
 
+/*
+ * Set up the host side of the mmap based post-send work request channel.
+ *
+ * Resets the mmap bookkeeping fields of the smd, allocates a page aligned,
+ * zero filled array of DAT_MIX_MMAP_WR_MAX work request entries and
+ * registers it with SCIF on the op endpoint for read/write access.
+ *
+ * Returns 0 on success. Returns -1 when the allocation or the SCIF
+ * registration fails; in that case mm_r_addr is NULL and mm_r_addr_off
+ * is SCIF_REGISTER_FAILED.
+ */
+static int init_smd_send_op_mmap(mcm_scif_dev_t *smd)
+{
+       int ret, len;
+
+       smd->mm_s_peer_addr_off = SCIF_REGISTER_FAILED;
+       smd->mm_r_addr_off = SCIF_REGISTER_FAILED;
+       smd->mm_s_peer_addr = NULL;
+       smd->mm_s_place_holder = NULL;
+       smd->mm_r_head = 0;
+       smd->mm_r_last = DAT_MIX_MMAP_WR_MAX;
+
+       len = ALIGN_PAGE(DAT_MIX_MMAP_WR_MAX * (sizeof(dat_mix_mmap_wr_t)));
+       smd->mm_r_len = len;
+       ret = posix_memalign((void **)&smd->mm_r_addr, 4096, len);
+       if (ret) {
+               /*
+                * posix_memalign() reports failure through its return value
+                * and does not set errno, so decode ret, not errno.
+                */
+               mlog(0, " ERR: alloc r_addr ln=%d, %s\n", len, strerror(ret));
+               smd->mm_r_addr = NULL;
+               return -1;
+       }
+       memset(smd->mm_r_addr, 0, len);
+
+       mlog(8, " MMAP send_op: buf %p len %d\n", smd->mm_r_addr, len);
+
+       smd->mm_r_addr_off = scif_register(smd->scif_op_ep, smd->mm_r_addr, len,
+                                     (off_t)0, SCIF_PROT_READ | SCIF_PROT_WRITE, 0);
+
+       if (smd->mm_r_addr_off == SCIF_REGISTER_FAILED) {
+               mlog(0, " ERR: scif_register addr=%p,%d ret=%s\n", smd->mm_r_addr, len, strerror(errno));
+               free(smd->mm_r_addr);
+               smd->mm_r_addr = NULL;
+               return -1;
+       }
+       /* off_t width is platform dependent: cast to match the %llx conversion */
+       mlog(8, " MMAP send_op: addr=%p, off=0x%llx, len %d\n",
+               smd->mm_r_addr, (unsigned long long)smd->mm_r_addr_off, len);
+
+       return 0;
+}
+
+
 static int create_smd_bpool(mcm_scif_dev_t *smd)
 {
        int ret;
@@ -618,6 +683,9 @@ static mcm_scif_dev_t *mcm_create_smd(mcm_ib_dev_t *md, scif_epd_t op_ep, scif_e
        if (!smd->cm_id)
                goto err;
 
+       /* no need to check ret val - in case of failure we fall back to reg OP */
+       init_smd_send_op_mmap(smd);
+
        if (create_smd_bpool(smd))
                goto err;
 
@@ -647,6 +715,8 @@ static mcm_scif_dev_t *mcm_create_smd(mcm_ib_dev_t *md, scif_epd_t op_ep, scif_e
        return smd;
 err:
        if (smd) {
+               if (smd->mm_r_addr)
+                       free(smd->mm_r_addr);
                if (smd->cmd_buf)
                        free(smd->cmd_buf);
                if (smd->ports)
@@ -975,6 +1045,10 @@ void mpxy_tx_thread(void *mic_client)
                }
                mc->tx_busy = data;
                time_ms = (data) ? 0:-1;
+
+               if (time_ms && mcm_op_poll)
+                       time_ms = 0;
+
                mpxy_unlock(&mc->txlock);
                if (time_ms == -1) mlog(0x10," sleep\n");
                mcm_select(set, time_ms);
@@ -1027,6 +1101,13 @@ void mpxy_op_thread(void *mic_client)
                                smd->th_ref_cnt++;
                                mpxy_unlock(&md->slock);
 
+                               ret = 0;
+                               if (smd->mm_s_peer_addr_off != SCIF_REGISTER_FAILED)
+                                       ret = mix_post_send_ext(smd); /* mmap operation */
+
+                               if (ret == POLLERR)
+                                       mix_close_device(md, smd);
+
                                ret = mcm_poll(smd->scif_op_ep, POLLIN); /* operations */
                                if (ret == POLLIN)
                                        ret = mix_scif_recv(smd, smd->scif_op_ep);
@@ -1048,13 +1129,19 @@ void mpxy_op_thread(void *mic_client)
                                smd = next;
                        }
                        mpxy_unlock(&md->slock);
-                       sched_yield();
+
+                       if (smd)
+                               sched_yield();
                }
                mpxy_unlock(&mc->oplock);
                /* data-path, loop if busy or device open & single core */
                if ((mc->tx_busy || mc->rx_busy) || (smd_cnt && mcm_op_poll))
                        time_ms = 0;
-               mcm_select(set, time_ms);
+               else {
+                       time_ms = 0;
+               }
+
+               mcm_select(set, time_ms); /* Another sched yield */
                if (time_ms == -1) mlog(0x10," OP wake\n");
                if (mcm_poll(mc->op_pipe[0], POLLIN) == POLLIN)
                        read(mc->op_pipe[0], rbuf, 2);
@@ -1217,6 +1304,10 @@ void mpxy_rx_thread(void *mic_client)
                }
                mc->rx_busy = data;
                time_ms = data ? 0:-1;
+
+               if (time_ms && mcm_op_poll)
+                       time_ms = 0;
+
                mpxy_unlock(&mc->rxlock);
                if (time_ms == -1) mlog(0x10," RX sleep\n");
                mcm_select(set, time_ms);
index 709aa46..34381cf 100644 (file)
@@ -370,6 +370,16 @@ typedef struct mcm_scif_dev {
        char                    *cmd_buf;       /* operation command buffer  */
        struct dat_mix_dev_attr dev_attr;       /* Manage attributes per MIC client open */
        uint8_t                 mtu_env;        /* mtu override with DAPL_IB_MTU */
+
+       dat_mix_mmap_wr_t       *mm_r_addr;     /* Address of post_send WR array, updated from MIC via writes */
+       int                     mm_r_len;       /* total bytes of WR array */
+       off_t                   mm_r_addr_off;  /* WR entry rcv buffer, map from MIC via mmap, SCIF registration */
+       int                     mm_r_head;      /* location for new posted WR entries from MIC */
+       int                     mm_r_last;      /* mmap WR array size */
+       off_t                   mm_s_peer_addr_off;     /* peer scif address for tail update */
+       volatile int            *mm_s_peer_addr;        /* writing to this address is writing to remote mem */
+       int                     *mm_s_place_holder;     /* alloc local memory for scif_mmap, not referenced */
+
 #ifdef MCM_PROFILE
        uint16_t                m_hd_ro;        /* HD,TL tracking */
        uint16_t                m_tl_ro;
@@ -562,6 +572,7 @@ void m_cq_free(struct mcm_cq *m_cq);
 void m_qp_free(struct mcm_qp *m_qp);
 void m_mr_free(struct mcm_mr *m_mr);
 int mix_scif_recv(mcm_scif_dev_t *smd, scif_epd_t scif_ep);
+int mix_post_send_ext(mcm_scif_dev_t *smd);
 int mix_cm_disc_in(mcm_cm_t *m_cm);
 int mix_cm_rtu_in(mcm_cm_t *m_cm, dat_mcm_msg_t *pkt, int pkt_len);
 int mix_cm_req_in(mcm_cm_t *cm, dat_mcm_msg_t *pkt, int pkt_len);
index f3fd722..349a1c0 100644 (file)
@@ -58,7 +58,8 @@ scif_listen_qlen 240
 # 
 # The default is 1 
 
-mcm_affinity 2
+mcm_affinity 1
+mcm_op_poll 1
 
 # mcm_affinity_base_mic:
 # Specifies a hard binding for CPU id base value used for affinity support of