Kernel Patches
Create nvme_tcp.h (initiator side)
Create a new file nvme_tcp.h in the include/trace/events directory. This file defines the tracepoints for the nvme-tcp module.
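For orientation, here is a minimal sketch of what one tracepoint definition in this header could look like; it mirrors the trace_nvme_tcp_queue_request() call inserted into drivers/nvme/host/tcp.c below, but the exact events and fields recorded by the real header may differ.

```c
/* Sketch of include/trace/events/nvme_tcp.h -- illustrative only. */
#undef TRACE_SYSTEM
#define TRACE_SYSTEM nvme_tcp

#if !defined(_TRACE_NVME_TCP_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_NVME_TCP_H

#include <linux/tracepoint.h>
#include <linux/blk-mq.h>
#include <linux/nvme.h>

/* Mirrors the trace_nvme_tcp_queue_request() call added in host/tcp.c. */
TRACE_EVENT(nvme_tcp_queue_request,
	TP_PROTO(struct request *rq, struct nvme_command *cmd, int qid),
	TP_ARGS(rq, cmd, qid),
	TP_STRUCT__entry(
		__field(int, tag)
		__field(u8, opcode)
		__field(int, qid)
	),
	TP_fast_assign(
		__entry->tag	= rq->tag;
		__entry->opcode	= cmd->common.opcode;
		__entry->qid	= qid;
	),
	TP_printk("tag=%d opcode=0x%x qid=%d",
		  __entry->tag, __entry->opcode, __entry->qid)
);

/* ... one TRACE_EVENT per tracepoint exported below ... */

#endif /* _TRACE_NVME_TCP_H */

/* This part must be outside the include guard */
#include <trace/define_trace.h>
```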
Create nvmet_tcp.h (target side)
Create a new file nvmet_tcp.h in the include/trace/events directory. This file defines the tracepoints for the nvmet-tcp module.
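The file follows the same TRACE_SYSTEM/include-guard scaffolding as nvme_tcp.h above, with TRACE_SYSTEM set to nvmet_tcp. A sketch of one target-side event that also carries the receive timestamp passed in from the hooks below (illustrative only; struct nvme_tcp_cmd_pdu comes from linux/nvme-tcp.h):

```c
/* Sketch only -- mirrors the trace_nvmet_tcp_done_recv_pdu() call added below. */
TRACE_EVENT(nvmet_tcp_done_recv_pdu,
	TP_PROTO(struct nvme_tcp_cmd_pdu *pdu, int qid, s64 recv_time),
	TP_ARGS(pdu, qid, recv_time),
	TP_STRUCT__entry(
		__field(u16, command_id)
		__field(int, qid)
		__field(s64, recv_time)
	),
	TP_fast_assign(
		__entry->command_id	= pdu->cmd.common.command_id;
		__entry->qid		= qid;
		__entry->recv_time	= recv_time;
	),
	TP_printk("cid=%u qid=%d recv_time=%lld",
		  __entry->command_id, __entry->qid, __entry->recv_time)
);
```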
Modify block/blk-core.c (initiator side)
Modify the block/blk-core.c file as follows. (Search for the unchanged context lines in the file to locate the insertion points.)
- Export the predefined tracepoint block_rq_complete.
EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_remap);
EXPORT_TRACEPOINT_SYMBOL_GPL(block_rq_remap);
EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_complete);
EXPORT_TRACEPOINT_SYMBOL_GPL(block_split);
EXPORT_TRACEPOINT_SYMBOL_GPL(block_unplug);
EXPORT_TRACEPOINT_SYMBOL_GPL(block_rq_insert);
+ EXPORT_TRACEPOINT_SYMBOL_GPL(block_rq_complete);
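Exporting block_rq_complete lets an out-of-tree module (such as the profiler) attach a probe to it. Below is a minimal sketch of such a consumer; the module and function names are placeholders, and the probe prototype must match the TP_PROTO in include/trace/events/block.h for the kernel being patched (the error argument is int on older kernels, blk_status_t on newer ones).

```c
#include <linux/module.h>
#include <trace/events/block.h>

/* Probe prototype must match the TP_PROTO of block_rq_complete. */
static void probe_block_rq_complete(void *data, struct request *rq,
				    blk_status_t error, unsigned int nr_bytes)
{
	/* e.g. timestamp the completion and correlate it with earlier events */
}

static int __init ntprof_blk_init(void)
{
	return register_trace_block_rq_complete(probe_block_rq_complete, NULL);
}

static void __exit ntprof_blk_exit(void)
{
	unregister_trace_block_rq_complete(probe_block_rq_complete, NULL);
	tracepoint_synchronize_unregister();
}

module_init(ntprof_blk_init);
module_exit(ntprof_blk_exit);
MODULE_LICENSE("GPL");
```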
Modify drivers/nvme/host/tcp.c (initiator side)
Modify the drivers/nvme/host/tcp.c file as follows.
- Export the tracepoints.
#include "nvme.h"
#include "fabrics.h"
+ #include "trace.h"
+ #define CREATE_TRACE_POINTS
+ #include <trace/events/nvme_tcp.h>
+
+ EXPORT_TRACEPOINT_SYMBOL_GPL(nvme_tcp_queue_rq);
+ EXPORT_TRACEPOINT_SYMBOL_GPL(nvme_tcp_queue_request);
+ EXPORT_TRACEPOINT_SYMBOL_GPL(nvme_tcp_try_send_cmd_pdu);
+ EXPORT_TRACEPOINT_SYMBOL_GPL(nvme_tcp_try_send_data_pdu);
+ EXPORT_TRACEPOINT_SYMBOL_GPL(nvme_tcp_try_send_data);
+ EXPORT_TRACEPOINT_SYMBOL_GPL(nvme_tcp_done_send_req);
+ EXPORT_TRACEPOINT_SYMBOL_GPL(nvme_tcp_handle_c2h_data);
+ EXPORT_TRACEPOINT_SYMBOL_GPL(nvme_tcp_recv_data);
+ EXPORT_TRACEPOINT_SYMBOL_GPL(nvme_tcp_process_nvme_cqe);
+ EXPORT_TRACEPOINT_SYMBOL_GPL(nvme_tcp_handle_r2t);
+ EXPORT_TRACEPOINT_SYMBOL_GPL(nvme_tcp_try_send);
struct nvme_tcp_queue;
/* Define the socket priority to use for connections were it
- Add an attribute recv_time to struct nvme_tcp_queue to record network packet receive time.
struct nvme_tcp_queue {
struct socket *sock;
+ u64 recv_time;
struct work_struct io_work;
int io_cpu;
- Enable recording of receive timestamps on the socket.
nvme_tcp_reclassify_socket(queue->sock);
/* Single syn retry */
tcp_sock_set_syncnt(queue->sock->sk, 1);
/* Set TCP no delay */
tcp_sock_set_nodelay(queue->sock->sk);
+ sock_enable_timestamps(queue->sock->sk);
- Insert the tracepoints in the following functions.
static inline void nvme_tcp_queue_request(struct nvme_tcp_request *req,
bool sync, bool last)
{
struct nvme_tcp_queue *queue = req->queue;
bool empty;
+ trace_nvme_tcp_queue_request(blk_mq_rq_from_pdu(req), req->req.cmd, nvme_tcp_queue_id(queue));
empty = llist_add(&req->lentry, &queue->req_list) &&
list_empty(&queue->send_list) && !queue->request;
static int nvme_tcp_process_nvme_cqe(struct nvme_tcp_queue *queue,
struct nvme_completion *cqe)
{
struct nvme_tcp_request *req;
struct request *rq;
rq = nvme_find_rq(nvme_tcp_tagset(queue), cqe->command_id);
if (!rq) {
dev_err(queue->ctrl->ctrl.device,
"got bad cqe.command_id %#x on queue %d\n",
cqe->command_id, nvme_tcp_queue_id(queue));
nvme_tcp_error_recovery(&queue->ctrl->ctrl);
return -EINVAL;
}
+ trace_nvme_tcp_process_nvme_cqe(rq, (void *)queue->pdu, nvme_tcp_queue_id(queue), queue->recv_time);
req = blk_mq_rq_to_pdu(rq);
if (req->status == cpu_to_le16(NVME_SC_SUCCESS))
req->status = cqe->status;
if (!nvme_try_complete_req(rq, req->status, cqe->result))
nvme_complete_rq(rq);
queue->nr_cqe++;
return 0;
}
static int nvme_tcp_handle_c2h_data(struct nvme_tcp_queue *queue,
struct nvme_tcp_data_pdu *pdu)
{
struct request *rq;
rq = nvme_find_rq(nvme_tcp_tagset(queue), pdu->command_id);
if (!rq) {
dev_err(queue->ctrl->ctrl.device,
"got bad c2hdata.command_id %#x on queue %d\n",
pdu->command_id, nvme_tcp_queue_id(queue));
return -ENOENT;
}
if (!blk_rq_payload_bytes(rq)) {
dev_err(queue->ctrl->ctrl.device,
"queue %d tag %#x unexpected data\n",
nvme_tcp_queue_id(queue), rq->tag);
return -EIO;
}
queue->data_remaining = le32_to_cpu(pdu->data_length);
+ trace_nvme_tcp_handle_c2h_data(rq, nvme_tcp_queue_id(queue), queue->data_remaining, queue->recv_time);
if (pdu->hdr.flags & NVME_TCP_F_DATA_SUCCESS &&
unlikely(!(pdu->hdr.flags & NVME_TCP_F_DATA_LAST))) {
dev_err(queue->ctrl->ctrl.device,
"queue %d tag %#x SUCCESS set but not last PDU\n",
nvme_tcp_queue_id(queue), rq->tag);
nvme_tcp_error_recovery(&queue->ctrl->ctrl);
return -EPROTO;
}
return 0;
}
static int nvme_tcp_handle_r2t(struct nvme_tcp_queue *queue,
struct nvme_tcp_r2t_pdu *pdu)
{
struct nvme_tcp_request *req;
struct request *rq;
int ret;
rq = nvme_find_rq(nvme_tcp_tagset(queue), pdu->command_id);
if (!rq) {
dev_err(queue->ctrl->ctrl.device,
"got bad r2t.command_id %#x on queue %d\n",
pdu->command_id, nvme_tcp_queue_id(queue));
return -ENOENT;
}
+ trace_nvme_tcp_handle_r2t(rq, pdu, nvme_tcp_queue_id(queue), queue->recv_time);
req = blk_mq_rq_to_pdu(rq);
ret = nvme_tcp_setup_h2c_data_pdu(req, pdu);
if (unlikely(ret))
return ret;
req->state = NVME_TCP_SEND_H2C_PDU;
req->offset = 0;
nvme_tcp_queue_request(req, false, true);
return 0;
}
static int nvme_tcp_recv_skb(read_descriptor_t *desc, struct sk_buff *skb,
unsigned int offset, size_t len)
{
struct nvme_tcp_queue *queue = desc->arg.data;
size_t consumed = len;
int result;
+ queue->recv_time = skb->tstamp;
while (len) {
switch (nvme_tcp_recv_state(queue)) {
case NVME_TCP_RECV_PDU:
result = nvme_tcp_recv_pdu(queue, skb, &offset, &len);
break;
case NVME_TCP_RECV_DATA:
result = nvme_tcp_recv_data(queue, skb, &offset, &len);
break;
case NVME_TCP_RECV_DDGST:
result = nvme_tcp_recv_ddgst(queue, skb, &offset, &len);
break;
default:
result = -EFAULT;
}
if (result) {
dev_err(queue->ctrl->ctrl.device,
"receive failed: %d\n", result);
queue->rd_enabled = false;
nvme_tcp_error_recovery(&queue->ctrl->ctrl);
return result;
}
}
return consumed;
}
static inline void nvme_tcp_done_send_req(struct nvme_tcp_queue *queue)
{
+ trace_nvme_tcp_done_send_req(blk_mq_rq_from_pdu(queue->request), nvme_tcp_queue_id(queue));
queue->request = NULL;
}
static int nvme_tcp_try_send_data(struct nvme_tcp_request *req)
{
struct nvme_tcp_queue *queue = req->queue;
int req_data_len = req->data_len;
while (true) {
struct page *page = nvme_tcp_req_cur_page(req);
size_t offset = nvme_tcp_req_cur_offset(req);
size_t len = nvme_tcp_req_cur_length(req);
bool last = nvme_tcp_pdu_last_send(req, len);
int req_data_sent = req->data_sent;
int ret, flags = MSG_DONTWAIT;
if (last && !queue->data_digest && !nvme_tcp_queue_more(queue))
flags |= MSG_EOR;
else
flags |= MSG_MORE | MSG_SENDPAGE_NOTLAST;
if (sendpage_ok(page)) {
ret = kernel_sendpage(queue->sock, page, offset, len,
flags);
} else {
ret = sock_no_sendpage(queue->sock, page, offset, len,
flags);
}
if (ret <= 0)
return ret;
+ trace_nvme_tcp_try_send_data(blk_mq_rq_from_pdu(req), req->pdu, nvme_tcp_queue_id(queue));
if (queue->data_digest)
nvme_tcp_ddgst_update(queue->snd_hash, page,
offset, ret);
/*
* update the request iterator except for the last payload send
* in the request where we don't want to modify it as we may
* compete with the RX path completing the request.
*/
if (req_data_sent + ret < req_data_len)
nvme_tcp_advance_req(req, ret);
/* fully successful last send in current PDU */
if (last && ret == len) {
if (queue->data_digest) {
nvme_tcp_ddgst_final(queue->snd_hash,
&req->ddgst);
req->state = NVME_TCP_SEND_DDGST;
req->offset = 0;
} else {
nvme_tcp_done_send_req(queue);
}
return 1;
}
}
return -EAGAIN;
}
static int nvme_tcp_try_send_cmd_pdu(struct nvme_tcp_request *req)
{
struct nvme_tcp_queue *queue = req->queue;
struct nvme_tcp_cmd_pdu *pdu = req->pdu;
bool inline_data = nvme_tcp_has_inline_data(req);
u8 hdgst = nvme_tcp_hdgst_len(queue);
int len = sizeof(*pdu) + hdgst - req->offset;
int flags = MSG_DONTWAIT;
int ret;
if (inline_data || nvme_tcp_queue_more(queue))
flags |= MSG_MORE | MSG_SENDPAGE_NOTLAST;
else
flags |= MSG_EOR;
if (queue->hdr_digest && !req->offset)
nvme_tcp_hdgst(queue->snd_hash, pdu, sizeof(*pdu));
+ trace_nvme_tcp_try_send_cmd_pdu(blk_mq_rq_from_pdu(req), queue->sock, nvme_tcp_queue_id(queue), 0);
ret = kernel_sendpage(queue->sock, virt_to_page(pdu),
offset_in_page(pdu) + req->offset, len, flags);
if (unlikely(ret <= 0))
return ret;
len -= ret;
if (!len) {
if (inline_data) {
req->state = NVME_TCP_SEND_DATA;
if (queue->data_digest)
crypto_ahash_init(queue->snd_hash);
} else {
nvme_tcp_done_send_req(queue);
}
return 1;
}
req->offset += ret;
return -EAGAIN;
}
static int nvme_tcp_try_send_data_pdu(struct nvme_tcp_request *req)
{
struct nvme_tcp_queue *queue = req->queue;
struct nvme_tcp_data_pdu *pdu = req->pdu;
u8 hdgst = nvme_tcp_hdgst_len(queue);
int len = sizeof(*pdu) - req->offset + hdgst;
int ret;
if (queue->hdr_digest && !req->offset)
nvme_tcp_hdgst(queue->snd_hash, pdu, sizeof(*pdu));
+ trace_nvme_tcp_try_send_data_pdu(blk_mq_rq_from_pdu(req), pdu, nvme_tcp_queue_id(queue));
ret = kernel_sendpage(queue->sock, virt_to_page(pdu),
offset_in_page(pdu) + req->offset, len,
MSG_DONTWAIT | MSG_MORE | MSG_SENDPAGE_NOTLAST);
if (unlikely(ret <= 0))
return ret;
len -= ret;
if (!len) {
req->state = NVME_TCP_SEND_DATA;
if (queue->data_digest)
crypto_ahash_init(queue->snd_hash);
return 1;
}
req->offset += ret;
return -EAGAIN;
}
static int nvme_tcp_try_send(struct nvme_tcp_queue *queue)
{
struct nvme_tcp_request *req;
int ret = 1;
if (!queue->request) {
queue->request = nvme_tcp_fetch_request(queue);
if (!queue->request)
return 0;
}
req = queue->request;
+ trace_nvme_tcp_try_send(blk_mq_rq_from_pdu(req), nvme_tcp_queue_id(queue));
if (req->state == NVME_TCP_SEND_CMD_PDU) {
ret = nvme_tcp_try_send_cmd_pdu(req);
if (ret <= 0)
goto done;
if (!nvme_tcp_has_inline_data(req))
return ret;
}
if (req->state == NVME_TCP_SEND_H2C_PDU) {
ret = nvme_tcp_try_send_data_pdu(req);
if (ret <= 0)
goto done;
}
if (req->state == NVME_TCP_SEND_DATA) {
ret = nvme_tcp_try_send_data(req);
if (ret <= 0)
goto done;
}
if (req->state == NVME_TCP_SEND_DDGST)
ret = nvme_tcp_try_send_ddgst(req);
done:
if (ret == -EAGAIN) {
ret = 0;
} else if (ret < 0) {
dev_err(queue->ctrl->ctrl.device,
"failed to send request %d\n", ret);
nvme_tcp_fail_request(queue->request);
nvme_tcp_done_send_req(queue);
}
return ret;
}
static blk_status_t nvme_tcp_queue_rq(struct blk_mq_hw_ctx *hctx,
const struct blk_mq_queue_data *bd)
{
struct nvme_ns *ns = hctx->queue->queuedata;
struct nvme_tcp_queue *queue = hctx->driver_data;
struct request *rq = bd->rq;
struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
bool queue_ready = test_bit(NVME_TCP_Q_LIVE, &queue->flags);
blk_status_t ret;
if (!nvme_check_ready(&queue->ctrl->ctrl, rq, queue_ready))
return nvme_fail_nonready_command(&queue->ctrl->ctrl, rq);
ret = nvme_tcp_setup_cmd_pdu(ns, rq);
if (unlikely(ret))
return ret;
blk_mq_start_request(rq);
+ trace_nvme_tcp_queue_rq(rq, req->pdu, nvme_tcp_queue_id(queue), &queue->req_list, &queue->send_list, &queue->send_mutex);
nvme_tcp_queue_request(req, true, bd->last);
return BLK_STS_OK;
}
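Once the patched nvme_tcp module is loaded, the new events should appear under /sys/kernel/debug/tracing/events/nvme_tcp/ (or /sys/kernel/tracing/events/nvme_tcp/), and each exported tracepoint can be hooked from the profiling module through its generated register_trace_nvme_tcp_*() helper.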
Modify drivers/nvme/target/tcp.c (target side)
Modify the drivers/nvme/target/tcp.c file as follows.
- Export the tracepoint symbols.
#include "nvmet.h"
+ #define CREATE_TRACE_POINTS
+ #include <trace/events/nvmet_tcp.h>
+ #include <linux/tracepoint.h>
+
+ EXPORT_TRACEPOINT_SYMBOL_GPL(nvmet_tcp_done_recv_pdu);
+ EXPORT_TRACEPOINT_SYMBOL_GPL(nvmet_tcp_exec_read_req);
+ EXPORT_TRACEPOINT_SYMBOL_GPL(nvmet_tcp_exec_write_req);
+ EXPORT_TRACEPOINT_SYMBOL_GPL(nvmet_tcp_queue_response);
+ EXPORT_TRACEPOINT_SYMBOL_GPL(nvmet_tcp_setup_c2h_data_pdu);
+ EXPORT_TRACEPOINT_SYMBOL_GPL(nvmet_tcp_setup_r2t_pdu);
+ EXPORT_TRACEPOINT_SYMBOL_GPL(nvmet_tcp_setup_response_pdu);
+ EXPORT_TRACEPOINT_SYMBOL_GPL(nvmet_tcp_try_send_data_pdu);
+ EXPORT_TRACEPOINT_SYMBOL_GPL(nvmet_tcp_try_send_r2t);
+ EXPORT_TRACEPOINT_SYMBOL_GPL(nvmet_tcp_try_send_response);
+ EXPORT_TRACEPOINT_SYMBOL_GPL(nvmet_tcp_try_send_data);
+ EXPORT_TRACEPOINT_SYMBOL_GPL(nvmet_tcp_handle_h2c_data_pdu);
+ EXPORT_TRACEPOINT_SYMBOL_GPL(nvmet_tcp_try_recv_data);
+
#define NVMET_TCP_DEF_INLINE_DATA_SIZE (4 * PAGE_SIZE)
#define NVMET_TCP_MAXH2CDATA 0x400000 /* 16M arbitrary limit */
- Add an attribute recv_time to struct nvmet_tcp_queue to record network packet receive time.
struct nvmet_tcp_queue {
struct socket *sock;
+ s64 recv_time;
struct nvmet_tcp_port *port;
struct work_struct io_work;
- Enable recording of receive timestamps on the socket.
mutex_lock(&nvmet_tcp_queue_mutex);
list_add_tail(&queue->queue_list, &nvmet_tcp_queue_list);
mutex_unlock(&nvmet_tcp_queue_mutex);
ret = nvmet_tcp_set_queue_sock(queue);
if (ret)
goto out_destroy_sq;
+ sock_enable_timestamps(queue->sock->sk);
return 0;
- Insert the tracepoints in the following functions.
static void nvmet_setup_c2h_data_pdu(struct nvmet_tcp_cmd *cmd)
{
struct nvme_tcp_data_pdu *pdu = cmd->data_pdu;
struct nvmet_tcp_queue *queue = cmd->queue;
u8 hdgst = nvmet_tcp_hdgst_len(cmd->queue);
u8 ddgst = nvmet_tcp_ddgst_len(cmd->queue);
+ trace_nvmet_tcp_setup_c2h_data_pdu(cmd->req.cqe, queue->idx);
cmd->offset = 0;
cmd->state = NVMET_TCP_SEND_DATA_PDU;
pdu->hdr.type = nvme_tcp_c2h_data;
pdu->hdr.flags = NVME_TCP_F_DATA_LAST | (queue->nvme_sq.sqhd_disabled ?
NVME_TCP_F_DATA_SUCCESS : 0);
pdu->hdr.hlen = sizeof(*pdu);
pdu->hdr.pdo = pdu->hdr.hlen + hdgst;
pdu->hdr.plen =
cpu_to_le32(pdu->hdr.hlen + hdgst +
cmd->req.transfer_len + ddgst);
pdu->command_id = cmd->req.cqe->command_id;
pdu->data_length = cpu_to_le32(cmd->req.transfer_len);
pdu->data_offset = cpu_to_le32(cmd->wbytes_done);
if (queue->data_digest) {
pdu->hdr.flags |= NVME_TCP_F_DDGST;
nvmet_tcp_send_ddgst(queue->snd_hash, cmd);
}
if (cmd->queue->hdr_digest) {
pdu->hdr.flags |= NVME_TCP_F_HDGST;
nvmet_tcp_hdgst(queue->snd_hash, pdu, sizeof(*pdu));
}
}
static void nvmet_setup_r2t_pdu(struct nvmet_tcp_cmd *cmd)
{
struct nvme_tcp_r2t_pdu *pdu = cmd->r2t_pdu;
struct nvmet_tcp_queue *queue = cmd->queue;
u8 hdgst = nvmet_tcp_hdgst_len(cmd->queue);
+ trace_nvmet_tcp_setup_r2t_pdu(cmd->req.cmd, queue->idx);
cmd->offset = 0;
cmd->state = NVMET_TCP_SEND_R2T;
pdu->hdr.type = nvme_tcp_r2t;
pdu->hdr.flags = 0;
pdu->hdr.hlen = sizeof(*pdu);
pdu->hdr.pdo = 0;
pdu->hdr.plen = cpu_to_le32(pdu->hdr.hlen + hdgst);
pdu->command_id = cmd->req.cmd->common.command_id;
pdu->ttag = nvmet_tcp_cmd_tag(cmd->queue, cmd);
pdu->r2t_length = cpu_to_le32(cmd->req.transfer_len - cmd->rbytes_done);
pdu->r2t_offset = cpu_to_le32(cmd->rbytes_done);
if (cmd->queue->hdr_digest) {
pdu->hdr.flags |= NVME_TCP_F_HDGST;
nvmet_tcp_hdgst(queue->snd_hash, pdu, sizeof(*pdu));
}
}
static void nvmet_setup_response_pdu(struct nvmet_tcp_cmd *cmd)
{
struct nvme_tcp_rsp_pdu *pdu = cmd->rsp_pdu;
struct nvmet_tcp_queue *queue = cmd->queue;
u8 hdgst = nvmet_tcp_hdgst_len(cmd->queue);
+ trace_nvmet_tcp_setup_response_pdu(cmd->req.cqe, queue->idx);
cmd->offset = 0;
cmd->state = NVMET_TCP_SEND_RESPONSE;
pdu->hdr.type = nvme_tcp_rsp;
pdu->hdr.flags = 0;
pdu->hdr.hlen = sizeof(*pdu);
pdu->hdr.pdo = 0;
pdu->hdr.plen = cpu_to_le32(pdu->hdr.hlen + hdgst);
if (cmd->queue->hdr_digest) {
pdu->hdr.flags |= NVME_TCP_F_HDGST;
nvmet_tcp_hdgst(queue->snd_hash, pdu, sizeof(*pdu));
}
}
static void nvmet_tcp_queue_response(struct nvmet_req *req)
{
struct nvmet_tcp_cmd *cmd =
container_of(req, struct nvmet_tcp_cmd, req);
struct nvmet_tcp_queue *queue = cmd->queue;
struct nvme_sgl_desc *sgl;
u32 len;
if (unlikely(cmd == queue->cmd)) {
sgl = &cmd->req.cmd->common.dptr.sgl;
len = le32_to_cpu(sgl->length);
/*
* Wait for inline data before processing the response.
* Avoid using helpers, this might happen before
* nvmet_req_init is completed.
*/
if (queue->rcv_state == NVMET_TCP_RECV_PDU &&
len && len <= cmd->req.port->inline_data_size &&
nvme_is_write(cmd->req.cmd))
return;
}
+ trace_nvmet_tcp_queue_response(cmd->req.cmd, queue->idx);
llist_add(&cmd->lentry, &queue->resp_list);
queue_work_on(queue_cpu(queue), nvmet_tcp_wq, &cmd->queue->io_work);
}
static void nvmet_tcp_execute_request(struct nvmet_tcp_cmd *cmd)
{
if (unlikely(cmd->flags & NVMET_TCP_F_INIT_FAILED))
nvmet_tcp_queue_response(&cmd->req);
- else
+ else {
+ int size = nvme_is_write(cmd->req.cmd) ?
+ cmd->req.transfer_len : 0;
+ trace_nvmet_tcp_exec_write_req(cmd->req.cmd, cmd->queue->idx, size);
cmd->req.execute(&cmd->req);
+ }
}
static int nvmet_try_send_data_pdu(struct nvmet_tcp_cmd *cmd)
{
u8 hdgst = nvmet_tcp_hdgst_len(cmd->queue);
int left = sizeof(*cmd->data_pdu) - cmd->offset + hdgst;
int ret;
+ trace_nvmet_tcp_try_send_data_pdu(cmd->req.cqe, cmd->data_pdu, cmd->queue->idx, left);
ret = kernel_sendpage(cmd->queue->sock, virt_to_page(cmd->data_pdu),
offset_in_page(cmd->data_pdu) + cmd->offset,
left, MSG_DONTWAIT | MSG_MORE | MSG_SENDPAGE_NOTLAST);
if (ret <= 0)
return ret;
cmd->offset += ret;
left -= ret;
if (left)
return -EAGAIN;
cmd->state = NVMET_TCP_SEND_DATA;
cmd->offset = 0;
return 1;
}
static int nvmet_try_send_data(struct nvmet_tcp_cmd *cmd, bool last_in_batch)
{
struct nvmet_tcp_queue *queue = cmd->queue;
int ret;
while (cmd->cur_sg) {
struct page *page = sg_page(cmd->cur_sg);
u32 left = cmd->cur_sg->length - cmd->offset;
int flags = MSG_DONTWAIT;
if ((!last_in_batch && cmd->queue->send_list_len) ||
cmd->wbytes_done + left < cmd->req.transfer_len ||
queue->data_digest || !queue->nvme_sq.sqhd_disabled)
flags |= MSG_MORE | MSG_SENDPAGE_NOTLAST;
ret = kernel_sendpage(cmd->queue->sock, page, cmd->offset,
left, flags);
if (ret <= 0)
return ret;
+ trace_nvmet_tcp_try_send_data(cmd->req.cqe, cmd->queue->idx, ret);
cmd->offset += ret;
cmd->wbytes_done += ret;
/* Done with sg?*/
if (cmd->offset == cmd->cur_sg->length) {
cmd->cur_sg = sg_next(cmd->cur_sg);
cmd->offset = 0;
}
}
if (queue->data_digest) {
cmd->state = NVMET_TCP_SEND_DDGST;
cmd->offset = 0;
} else {
if (queue->nvme_sq.sqhd_disabled) {
cmd->queue->snd_cmd = NULL;
nvmet_tcp_put_cmd(cmd);
} else {
nvmet_setup_response_pdu(cmd);
}
}
if (queue->nvme_sq.sqhd_disabled) {
kfree(cmd->iov);
sgl_free(cmd->req.sg);
}
return 1;
}
static int nvmet_try_send_response(struct nvmet_tcp_cmd *cmd,
bool last_in_batch)
{
u8 hdgst = nvmet_tcp_hdgst_len(cmd->queue);
int left = sizeof(*cmd->rsp_pdu) - cmd->offset + hdgst;
int flags = MSG_DONTWAIT;
int ret;
if (!last_in_batch && cmd->queue->send_list_len)
flags |= MSG_MORE | MSG_SENDPAGE_NOTLAST;
else
flags |= MSG_EOR;
+ trace_nvmet_tcp_try_send_response(cmd->req.cqe, cmd->rsp_pdu, cmd->queue->idx, left);
ret = kernel_sendpage(cmd->queue->sock, virt_to_page(cmd->rsp_pdu),
offset_in_page(cmd->rsp_pdu) + cmd->offset, left, flags);
if (ret <= 0)
return ret;
cmd->offset += ret;
left -= ret;
if (left)
return -EAGAIN;
kfree(cmd->iov);
sgl_free(cmd->req.sg);
cmd->queue->snd_cmd = NULL;
nvmet_tcp_put_cmd(cmd);
return 1;
}
static int nvmet_try_send_r2t(struct nvmet_tcp_cmd *cmd, bool last_in_batch)
{
u8 hdgst = nvmet_tcp_hdgst_len(cmd->queue);
int left = sizeof(*cmd->r2t_pdu) - cmd->offset + hdgst;
int flags = MSG_DONTWAIT;
int ret;
if (!last_in_batch && cmd->queue->send_list_len)
flags |= MSG_MORE | MSG_SENDPAGE_NOTLAST;
else
flags |= MSG_EOR;
+ trace_nvmet_tcp_try_send_r2t(cmd->req.cmd, cmd->r2t_pdu, cmd->queue->idx, left);
ret = kernel_sendpage(cmd->queue->sock, virt_to_page(cmd->r2t_pdu),
offset_in_page(cmd->r2t_pdu) + cmd->offset, left, flags);
if (ret <= 0)
return ret;
cmd->offset += ret;
left -= ret;
if (left)
return -EAGAIN;
cmd->queue->snd_cmd = NULL;
return 1;
}
static int nvmet_tcp_handle_h2c_data_pdu(struct nvmet_tcp_queue *queue)
{
struct nvme_tcp_data_pdu *data = &queue->pdu.data;
struct nvmet_tcp_cmd *cmd;
unsigned int exp_data_len;
if (likely(queue->nr_cmds)) {
if (unlikely(data->ttag >= queue->nr_cmds)) {
pr_err("queue %d: received out of bound ttag %u, nr_cmds %u\n",
queue->idx, data->ttag, queue->nr_cmds);
nvmet_tcp_fatal_error(queue);
return -EPROTO;
}
cmd = &queue->cmds[data->ttag];
} else {
cmd = &queue->connect;
}
if (le32_to_cpu(data->data_offset) != cmd->rbytes_done) {
pr_err("ttag %u unexpected data offset %u (expected %u)\n",
data->ttag, le32_to_cpu(data->data_offset),
cmd->rbytes_done);
/* FIXME: use path and transport errors */
nvmet_tcp_fatal_error(queue);
return -EPROTO;
}
exp_data_len = le32_to_cpu(data->hdr.plen) -
nvmet_tcp_hdgst_len(queue) -
nvmet_tcp_ddgst_len(queue) -
sizeof(*data);
+ trace_nvmet_tcp_handle_h2c_data_pdu(data, cmd->req.cmd, queue->idx, exp_data_len, queue->recv_time);
cmd->pdu_len = le32_to_cpu(data->data_length);
if (unlikely(cmd->pdu_len != exp_data_len ||
cmd->pdu_len == 0 ||
cmd->pdu_len > NVMET_TCP_MAXH2CDATA)) {
pr_err("H2CData PDU len %u is invalid\n", cmd->pdu_len);
/* FIXME: use proper transport errors */
nvmet_tcp_fatal_error(queue);
return -EPROTO;
}
cmd->pdu_recv = 0;
nvmet_tcp_map_pdu_iovec(cmd);
queue->cmd = cmd;
queue->rcv_state = NVMET_TCP_RECV_DATA;
return 0;
}
static int nvmet_tcp_done_recv_pdu(struct nvmet_tcp_queue *queue)
{
struct nvme_tcp_hdr *hdr = &queue->pdu.cmd.hdr;
struct nvme_command *nvme_cmd = &queue->pdu.cmd.cmd;
struct nvmet_req *req;
int ret;
if (unlikely(queue->state == NVMET_TCP_Q_CONNECTING)) {
if (hdr->type != nvme_tcp_icreq) {
pr_err("unexpected pdu type (%d) before icreq\n",
hdr->type);
nvmet_tcp_fatal_error(queue);
return -EPROTO;
}
return nvmet_tcp_handle_icreq(queue);
}
if (hdr->type == nvme_tcp_h2c_data) {
ret = nvmet_tcp_handle_h2c_data_pdu(queue);
if (unlikely(ret))
return ret;
return 0;
}
queue->cmd = nvmet_tcp_get_cmd(queue);
if (unlikely(!queue->cmd)) {
/* This should never happen */
pr_err("queue %d: out of commands (%d) send_list_len: %d, opcode: %d",
queue->idx, queue->nr_cmds, queue->send_list_len,
nvme_cmd->common.opcode);
nvmet_tcp_fatal_error(queue);
return -ENOMEM;
}
req = &queue->cmd->req;
memcpy(req->cmd, nvme_cmd, sizeof(*nvme_cmd));
+ trace_nvmet_tcp_done_recv_pdu(&queue->pdu.cmd, queue->idx, queue->recv_time);
if (unlikely(!nvmet_req_init(req, &queue->nvme_cq,
&queue->nvme_sq, &nvmet_tcp_ops))) {
pr_err("failed cmd %p id %d opcode %d, data_len: %d\n",
req->cmd, req->cmd->common.command_id,
req->cmd->common.opcode,
le32_to_cpu(req->cmd->common.dptr.sgl.length));
nvmet_tcp_handle_req_failure(queue, queue->cmd, req);
return 0;
}
ret = nvmet_tcp_map_data(queue->cmd);
if (unlikely(ret)) {
pr_err("queue %d: failed to map data\n", queue->idx);
if (nvmet_tcp_has_inline_data(queue->cmd))
nvmet_tcp_fatal_error(queue);
else
nvmet_req_complete(req, ret);
ret = -EAGAIN;
goto out;
}
if (nvmet_tcp_need_data_in(queue->cmd)) {
if (nvmet_tcp_has_inline_data(queue->cmd)) {
queue->rcv_state = NVMET_TCP_RECV_DATA;
nvmet_tcp_map_pdu_iovec(queue->cmd);
return 0;
}
/* send back R2T */
nvmet_tcp_queue_response(&queue->cmd->req);
goto out;
}
+ trace_nvmet_tcp_exec_read_req(req->cmd, queue->idx);
queue->cmd->req.execute(&queue->cmd->req);
out:
nvmet_prepare_receive_pdu(queue);
return ret;
}
static int nvmet_tcp_try_recv_pdu(struct nvmet_tcp_queue *queue)
{
struct nvme_tcp_hdr *hdr = &queue->pdu.cmd.hdr;
int len;
struct kvec iov;
struct msghdr msg = { .msg_flags = MSG_DONTWAIT };
+ char ckbuf[CMSG_SPACE(sizeof(struct __kernel_old_timespec))];
+ struct cmsghdr *cmsg;
+ queue->recv_time = 0;
recv:
+ msg.msg_control = ckbuf;
+ msg.msg_controllen = sizeof(ckbuf);
+ cmsg = CMSG_FIRSTHDR(&msg);
iov.iov_base = (void *)&queue->pdu + queue->offset;
iov.iov_len = queue->left;
len = kernel_recvmsg(queue->sock, &msg, &iov, 1,
iov.iov_len, msg.msg_flags);
if (unlikely(len < 0))
return len;
+ if (cmsg && queue->recv_time == 0) {
+ struct __kernel_old_timespec *ts = (struct __kernel_old_timespec *)CMSG_DATA(cmsg);
+ queue->recv_time = ((s64)ts->tv_sec * NSEC_PER_SEC) + ts->tv_nsec;
+ }
queue->offset += len;
queue->left -= len;
if (queue->left)
return -EAGAIN;
if (queue->offset == sizeof(struct nvme_tcp_hdr)) {
u8 hdgst = nvmet_tcp_hdgst_len(queue);
if (unlikely(!nvmet_tcp_pdu_valid(hdr->type))) {
pr_err("unexpected pdu type %d\n", hdr->type);
nvmet_tcp_fatal_error(queue);
return -EIO;
}
if (unlikely(hdr->hlen != nvmet_tcp_pdu_size(hdr->type))) {
pr_err("pdu %d bad hlen %d\n", hdr->type, hdr->hlen);
return -EIO;
}
queue->left = hdr->hlen - queue->offset + hdgst;
goto recv;
}
if (queue->hdr_digest &&
nvmet_tcp_verify_hdgst(queue, &queue->pdu, hdr->hlen)) {
nvmet_tcp_fatal_error(queue); /* fatal */
return -EPROTO;
}
if (queue->data_digest &&
nvmet_tcp_check_ddgst(queue, &queue->pdu)) {
nvmet_tcp_fatal_error(queue); /* fatal */
return -EPROTO;
}
return nvmet_tcp_done_recv_pdu(queue);
}
static int nvmet_tcp_try_recv_data(struct nvmet_tcp_queue *queue)
{
struct nvmet_tcp_cmd *cmd = queue->cmd;
int ret;
while (msg_data_left(&cmd->recv_msg)) {
+ char ckbuf[CMSG_SPACE(sizeof(struct __kernel_old_timespec))];
+ struct cmsghdr *cmsg;
+ cmd->recv_msg.msg_control = ckbuf;
+ cmd->recv_msg.msg_controllen = sizeof(ckbuf);
+ cmsg = CMSG_FIRSTHDR(&cmd->recv_msg);
ret = sock_recvmsg(cmd->queue->sock, &cmd->recv_msg,
cmd->recv_msg.msg_flags);
if (ret <= 0)
return ret;
+ if (cmsg && CMSG_DATA(cmsg)) {
+ struct __kernel_old_timespec *ts = (struct __kernel_old_timespec *)CMSG_DATA(cmsg);
+ s64 recv_time = ((s64)ts->tv_sec * NSEC_PER_SEC) + ts->tv_nsec;
+ trace_nvmet_tcp_try_recv_data(cmd->req.cmd, cmd->queue->idx, ret, recv_time);
+ }
cmd->pdu_recv += ret;
cmd->rbytes_done += ret;
}
nvmet_tcp_unmap_pdu_iovec(cmd);
if (queue->data_digest) {
nvmet_tcp_prep_recv_ddgst(cmd);
return 0;
}
if (cmd->rbytes_done == cmd->req.transfer_len)
nvmet_tcp_execute_request(cmd);
nvmet_prepare_receive_pdu(queue);
return 0;
}
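As on the initiator side, the nvmet_tcp events should appear under events/nvmet_tcp/ in tracefs once the patched nvmet_tcp module is loaded.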
Modify include/linux/nvme-tcp.h (both sides)
Modify the include/linux/nvme-tcp.h file as follows.
- Define a struct to store profiling data from the target side.
struct nvme_tcp_hdr {
__u8 type;
__u8 flags;
__u8 hlen;
__u8 pdo;
__le32 plen;
};
+ #define MAX_NVME_TCP_EVENTS 10
+ struct ntprof_stat{
+ __u64 id;
+ __u8 tag;
+ __u8 cnt;
+ __u8 event[MAX_NVME_TCP_EVENTS];
+ __u64 ts[MAX_NVME_TCP_EVENTS];
+ };
- Add the ntprof_stat struct at the end of each type of PDU.
struct nvme_tcp_icreq_pdu {
struct nvme_tcp_hdr hdr;
__le16 pfv;
__u8 hpda;
__u8 digest;
__le32 maxr2t;
__u8 rsvd2[112];
+ struct ntprof_stat stat;
};
struct nvme_tcp_icresp_pdu {
struct nvme_tcp_hdr hdr;
__le16 pfv;
__u8 cpda;
__u8 digest;
__le32 maxdata;
__u8 rsvd[112];
+ struct ntprof_stat stat;
};
struct nvme_tcp_term_pdu {
struct nvme_tcp_hdr hdr;
__le16 fes;
__le16 feil;
__le16 feiu;
__u8 rsvd[10];
+ struct ntprof_stat stat;
};
struct nvme_tcp_cmd_pdu {
struct nvme_tcp_hdr hdr;
struct nvme_command cmd;
+ struct ntprof_stat stat;
};
struct nvme_tcp_rsp_pdu {
struct nvme_tcp_hdr hdr;
struct nvme_completion cqe;
+ struct ntprof_stat stat;
};
struct nvme_tcp_r2t_pdu {
struct nvme_tcp_hdr hdr;
__u16 command_id;
__u16 ttag;
__le32 r2t_offset;
__le32 r2t_length;
__u8 rsvd[4];
+ struct ntprof_stat stat;
};
struct nvme_tcp_data_pdu {
struct nvme_tcp_hdr hdr;
__u16 command_id;
__u16 ttag;
__le32 data_offset;
__le32 data_length;
__u8 rsvd[4];
+ struct ntprof_stat stat;
};
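The stat field gives every PDU type room to carry profiling samples end-to-end. A minimal sketch of a helper that appends a timestamped event to it is shown below; the helper name and the event encoding are hypothetical and not part of the patches above.

```c
#include <linux/ktime.h>
#include <linux/nvme-tcp.h>	/* struct ntprof_stat, MAX_NVME_TCP_EVENTS (added above) */

/* Hypothetical helper -- append one event code plus a nanosecond timestamp. */
static inline void ntprof_stat_add_event(struct ntprof_stat *stat, u8 event)
{
	if (stat->cnt >= MAX_NVME_TCP_EVENTS)
		return;			/* drop events once the buffer is full */
	stat->event[stat->cnt] = event;
	stat->ts[stat->cnt] = ktime_get_real_ns();
	stat->cnt++;
}
```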