/*
 * QEMU NVM Express Controller
 *
 * Copyright (c) 2012, Intel Corporation
 *
 * Written by Keith Busch <keith.busch@intel.com>
 *
 * This code is licensed under the GNU GPL v2 or later.
 */

/**
 * Reference Specs: http://www.nvmexpress.org, 1.1, 1.0e
 *
 *  http://www.nvmexpress.org/resources/
 */

/**
 * Usage: add options:
 *      -drive file=<file>,if=none,id=<drive_id>
 *      -device nvme,drive=<drive_id>,serial=<serial>,id=<id[optional]>
 */
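/*
 * For example (an illustrative invocation; the image path, drive id and
 * serial below are placeholders, not taken from this file):
 *      qemu-system-x86_64 ... \
 *          -drive file=nvme.img,if=none,id=nvme0 \
 *          -device nvme,drive=nvme0,serial=deadbeef
 */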

#include <hw/block/block.h>
#include <hw/hw.h>
#include <hw/pci/msix.h>
#include <hw/pci/pci.h>
#include "sysemu/sysemu.h"
#include "qapi/visitor.h"
#include "sysemu/block-backend.h"

#include "nvme.h"

static void nvme_process_sq(void *opaque);

static int nvme_check_sqid(NvmeCtrl *n, uint16_t sqid)
{
    return sqid < n->num_queues && n->sq[sqid] != NULL ? 0 : -1;
}

static int nvme_check_cqid(NvmeCtrl *n, uint16_t cqid)
{
    return cqid < n->num_queues && n->cq[cqid] != NULL ? 0 : -1;
}

static void nvme_inc_cq_tail(NvmeCQueue *cq)
{
    cq->tail++;
    if (cq->tail >= cq->size) {
        cq->tail = 0;
        cq->phase = !cq->phase;
    }
}

static void nvme_inc_sq_head(NvmeSQueue *sq)
{
    sq->head = (sq->head + 1) % sq->size;
}

static uint8_t nvme_cq_full(NvmeCQueue *cq)
{
    return (cq->tail + 1) % cq->size == cq->head;
}

static uint8_t nvme_sq_empty(NvmeSQueue *sq)
{
    return sq->head == sq->tail;
}

static void nvme_isr_notify(NvmeCtrl *n, NvmeCQueue *cq)
{
    if (cq->irq_enabled) {
        if (msix_enabled(&(n->parent_obj))) {
            msix_notify(&(n->parent_obj), cq->vector);
        } else {
            pci_irq_pulse(&n->parent_obj);
        }
    }
}

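/*
 * Build a scatter/gather list from an NVMe PRP pair: PRP1 covers the first
 * (possibly unaligned) page of the transfer; if data remains, PRP2 is
 * either the address of the second page or, when the remainder spans more
 * than one page, the address of a PRP list whose last entry may chain to a
 * further list.
 */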
static uint16_t nvme_map_prp(QEMUSGList *qsg, uint64_t prp1, uint64_t prp2,
    uint32_t len, NvmeCtrl *n)
{
    hwaddr trans_len = n->page_size - (prp1 % n->page_size);
    trans_len = MIN(len, trans_len);
    int num_prps = (len >> n->page_bits) + 1;

    if (!prp1) {
        return NVME_INVALID_FIELD | NVME_DNR;
    }

    pci_dma_sglist_init(qsg, &n->parent_obj, num_prps);
    qemu_sglist_add(qsg, prp1, trans_len);
    len -= trans_len;
    if (len) {
        if (!prp2) {
            goto unmap;
        }
        if (len > n->page_size) {
            uint64_t prp_list[n->max_prp_ents];
            uint32_t nents, prp_trans;
            int i = 0;

            nents = (len + n->page_size - 1) >> n->page_bits;
            prp_trans = MIN(n->max_prp_ents, nents) * sizeof(uint64_t);
            pci_dma_read(&n->parent_obj, prp2, (void *)prp_list, prp_trans);
            while (len != 0) {
                uint64_t prp_ent = le64_to_cpu(prp_list[i]);

                if (i == n->max_prp_ents - 1 && len > n->page_size) {
                    if (!prp_ent || prp_ent & (n->page_size - 1)) {
                        goto unmap;
                    }

                    i = 0;
                    nents = (len + n->page_size - 1) >> n->page_bits;
                    prp_trans = MIN(n->max_prp_ents, nents) * sizeof(uint64_t);
                    pci_dma_read(&n->parent_obj, prp_ent, (void *)prp_list,
                        prp_trans);
                    prp_ent = le64_to_cpu(prp_list[i]);
                }

                if (!prp_ent || prp_ent & (n->page_size - 1)) {
                    goto unmap;
                }

                trans_len = MIN(len, n->page_size);
                qemu_sglist_add(qsg, prp_ent, trans_len);
                len -= trans_len;
                i++;
            }
        } else {
            if (prp2 & (n->page_size - 1)) {
                goto unmap;
            }
            qemu_sglist_add(qsg, prp2, len);
        }
    }
    return NVME_SUCCESS;

 unmap:
    qemu_sglist_destroy(qsg);
    return NVME_INVALID_FIELD | NVME_DNR;
}

static uint16_t nvme_dma_read_prp(NvmeCtrl *n, uint8_t *ptr, uint32_t len,
    uint64_t prp1, uint64_t prp2)
{
    QEMUSGList qsg;

    if (nvme_map_prp(&qsg, prp1, prp2, len, n)) {
        return NVME_INVALID_FIELD | NVME_DNR;
    }
    if (dma_buf_read(ptr, len, &qsg)) {
        qemu_sglist_destroy(&qsg);
        return NVME_INVALID_FIELD | NVME_DNR;
    }
    qemu_sglist_destroy(&qsg);
    return NVME_SUCCESS;
}

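/*
 * Drain completed requests into the completion queue: each CQ entry is
 * written to guest memory at the current tail with the queue's phase bit
 * folded into the status field, and the request is recycled onto its
 * submission queue's free list before the interrupt (if enabled) is raised.
 */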
static void nvme_post_cqes(void *opaque)
{
    NvmeCQueue *cq = opaque;
    NvmeCtrl *n = cq->ctrl;
    NvmeRequest *req, *next;

    QTAILQ_FOREACH_SAFE(req, &cq->req_list, entry, next) {
        NvmeSQueue *sq;
        hwaddr addr;

        if (nvme_cq_full(cq)) {
            break;
        }

        QTAILQ_REMOVE(&cq->req_list, req, entry);
        sq = req->sq;
        req->cqe.status = cpu_to_le16((req->status << 1) | cq->phase);
        req->cqe.sq_id = cpu_to_le16(sq->sqid);
        req->cqe.sq_head = cpu_to_le16(sq->head);
        addr = cq->dma_addr + cq->tail * n->cqe_size;
        nvme_inc_cq_tail(cq);
        pci_dma_write(&n->parent_obj, addr, (void *)&req->cqe,
            sizeof(req->cqe));
        QTAILQ_INSERT_TAIL(&sq->req_list, req, entry);
    }
    nvme_isr_notify(n, cq);
}

static void nvme_enqueue_req_completion(NvmeCQueue *cq, NvmeRequest *req)
{
    assert(cq->cqid == req->sq->cqid);
    QTAILQ_REMOVE(&req->sq->out_req_list, req, entry);
    QTAILQ_INSERT_TAIL(&cq->req_list, req, entry);
    timer_mod(cq->timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + 500);
}

static void nvme_rw_cb(void *opaque, int ret)
{
    NvmeRequest *req = opaque;
    NvmeSQueue *sq = req->sq;
    NvmeCtrl *n = sq->ctrl;
    NvmeCQueue *cq = n->cq[sq->cqid];

    block_acct_done(blk_get_stats(n->conf.blk), &req->acct);
    if (!ret) {
        req->status = NVME_SUCCESS;
    } else {
        req->status = NVME_INTERNAL_DEV_ERROR;
    }
    if (req->has_sg) {
        qemu_sglist_destroy(&req->qsg);
    }
    nvme_enqueue_req_completion(cq, req);
}

static uint16_t nvme_flush(NvmeCtrl *n, NvmeNamespace *ns, NvmeCmd *cmd,
    NvmeRequest *req)
{
    req->has_sg = false;
    block_acct_start(blk_get_stats(n->conf.blk), &req->acct, 0,
         BLOCK_ACCT_FLUSH);
    req->aiocb = blk_aio_flush(n->conf.blk, nvme_rw_cb, req);

    return NVME_NO_COMPLETE;
}

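/*
 * Read/write path: nlb is a zero-based block count, so one is added; the
 * byte length is nlb << ds for the active LBA format, and the starting LBA
 * is rescaled from the namespace block size to QEMU's 512-byte
 * BDRV_SECTOR_SIZE units before the DMA helpers are invoked.
 */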
static uint16_t nvme_rw(NvmeCtrl *n, NvmeNamespace *ns, NvmeCmd *cmd,
    NvmeRequest *req)
{
    NvmeRwCmd *rw = (NvmeRwCmd *)cmd;
    uint32_t nlb  = le32_to_cpu(rw->nlb) + 1;
    uint64_t slba = le64_to_cpu(rw->slba);
    uint64_t prp1 = le64_to_cpu(rw->prp1);
    uint64_t prp2 = le64_to_cpu(rw->prp2);

    uint8_t lba_index  = NVME_ID_NS_FLBAS_INDEX(ns->id_ns.flbas);
    uint8_t data_shift = ns->id_ns.lbaf[lba_index].ds;
    uint64_t data_size = (uint64_t)nlb << data_shift;
    uint64_t aio_slba  = slba << (data_shift - BDRV_SECTOR_BITS);
    int is_write = rw->opcode == NVME_CMD_WRITE ? 1 : 0;

    if ((slba + nlb) > ns->id_ns.nsze) {
        return NVME_LBA_RANGE | NVME_DNR;
    }
    if (nvme_map_prp(&req->qsg, prp1, prp2, data_size, n)) {
        return NVME_INVALID_FIELD | NVME_DNR;
    }
    assert((nlb << data_shift) == req->qsg.size);

    req->has_sg = true;
    dma_acct_start(n->conf.blk, &req->acct, &req->qsg,
                   is_write ? BLOCK_ACCT_WRITE : BLOCK_ACCT_READ);
    req->aiocb = is_write ?
        dma_blk_write(n->conf.blk, &req->qsg, aio_slba, nvme_rw_cb, req) :
        dma_blk_read(n->conf.blk, &req->qsg, aio_slba, nvme_rw_cb, req);

    return NVME_NO_COMPLETE;
}

static uint16_t nvme_io_cmd(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req)
{
    NvmeNamespace *ns;
    uint32_t nsid = le32_to_cpu(cmd->nsid);

    if (nsid == 0 || nsid > n->num_namespaces) {
        return NVME_INVALID_NSID | NVME_DNR;
    }

    ns = &n->namespaces[nsid - 1];
    switch (cmd->opcode) {
    case NVME_CMD_FLUSH:
        return nvme_flush(n, ns, cmd, req);
    case NVME_CMD_WRITE:
    case NVME_CMD_READ:
        return nvme_rw(n, ns, cmd, req);
    default:
        return NVME_INVALID_OPCODE | NVME_DNR;
    }
}

static void nvme_free_sq(NvmeSQueue *sq, NvmeCtrl *n)
{
    n->sq[sq->sqid] = NULL;
    timer_del(sq->timer);
    timer_free(sq->timer);
    g_free(sq->io_req);
    if (sq->sqid) {
        g_free(sq);
    }
}

static uint16_t nvme_del_sq(NvmeCtrl *n, NvmeCmd *cmd)
{
    NvmeDeleteQ *c = (NvmeDeleteQ *)cmd;
    NvmeRequest *req, *next;
    NvmeSQueue *sq;
    NvmeCQueue *cq;
    uint16_t qid = le16_to_cpu(c->qid);

    if (!qid || nvme_check_sqid(n, qid)) {
        return NVME_INVALID_QID | NVME_DNR;
    }

    sq = n->sq[qid];
    while (!QTAILQ_EMPTY(&sq->out_req_list)) {
        req = QTAILQ_FIRST(&sq->out_req_list);
        assert(req->aiocb);
        blk_aio_cancel(req->aiocb);
    }
    if (!nvme_check_cqid(n, sq->cqid)) {
        cq = n->cq[sq->cqid];
        QTAILQ_REMOVE(&cq->sq_list, sq, entry);

        nvme_post_cqes(cq);
        QTAILQ_FOREACH_SAFE(req, &cq->req_list, entry, next) {
            if (req->sq == sq) {
                QTAILQ_REMOVE(&cq->req_list, req, entry);
                QTAILQ_INSERT_TAIL(&sq->req_list, req, entry);
            }
        }
    }

    nvme_free_sq(sq, n);
    return NVME_SUCCESS;
}

static void nvme_init_sq(NvmeSQueue *sq, NvmeCtrl *n, uint64_t dma_addr,
    uint16_t sqid, uint16_t cqid, uint16_t size)
{
    int i;
    NvmeCQueue *cq;

    sq->ctrl = n;
    sq->dma_addr = dma_addr;
    sq->sqid = sqid;
    sq->size = size;
    sq->cqid = cqid;
    sq->head = sq->tail = 0;
    sq->io_req = g_new(NvmeRequest, sq->size);

    QTAILQ_INIT(&sq->req_list);
    QTAILQ_INIT(&sq->out_req_list);
    for (i = 0; i < sq->size; i++) {
        sq->io_req[i].sq = sq;
        QTAILQ_INSERT_TAIL(&(sq->req_list), &sq->io_req[i], entry);
    }
    sq->timer = timer_new_ns(QEMU_CLOCK_VIRTUAL, nvme_process_sq, sq);

    assert(n->cq[cqid]);
    cq = n->cq[cqid];
    QTAILQ_INSERT_TAIL(&(cq->sq_list), sq, entry);
    n->sq[sqid] = sq;
}

static uint16_t nvme_create_sq(NvmeCtrl *n, NvmeCmd *cmd)
{
    NvmeSQueue *sq;
    NvmeCreateSq *c = (NvmeCreateSq *)cmd;

    uint16_t cqid = le16_to_cpu(c->cqid);
    uint16_t sqid = le16_to_cpu(c->sqid);
    uint16_t qsize = le16_to_cpu(c->qsize);
    uint16_t qflags = le16_to_cpu(c->sq_flags);
    uint64_t prp1 = le64_to_cpu(c->prp1);

    if (!cqid || nvme_check_cqid(n, cqid)) {
        return NVME_INVALID_CQID | NVME_DNR;
    }
    if (!sqid || (sqid && !nvme_check_sqid(n, sqid))) {
        return NVME_INVALID_QID | NVME_DNR;
    }
    if (!qsize || qsize > NVME_CAP_MQES(n->bar.cap)) {
        return NVME_MAX_QSIZE_EXCEEDED | NVME_DNR;
    }
    if (!prp1 || prp1 & (n->page_size - 1)) {
        return NVME_INVALID_FIELD | NVME_DNR;
    }
    if (!(NVME_SQ_FLAGS_PC(qflags))) {
        return NVME_INVALID_FIELD | NVME_DNR;
    }
    sq = g_malloc0(sizeof(*sq));
    nvme_init_sq(sq, n, prp1, sqid, cqid, qsize + 1);
    return NVME_SUCCESS;
}

static void nvme_free_cq(NvmeCQueue *cq, NvmeCtrl *n)
{
    n->cq[cq->cqid] = NULL;
    timer_del(cq->timer);
    timer_free(cq->timer);
    msix_vector_unuse(&n->parent_obj, cq->vector);
    if (cq->cqid) {
        g_free(cq);
    }
}

static uint16_t nvme_del_cq(NvmeCtrl *n, NvmeCmd *cmd)
{
    NvmeDeleteQ *c = (NvmeDeleteQ *)cmd;
    NvmeCQueue *cq;
    uint16_t qid = le16_to_cpu(c->qid);

    if (!qid || nvme_check_cqid(n, qid)) {
        return NVME_INVALID_CQID | NVME_DNR;
    }

    cq = n->cq[qid];
    if (!QTAILQ_EMPTY(&cq->sq_list)) {
        return NVME_INVALID_QUEUE_DEL;
    }
    nvme_free_cq(cq, n);
    return NVME_SUCCESS;
}

static void nvme_init_cq(NvmeCQueue *cq, NvmeCtrl *n, uint64_t dma_addr,
    uint16_t cqid, uint16_t vector, uint16_t size, uint16_t irq_enabled)
{
    cq->ctrl = n;
    cq->cqid = cqid;
    cq->size = size;
    cq->dma_addr = dma_addr;
    cq->phase = 1;
    cq->irq_enabled = irq_enabled;
    cq->vector = vector;
    cq->head = cq->tail = 0;
    QTAILQ_INIT(&cq->req_list);
    QTAILQ_INIT(&cq->sq_list);
    msix_vector_use(&n->parent_obj, cq->vector);
    n->cq[cqid] = cq;
    cq->timer = timer_new_ns(QEMU_CLOCK_VIRTUAL, nvme_post_cqes, cq);
}

static uint16_t nvme_create_cq(NvmeCtrl *n, NvmeCmd *cmd)
{
    NvmeCQueue *cq;
    NvmeCreateCq *c = (NvmeCreateCq *)cmd;
    uint16_t cqid = le16_to_cpu(c->cqid);
    uint16_t vector = le16_to_cpu(c->irq_vector);
    uint16_t qsize = le16_to_cpu(c->qsize);
    uint16_t qflags = le16_to_cpu(c->cq_flags);
    uint64_t prp1 = le64_to_cpu(c->prp1);

    if (!cqid || (cqid && !nvme_check_cqid(n, cqid))) {
        return NVME_INVALID_CQID | NVME_DNR;
    }
    if (!qsize || qsize > NVME_CAP_MQES(n->bar.cap)) {
        return NVME_MAX_QSIZE_EXCEEDED | NVME_DNR;
    }
    if (!prp1) {
        return NVME_INVALID_FIELD | NVME_DNR;
    }
    if (vector > n->num_queues) {
        return NVME_INVALID_IRQ_VECTOR | NVME_DNR;
    }
    if (!(NVME_CQ_FLAGS_PC(qflags))) {
        return NVME_INVALID_FIELD | NVME_DNR;
    }

    cq = g_malloc0(sizeof(*cq));
    nvme_init_cq(cq, n, prp1, cqid, vector, qsize + 1,
        NVME_CQ_FLAGS_IEN(qflags));
    return NVME_SUCCESS;
}

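/*
 * Identify: any non-zero CNS value is treated as an Identify Controller
 * request returning the controller data structure; CNS == 0 returns the
 * namespace data structure for the requested NSID.
 */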
static uint16_t nvme_identify(NvmeCtrl *n, NvmeCmd *cmd)
{
    NvmeNamespace *ns;
    NvmeIdentify *c = (NvmeIdentify *)cmd;
    uint32_t cns  = le32_to_cpu(c->cns);
    uint32_t nsid = le32_to_cpu(c->nsid);
    uint64_t prp1 = le64_to_cpu(c->prp1);
    uint64_t prp2 = le64_to_cpu(c->prp2);

    if (cns) {
        return nvme_dma_read_prp(n, (uint8_t *)&n->id_ctrl, sizeof(n->id_ctrl),
            prp1, prp2);
    }
    if (nsid == 0 || nsid > n->num_namespaces) {
        return NVME_INVALID_NSID | NVME_DNR;
    }

    ns = &n->namespaces[nsid - 1];
    return nvme_dma_read_prp(n, (uint8_t *)&ns->id_ns, sizeof(ns->id_ns),
        prp1, prp2);
}

static uint16_t nvme_get_feature(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req)
{
    uint32_t dw10 = le32_to_cpu(cmd->cdw10);
    uint32_t result;

    switch (dw10) {
    case NVME_VOLATILE_WRITE_CACHE:
        result = blk_enable_write_cache(n->conf.blk);
        break;
    case NVME_NUMBER_OF_QUEUES:
        result = cpu_to_le32((n->num_queues - 1) | ((n->num_queues - 1) << 16));
        break;
    default:
        return NVME_INVALID_FIELD | NVME_DNR;
    }

    req->cqe.result = result;
    return NVME_SUCCESS;
}

static uint16_t nvme_set_feature(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req)
{
    uint32_t dw10 = le32_to_cpu(cmd->cdw10);
    uint32_t dw11 = le32_to_cpu(cmd->cdw11);

    switch (dw10) {
    case NVME_VOLATILE_WRITE_CACHE:
        blk_set_enable_write_cache(n->conf.blk, dw11 & 1);
        break;
    case NVME_NUMBER_OF_QUEUES:
        req->cqe.result =
            cpu_to_le32((n->num_queues - 1) | ((n->num_queues - 1) << 16));
        break;
    default:
        return NVME_INVALID_FIELD | NVME_DNR;
    }
    return NVME_SUCCESS;
}

static uint16_t nvme_admin_cmd(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req)
{
    switch (cmd->opcode) {
    case NVME_ADM_CMD_DELETE_SQ:
        return nvme_del_sq(n, cmd);
    case NVME_ADM_CMD_CREATE_SQ:
        return nvme_create_sq(n, cmd);
    case NVME_ADM_CMD_DELETE_CQ:
        return nvme_del_cq(n, cmd);
    case NVME_ADM_CMD_CREATE_CQ:
        return nvme_create_cq(n, cmd);
    case NVME_ADM_CMD_IDENTIFY:
        return nvme_identify(n, cmd);
    case NVME_ADM_CMD_SET_FEATURES:
        return nvme_set_feature(n, cmd, req);
    case NVME_ADM_CMD_GET_FEATURES:
        return nvme_get_feature(n, cmd, req);
    default:
        return NVME_INVALID_OPCODE | NVME_DNR;
    }
}

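/*
 * Timer callback for a submission queue: fetch commands between head and
 * tail from guest memory, dispatch them to the admin or I/O handler
 * depending on the queue id, and queue an immediate completion for any
 * command that did not return NVME_NO_COMPLETE.
 */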
static void nvme_process_sq(void *opaque)
{
    NvmeSQueue *sq = opaque;
    NvmeCtrl *n = sq->ctrl;
    NvmeCQueue *cq = n->cq[sq->cqid];

    uint16_t status;
    hwaddr addr;
    NvmeCmd cmd;
    NvmeRequest *req;

    while (!(nvme_sq_empty(sq) || QTAILQ_EMPTY(&sq->req_list))) {
        addr = sq->dma_addr + sq->head * n->sqe_size;
        pci_dma_read(&n->parent_obj, addr, (void *)&cmd, sizeof(cmd));
        nvme_inc_sq_head(sq);

        req = QTAILQ_FIRST(&sq->req_list);
        QTAILQ_REMOVE(&sq->req_list, req, entry);
        QTAILQ_INSERT_TAIL(&sq->out_req_list, req, entry);
        memset(&req->cqe, 0, sizeof(req->cqe));
        req->cqe.cid = cmd.cid;

        status = sq->sqid ? nvme_io_cmd(n, &cmd, req) :
            nvme_admin_cmd(n, &cmd, req);
        if (status != NVME_NO_COMPLETE) {
            req->status = status;
            nvme_enqueue_req_completion(cq, req);
        }
    }
}

static void nvme_clear_ctrl(NvmeCtrl *n)
{
    int i;

    for (i = 0; i < n->num_queues; i++) {
        if (n->sq[i] != NULL) {
            nvme_free_sq(n->sq[i], n);
        }
    }
    for (i = 0; i < n->num_queues; i++) {
        if (n->cq[i] != NULL) {
            nvme_free_cq(n->cq[i], n);
        }
    }

    blk_flush(n->conf.blk);
    n->bar.cc = 0;
}

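/*
 * Controller enable: validate the admin queue addresses and the CC fields
 * (memory page size and queue entry sizes) against the advertised CAP and
 * Identify limits, then derive the page size, PRP-list capacity and queue
 * entry sizes and bring up the admin submission/completion queues.
 */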
static int nvme_start_ctrl(NvmeCtrl *n)
{
    uint32_t page_bits = NVME_CC_MPS(n->bar.cc) + 12;
    uint32_t page_size = 1 << page_bits;

    if (n->cq[0] || n->sq[0] || !n->bar.asq || !n->bar.acq ||
            n->bar.asq & (page_size - 1) || n->bar.acq & (page_size - 1) ||
            NVME_CC_MPS(n->bar.cc) < NVME_CAP_MPSMIN(n->bar.cap) ||
            NVME_CC_MPS(n->bar.cc) > NVME_CAP_MPSMAX(n->bar.cap) ||
            NVME_CC_IOCQES(n->bar.cc) < NVME_CTRL_CQES_MIN(n->id_ctrl.cqes) ||
            NVME_CC_IOCQES(n->bar.cc) > NVME_CTRL_CQES_MAX(n->id_ctrl.cqes) ||
            NVME_CC_IOSQES(n->bar.cc) < NVME_CTRL_SQES_MIN(n->id_ctrl.sqes) ||
            NVME_CC_IOSQES(n->bar.cc) > NVME_CTRL_SQES_MAX(n->id_ctrl.sqes) ||
            !NVME_AQA_ASQS(n->bar.aqa) || !NVME_AQA_ACQS(n->bar.aqa)) {
        return -1;
    }

    n->page_bits = page_bits;
    n->page_size = page_size;
    n->max_prp_ents = n->page_size / sizeof(uint64_t);
    n->cqe_size = 1 << NVME_CC_IOCQES(n->bar.cc);
    n->sqe_size = 1 << NVME_CC_IOSQES(n->bar.cc);
    nvme_init_cq(&n->admin_cq, n, n->bar.acq, 0, 0,
        NVME_AQA_ACQS(n->bar.aqa) + 1, 1);
    nvme_init_sq(&n->admin_sq, n, n->bar.asq, 0, 0,
        NVME_AQA_ASQS(n->bar.aqa) + 1);

    return 0;
}

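/*
 * MMIO writes to the control registers in BAR0: 0x0c/0x10 are the interrupt
 * mask set/clear registers, 0x14 is CC (controller configuration, including
 * enable and shutdown), 0x24 is AQA, and 0x28-0x37 hold the admin submission
 * and completion queue base addresses.
 */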
static void nvme_write_bar(NvmeCtrl *n, hwaddr offset, uint64_t data,
    unsigned size)
{
    switch (offset) {
    case 0xc:
        n->bar.intms |= data & 0xffffffff;
        n->bar.intmc = n->bar.intms;
        break;
    case 0x10:
        n->bar.intms &= ~(data & 0xffffffff);
        n->bar.intmc = n->bar.intms;
        break;
    case 0x14:
        /* Windows first sends data, then sends enable bit */
        if (!NVME_CC_EN(data) && !NVME_CC_EN(n->bar.cc) &&
            !NVME_CC_SHN(data) && !NVME_CC_SHN(n->bar.cc))
        {
            n->bar.cc = data;
        }

        if (NVME_CC_EN(data) && !NVME_CC_EN(n->bar.cc)) {
            n->bar.cc = data;
            if (nvme_start_ctrl(n)) {
                n->bar.csts = NVME_CSTS_FAILED;
            } else {
                n->bar.csts = NVME_CSTS_READY;
            }
        } else if (!NVME_CC_EN(data) && NVME_CC_EN(n->bar.cc)) {
            nvme_clear_ctrl(n);
            n->bar.csts &= ~NVME_CSTS_READY;
        }
        if (NVME_CC_SHN(data) && !(NVME_CC_SHN(n->bar.cc))) {
                nvme_clear_ctrl(n);
                n->bar.cc = data;
                n->bar.csts |= NVME_CSTS_SHST_COMPLETE;
        } else if (!NVME_CC_SHN(data) && NVME_CC_SHN(n->bar.cc)) {
                n->bar.csts &= ~NVME_CSTS_SHST_COMPLETE;
                n->bar.cc = data;
        }
        break;
    case 0x24:
        n->bar.aqa = data & 0xffffffff;
        break;
    case 0x28:
        n->bar.asq = data;
        break;
    case 0x2c:
        n->bar.asq |= data << 32;
        break;
    case 0x30:
        n->bar.acq = data;
        break;
    case 0x34:
        n->bar.acq |= data << 32;
        break;
    default:
        break;
    }
}

static uint64_t nvme_mmio_read(void *opaque, hwaddr addr, unsigned size)
{
    NvmeCtrl *n = (NvmeCtrl *)opaque;
    uint8_t *ptr = (uint8_t *)&n->bar;
    uint64_t val = 0;

    if (addr < sizeof(n->bar)) {
        memcpy(&val, ptr + addr, size);
    }
    return val;
}

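/*
 * Doorbell writes: the doorbell area starts at offset 0x1000 with a stride
 * of 4 bytes (CAP.DSTRD is left at zero), so each queue pair occupies 8
 * bytes, with the submission queue tail doorbell at the even slot and the
 * completion queue head doorbell at the odd slot. Ringing a CQ head
 * doorbell on a previously full queue restarts its submission queues.
 */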
static void nvme_process_db(NvmeCtrl *n, hwaddr addr, int val)
{
    uint32_t qid;

    if (addr & ((1 << 2) - 1)) {
        return;
    }

    if (((addr - 0x1000) >> 2) & 1) {
        uint16_t new_head = val & 0xffff;
        int start_sqs;
        NvmeCQueue *cq;

        qid = (addr - (0x1000 + (1 << 2))) >> 3;
        if (nvme_check_cqid(n, qid)) {
            return;
        }

        cq = n->cq[qid];
        if (new_head >= cq->size) {
            return;
        }

        start_sqs = nvme_cq_full(cq) ? 1 : 0;
        cq->head = new_head;
        if (start_sqs) {
            NvmeSQueue *sq;
            QTAILQ_FOREACH(sq, &cq->sq_list, entry) {
                timer_mod(sq->timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + 500);
            }
            timer_mod(cq->timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + 500);
        }

        if (cq->tail != cq->head) {
            nvme_isr_notify(n, cq);
        }
    } else {
        uint16_t new_tail = val & 0xffff;
        NvmeSQueue *sq;

        qid = (addr - 0x1000) >> 3;
        if (nvme_check_sqid(n, qid)) {
            return;
        }

        sq = n->sq[qid];
        if (new_tail >= sq->size) {
            return;
        }

        sq->tail = new_tail;
        timer_mod(sq->timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + 500);
    }
}

static void nvme_mmio_write(void *opaque, hwaddr addr, uint64_t data,
    unsigned size)
{
    NvmeCtrl *n = (NvmeCtrl *)opaque;
    if (addr < sizeof(n->bar)) {
        nvme_write_bar(n, addr, data, size);
    } else if (addr >= 0x1000) {
        nvme_process_db(n, addr, data);
    }
}

static const MemoryRegionOps nvme_mmio_ops = {
    .read = nvme_mmio_read,
    .write = nvme_mmio_write,
    .endianness = DEVICE_LITTLE_ENDIAN,
    .impl = {
        .min_access_size = 2,
        .max_access_size = 8,
    },
};

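/*
 * PCI init: the controller exposes a single namespace backed by the
 * attached block backend, 64 queue pairs (including the admin queue), an
 * MSI-X exclusive BAR, and a register BAR sized to the next power of two
 * above the register file plus one doorbell pair per queue.
 */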
static int nvme_init(PCIDevice *pci_dev)
{
    NvmeCtrl *n = NVME(pci_dev);
    NvmeIdCtrl *id = &n->id_ctrl;

    int i;
    int64_t bs_size;
    uint8_t *pci_conf;

    if (!n->conf.blk) {
        return -1;
    }

    bs_size = blk_getlength(n->conf.blk);
    if (bs_size < 0) {
        return -1;
    }

    blkconf_serial(&n->conf, &n->serial);
    if (!n->serial) {
        return -1;
    }
    blkconf_blocksizes(&n->conf);

    pci_conf = pci_dev->config;
    pci_conf[PCI_INTERRUPT_PIN] = 1;
    pci_config_set_prog_interface(pci_dev->config, 0x2);
    pci_config_set_class(pci_dev->config, PCI_CLASS_STORAGE_EXPRESS);
    pcie_endpoint_cap_init(&n->parent_obj, 0x80);

    n->num_namespaces = 1;
    n->num_queues = 64;
    n->reg_size = 1 << qemu_fls(0x1004 + 2 * (n->num_queues + 1) * 4);
    n->ns_size = bs_size / (uint64_t)n->num_namespaces;

    n->namespaces = g_new0(NvmeNamespace, n->num_namespaces);
    n->sq = g_new0(NvmeSQueue *, n->num_queues);
    n->cq = g_new0(NvmeCQueue *, n->num_queues);

    memory_region_init_io(&n->iomem, OBJECT(n), &nvme_mmio_ops, n,
                          "nvme", n->reg_size);
    pci_register_bar(&n->parent_obj, 0,
        PCI_BASE_ADDRESS_SPACE_MEMORY | PCI_BASE_ADDRESS_MEM_TYPE_64,
        &n->iomem);
    msix_init_exclusive_bar(&n->parent_obj, n->num_queues, 4);

    id->vid = cpu_to_le16(pci_get_word(pci_conf + PCI_VENDOR_ID));
    id->ssvid = cpu_to_le16(pci_get_word(pci_conf + PCI_SUBSYSTEM_VENDOR_ID));
    strpadcpy((char *)id->mn, sizeof(id->mn), "QEMU NVMe Ctrl", ' ');
    strpadcpy((char *)id->fr, sizeof(id->fr), "1.0", ' ');
    strpadcpy((char *)id->sn, sizeof(id->sn), n->serial, ' ');
    id->rab = 6;
    id->ieee[0] = 0x00;
    id->ieee[1] = 0x02;
    id->ieee[2] = 0xb3;
    id->oacs = cpu_to_le16(0);
    id->frmw = 7 << 1;
    id->lpa = 1 << 0;
    id->sqes = (0x6 << 4) | 0x6;
    id->cqes = (0x4 << 4) | 0x4;
    id->nn = cpu_to_le32(n->num_namespaces);
    id->psd[0].mp = cpu_to_le16(0x9c4);
    id->psd[0].enlat = cpu_to_le32(0x10);
    id->psd[0].exlat = cpu_to_le32(0x4);
    if (blk_enable_write_cache(n->conf.blk)) {
        id->vwc = 1;
    }

    n->bar.cap = 0;
    NVME_CAP_SET_MQES(n->bar.cap, 0x7ff);
    NVME_CAP_SET_CQR(n->bar.cap, 1);
    NVME_CAP_SET_AMS(n->bar.cap, 1);
    NVME_CAP_SET_TO(n->bar.cap, 0xf);
    NVME_CAP_SET_CSS(n->bar.cap, 1);
    NVME_CAP_SET_MPSMAX(n->bar.cap, 4);

    n->bar.vs = 0x00010100;
    n->bar.intmc = n->bar.intms = 0;

    for (i = 0; i < n->num_namespaces; i++) {
        NvmeNamespace *ns = &n->namespaces[i];
        NvmeIdNs *id_ns = &ns->id_ns;
        id_ns->nsfeat = 0;
        id_ns->nlbaf = 0;
        id_ns->flbas = 0;
        id_ns->mc = 0;
        id_ns->dpc = 0;
        id_ns->dps = 0;
        id_ns->lbaf[0].ds = BDRV_SECTOR_BITS;
        id_ns->ncap  = id_ns->nuse = id_ns->nsze =
            cpu_to_le64(n->ns_size >>
                id_ns->lbaf[NVME_ID_NS_FLBAS_INDEX(ns->id_ns.flbas)].ds);
    }
    return 0;
}

static void nvme_exit(PCIDevice *pci_dev)
{
    NvmeCtrl *n = NVME(pci_dev);

    nvme_clear_ctrl(n);
    g_free(n->namespaces);
    g_free(n->cq);
    g_free(n->sq);
    msix_uninit_exclusive_bar(pci_dev);
}

static Property nvme_props[] = {
    DEFINE_BLOCK_PROPERTIES(NvmeCtrl, conf),
    DEFINE_PROP_STRING("serial", NvmeCtrl, serial),
    DEFINE_PROP_END_OF_LIST(),
};

static const VMStateDescription nvme_vmstate = {
    .name = "nvme",
    .unmigratable = 1,
};

static void nvme_class_init(ObjectClass *oc, void *data)
{
    DeviceClass *dc = DEVICE_CLASS(oc);
    PCIDeviceClass *pc = PCI_DEVICE_CLASS(oc);

    pc->init = nvme_init;
    pc->exit = nvme_exit;
    pc->class_id = PCI_CLASS_STORAGE_EXPRESS;
    pc->vendor_id = PCI_VENDOR_ID_INTEL;
    pc->device_id = 0x5845;
    pc->revision = 1;
    pc->is_express = 1;

    set_bit(DEVICE_CATEGORY_STORAGE, dc->categories);
    dc->desc = "Non-Volatile Memory Express";
    dc->props = nvme_props;
    dc->vmsd = &nvme_vmstate;
}

static void nvme_get_bootindex(Object *obj, Visitor *v, void *opaque,
                                  const char *name, Error **errp)
{
    NvmeCtrl *s = NVME(obj);

    visit_type_int32(v, &s->conf.bootindex, name, errp);
}

static void nvme_set_bootindex(Object *obj, Visitor *v, void *opaque,
                                  const char *name, Error **errp)
{
    NvmeCtrl *s = NVME(obj);
    int32_t boot_index;
    Error *local_err = NULL;

    visit_type_int32(v, &boot_index, name, &local_err);
    if (local_err) {
        goto out;
    }
    /* check whether bootindex is present in fw_boot_order list  */
    check_boot_index(boot_index, &local_err);
    if (local_err) {
        goto out;
    }
    /* change bootindex to a new one */
    s->conf.bootindex = boot_index;

out:
    if (local_err) {
        error_propagate(errp, local_err);
    }
}

static void nvme_instance_init(Object *obj)
{
    object_property_add(obj, "bootindex", "int32",
                        nvme_get_bootindex,
                        nvme_set_bootindex, NULL, NULL, NULL);
    object_property_set_int(obj, -1, "bootindex", NULL);
}

static const TypeInfo nvme_info = {
    .name          = "nvme",
    .parent        = TYPE_PCI_DEVICE,
    .instance_size = sizeof(NvmeCtrl),
    .class_init    = nvme_class_init,
    .instance_init = nvme_instance_init,
};

static void nvme_register_types(void)
{
    type_register_static(&nvme_info);
}

type_init(nvme_register_types)