kvmfornfv.git (base: linux 4.1.3-rt3): kernel/drivers/block/xen-blkback/common.h
/*
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License version 2
 * as published by the Free Software Foundation; or, when distributed
 * separately from the Linux kernel or incorporated into other
 * software packages, subject to the following license:
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this source file (the "Software"), to deal in the Software without
 * restriction, including without limitation the rights to use, copy, modify,
 * merge, publish, distribute, sublicense, and/or sell copies of the Software,
 * and to permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#ifndef __XEN_BLKIF__BACKEND__COMMON_H__
#define __XEN_BLKIF__BACKEND__COMMON_H__

#include <linux/module.h>
#include <linux/interrupt.h>
#include <linux/slab.h>
#include <linux/blkdev.h>
#include <linux/vmalloc.h>
#include <linux/wait.h>
#include <linux/io.h>
#include <linux/rbtree.h>
#include <asm/setup.h>
#include <asm/pgalloc.h>
#include <asm/hypervisor.h>
#include <xen/grant_table.h>
#include <xen/xenbus.h>
#include <xen/interface/io/ring.h>
#include <xen/interface/io/blkif.h>
#include <xen/interface/io/protocols.h>

/*
 * The maximum number of segments allowed in an indirect request. This
 * value is also advertised to the frontend.
 */
#define MAX_INDIRECT_SEGMENTS 256

#define SEGS_PER_INDIRECT_FRAME \
        (PAGE_SIZE/sizeof(struct blkif_request_segment))
#define MAX_INDIRECT_PAGES \
        ((MAX_INDIRECT_SEGMENTS + SEGS_PER_INDIRECT_FRAME - 1)/SEGS_PER_INDIRECT_FRAME)
#define INDIRECT_PAGES(_segs) \
        ((_segs + SEGS_PER_INDIRECT_FRAME - 1)/SEGS_PER_INDIRECT_FRAME)
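/*
 * Worked example (illustrative; assumes 4 KiB pages and an 8-byte
 * struct blkif_request_segment, the usual values on x86):
 *
 *   SEGS_PER_INDIRECT_FRAME = 4096 / 8 = 512
 *   MAX_INDIRECT_PAGES      = (256 + 512 - 1) / 512 = 1
 *   INDIRECT_PAGES(256)     = 1
 *
 * i.e. a single indirect grant page is enough to describe the largest
 * indirect request this backend advertises.
 */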

/* Not a real protocol.  Used to generate ring structs which contain
 * only the elements common to all protocols.  This way we get a
 * compiler-checkable way to use common struct elements, so we can
 * avoid using switch(protocol) in a number of places.  */
struct blkif_common_request {
        char dummy;
};
struct blkif_common_response {
        char dummy;
};
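/*
 * Note (added for exposition): the common ring flavour is only used to
 * manipulate the shared producer/consumer indices, e.g. via
 * RING_HAS_UNCONSUMED_REQUESTS() or RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(),
 * which never dereference the request/response payload, so a one-byte
 * dummy element is sufficient here.
 */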

struct blkif_x86_32_request_rw {
        uint8_t        nr_segments;  /* number of segments                   */
        blkif_vdev_t   handle;       /* only for read/write requests         */
        uint64_t       id;           /* private guest value, echoed in resp  */
        blkif_sector_t sector_number;/* start sector idx on disk (r/w only)  */
        struct blkif_request_segment seg[BLKIF_MAX_SEGMENTS_PER_REQUEST];
} __attribute__((__packed__));

struct blkif_x86_32_request_discard {
        uint8_t        flag;         /* BLKIF_DISCARD_SECURE or zero         */
        blkif_vdev_t   _pad1;        /* was "handle" for read/write requests */
        uint64_t       id;           /* private guest value, echoed in resp  */
        blkif_sector_t sector_number;/* start sector of the discard range    */
        uint64_t       nr_sectors;
} __attribute__((__packed__));

struct blkif_x86_32_request_other {
        uint8_t        _pad1;
        blkif_vdev_t   _pad2;
        uint64_t       id;           /* private guest value, echoed in resp  */
} __attribute__((__packed__));

struct blkif_x86_32_request_indirect {
        uint8_t        indirect_op;
        uint16_t       nr_segments;
        uint64_t       id;
        blkif_sector_t sector_number;
        blkif_vdev_t   handle;
        uint16_t       _pad1;
        grant_ref_t    indirect_grefs[BLKIF_MAX_INDIRECT_PAGES_PER_REQUEST];
        /*
         * The maximum number of indirect segments (and pages) that will
         * be used is determined by MAX_INDIRECT_SEGMENTS; this value
         * is also exported to the guest (via the xenstore
         * feature-max-indirect-segments entry), so the frontend knows how
         * many indirect segments the backend supports.
         */
        uint64_t       _pad2;        /* make it 64 byte aligned */
} __attribute__((__packed__));

struct blkif_x86_32_request {
        uint8_t        operation;    /* BLKIF_OP_???                         */
        union {
                struct blkif_x86_32_request_rw rw;
                struct blkif_x86_32_request_discard discard;
                struct blkif_x86_32_request_other other;
                struct blkif_x86_32_request_indirect indirect;
        } u;
} __attribute__((__packed__));

/* i386 protocol version */
#pragma pack(push, 4)
struct blkif_x86_32_response {
        uint64_t        id;              /* copied from request */
        uint8_t         operation;       /* copied from request */
        int16_t         status;          /* BLKIF_RSP_???       */
};
#pragma pack(pop)
/* x86_64 protocol version */

struct blkif_x86_64_request_rw {
        uint8_t        nr_segments;  /* number of segments                   */
        blkif_vdev_t   handle;       /* only for read/write requests         */
        uint32_t       _pad1;        /* offsetof(blkif_request..,u.rw.id)==8 */
        uint64_t       id;
        blkif_sector_t sector_number;/* start sector idx on disk (r/w only)  */
        struct blkif_request_segment seg[BLKIF_MAX_SEGMENTS_PER_REQUEST];
} __attribute__((__packed__));

struct blkif_x86_64_request_discard {
        uint8_t        flag;         /* BLKIF_DISCARD_SECURE or zero         */
        blkif_vdev_t   _pad1;        /* was "handle" for read/write requests */
        uint32_t       _pad2;        /* offsetof(blkif_..,u.discard.id)==8   */
        uint64_t       id;
        blkif_sector_t sector_number;/* start sector of the discard range    */
        uint64_t       nr_sectors;
} __attribute__((__packed__));

struct blkif_x86_64_request_other {
        uint8_t        _pad1;
        blkif_vdev_t   _pad2;
        uint32_t       _pad3;        /* offsetof(blkif_..,u.other.id)==8     */
        uint64_t       id;           /* private guest value, echoed in resp  */
} __attribute__((__packed__));

struct blkif_x86_64_request_indirect {
        uint8_t        indirect_op;
        uint16_t       nr_segments;
        uint32_t       _pad1;        /* offsetof(blkif_..,u.indirect.id)==8  */
        uint64_t       id;
        blkif_sector_t sector_number;
        blkif_vdev_t   handle;
        uint16_t       _pad2;
        grant_ref_t    indirect_grefs[BLKIF_MAX_INDIRECT_PAGES_PER_REQUEST];
        /*
         * The maximum number of indirect segments (and pages) that will
         * be used is determined by MAX_INDIRECT_SEGMENTS; this value
         * is also exported to the guest (via the xenstore
         * feature-max-indirect-segments entry), so the frontend knows how
         * many indirect segments the backend supports.
         */
        uint32_t       _pad3;        /* make it 64 byte aligned */
} __attribute__((__packed__));

struct blkif_x86_64_request {
        uint8_t        operation;    /* BLKIF_OP_???                         */
        union {
                struct blkif_x86_64_request_rw rw;
                struct blkif_x86_64_request_discard discard;
                struct blkif_x86_64_request_other other;
                struct blkif_x86_64_request_indirect indirect;
        } u;
} __attribute__((__packed__));

struct blkif_x86_64_response {
        uint64_t       __attribute__((__aligned__(8))) id;
        uint8_t         operation;       /* copied from request */
        int16_t         status;          /* BLKIF_RSP_???       */
};
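/*
 * Layout note (added for exposition): with the 4-byte packing above, the
 * i386 response works out to 12 bytes (operation at offset 8, status at
 * offset 10), while the x86_64 response is padded to 16 bytes because its
 * 64-bit id is 8-byte aligned.  This ABI difference is why a separate ring
 * type is generated for each protocol below instead of sharing the native
 * layout.
 */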

DEFINE_RING_TYPES(blkif_common, struct blkif_common_request,
                  struct blkif_common_response);
DEFINE_RING_TYPES(blkif_x86_32, struct blkif_x86_32_request,
                  struct blkif_x86_32_response);
DEFINE_RING_TYPES(blkif_x86_64, struct blkif_x86_64_request,
                  struct blkif_x86_64_response);

union blkif_back_rings {
        struct blkif_back_ring        native;
        struct blkif_common_back_ring common;
        struct blkif_x86_32_back_ring x86_32;
        struct blkif_x86_64_back_ring x86_64;
};

enum blkif_protocol {
        BLKIF_PROTOCOL_NATIVE = 1,
        BLKIF_PROTOCOL_X86_32 = 2,
        BLKIF_PROTOCOL_X86_64 = 3,
};

/*
 * Default protocol if the frontend doesn't specify one.
 */
#ifdef CONFIG_X86
#  define BLKIF_PROTOCOL_DEFAULT BLKIF_PROTOCOL_X86_32
#else
#  define BLKIF_PROTOCOL_DEFAULT BLKIF_PROTOCOL_NATIVE
#endif
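/*
 * Illustrative sketch (not part of the original interface): once the
 * frontend's advertised protocol has been parsed into an enum
 * blkif_protocol, the shared ring page is wrapped in the matching
 * back-ring flavour.  This mirrors what the xenbus mapping code is
 * expected to do; the function name is made up for exposition.
 */
static inline void blkif_example_back_ring_init(union blkif_back_rings *rings,
                                                enum blkif_protocol protocol,
                                                void *shared_page)
{
        switch (protocol) {
        case BLKIF_PROTOCOL_NATIVE:
                BACK_RING_INIT(&rings->native,
                               (struct blkif_sring *)shared_page, PAGE_SIZE);
                break;
        case BLKIF_PROTOCOL_X86_32:
                BACK_RING_INIT(&rings->x86_32,
                               (struct blkif_x86_32_sring *)shared_page,
                               PAGE_SIZE);
                break;
        case BLKIF_PROTOCOL_X86_64:
                BACK_RING_INIT(&rings->x86_64,
                               (struct blkif_x86_64_sring *)shared_page,
                               PAGE_SIZE);
                break;
        }
}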

struct xen_vbd {
        /* What the domain refers to this vbd as. */
        blkif_vdev_t            handle;
        /* Non-zero -> read-only */
        unsigned char           readonly;
        /* VDISK_xxx */
        unsigned char           type;
        /* phys device that this vbd maps to. */
        u32                     pdevice;
        struct block_device     *bdev;
        /* Cached size parameter. */
        sector_t                size;
        unsigned int            flush_support:1;
        unsigned int            discard_secure:1;
        unsigned int            feature_gnt_persistent:1;
        unsigned int            overflow_max_grants:1;
};

struct backend_info;

/* Number of available flags */
#define PERSISTENT_GNT_FLAGS_SIZE       2
/* This persistent grant is currently in use */
#define PERSISTENT_GNT_ACTIVE           0
/*
 * This persistent grant has been used; the flag is set when
 * PERSISTENT_GNT_ACTIVE is cleared, to record that the grant was used
 * recently.
 */
#define PERSISTENT_GNT_WAS_ACTIVE       1

/* Number of requests that we can fit in a ring */
#define XEN_BLKIF_REQS                  32

struct persistent_gnt {
        struct page *page;
        grant_ref_t gnt;
        grant_handle_t handle;
        DECLARE_BITMAP(flags, PERSISTENT_GNT_FLAGS_SIZE);
        struct rb_node node;
        struct list_head remove_node;
};
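/*
 * Illustrative helpers (a sketch added for exposition, not part of the
 * original header) showing how the two flag bits above are expected to be
 * used: PERSISTENT_GNT_ACTIVE marks a grant currently mapped into an
 * in-flight request, and PERSISTENT_GNT_WAS_ACTIVE is left behind when
 * ACTIVE is cleared so the LRU purge can tell recently used grants from
 * stale ones.  The helper names are made up.
 */
static inline void example_persistent_gnt_set_busy(struct persistent_gnt *gnt)
{
        set_bit(PERSISTENT_GNT_ACTIVE, gnt->flags);
}

static inline void example_persistent_gnt_set_idle(struct persistent_gnt *gnt)
{
        clear_bit(PERSISTENT_GNT_ACTIVE, gnt->flags);
        set_bit(PERSISTENT_GNT_WAS_ACTIVE, gnt->flags);
}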

struct xen_blkif {
        /* Unique identifier for this interface. */
        domid_t                 domid;
        unsigned int            handle;
        /* Physical parameters of the comms window. */
        unsigned int            irq;
        /* Comms information. */
        enum blkif_protocol     blk_protocol;
        union blkif_back_rings  blk_rings;
        void                    *blk_ring;
        /* The VBD attached to this interface. */
        struct xen_vbd          vbd;
        /* Back pointer to the backend_info. */
        struct backend_info     *be;
        /* Private fields. */
        spinlock_t              blk_ring_lock;
        atomic_t                refcnt;

        wait_queue_head_t       wq;
        /* for barrier (drain) requests */
        struct completion       drain_complete;
        atomic_t                drain;
        atomic_t                inflight;
        /* One thread per blkif. */
        struct task_struct      *xenblkd;
        unsigned int            waiting_reqs;

        /* tree to store persistent grants */
        struct rb_root          persistent_gnts;
        unsigned int            persistent_gnt_c;
        atomic_t                persistent_gnt_in_use;
        unsigned long           next_lru;

        /* used by the kworker that offloads work from the persistent purge */
        struct list_head        persistent_purge_list;
        struct work_struct      persistent_purge_work;

        /* buffer of free pages to map grant refs */
        spinlock_t              free_pages_lock;
        int                     free_pages_num;
        struct list_head        free_pages;

        /* List of all 'pending_req' available */
        struct list_head        pending_free;
        /* And its spinlock. */
        spinlock_t              pending_free_lock;
        wait_queue_head_t       pending_free_wq;

        /* statistics */
        unsigned long           st_print;
        unsigned long long                      st_rd_req;
        unsigned long long                      st_wr_req;
        unsigned long long                      st_oo_req;
        unsigned long long                      st_f_req;
        unsigned long long                      st_ds_req;
        unsigned long long                      st_rd_sect;
        unsigned long long                      st_wr_sect;

        struct work_struct      free_work;
        /* Thread shutdown wait queue. */
        wait_queue_head_t       shutdown_wq;
};

struct seg_buf {
        unsigned long offset;
        unsigned int nsec;
};

struct grant_page {
        struct page             *page;
        struct persistent_gnt   *persistent_gnt;
        grant_handle_t          handle;
        grant_ref_t             gref;
};

/*
 * Each outstanding request that we've passed to the lower device layers has a
 * 'pending_req' allocated to it. Each bio that completes decrements the
 * pendcnt towards zero. When it hits zero, the specified domain has a
 * response queued for it, with the saved 'id' passed back.
 */
struct pending_req {
        struct xen_blkif        *blkif;
        u64                     id;
        int                     nr_pages;
        atomic_t                pendcnt;
        unsigned short          operation;
        int                     status;
        struct list_head        free_list;
        struct grant_page       *segments[MAX_INDIRECT_SEGMENTS];
        /* Indirect descriptors */
        struct grant_page       *indirect_pages[MAX_INDIRECT_PAGES];
        struct seg_buf          seg[MAX_INDIRECT_SEGMENTS];
        struct bio              *biolist[MAX_INDIRECT_SEGMENTS];
        struct gnttab_unmap_grant_ref unmap[MAX_INDIRECT_SEGMENTS];
        struct page                   *unmap_pages[MAX_INDIRECT_SEGMENTS];
        struct gntab_unmap_queue_data gnttab_unmap_data;
};
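/*
 * Illustrative helper (a sketch added for exposition, not the driver's
 * actual completion path): each completing bio is expected to drop
 * pendcnt, and only the caller that sees the count reach zero queues the
 * response back to the frontend with the saved id.
 */
static inline bool example_pending_req_last_bio_done(struct pending_req *req)
{
        return atomic_dec_and_test(&req->pendcnt);
}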

#define vbd_sz(_v)      ((_v)->bdev->bd_part ? \
                         (_v)->bdev->bd_part->nr_sects : \
                          get_capacity((_v)->bdev->bd_disk))

#define xen_blkif_get(_b) (atomic_inc(&(_b)->refcnt))
#define xen_blkif_put(_b)                               \
        do {                                            \
                if (atomic_dec_and_test(&(_b)->refcnt)) \
                        schedule_work(&(_b)->free_work);\
        } while (0)
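/*
 * Note (added for exposition): the final xen_blkif_put() defers the actual
 * teardown to free_work rather than freeing in place, so the last reference
 * can be dropped from contexts where the blocking cleanup work cannot run
 * directly.
 */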

struct phys_req {
        unsigned short          dev;
        blkif_sector_t          nr_sects;
        struct block_device     *bdev;
        blkif_sector_t          sector_number;
};
int xen_blkif_interface_init(void);

int xen_blkif_xenbus_init(void);

irqreturn_t xen_blkif_be_int(int irq, void *dev_id);
int xen_blkif_schedule(void *arg);
int xen_blkif_purge_persistent(void *arg);
void xen_blkbk_free_caches(struct xen_blkif *blkif);

int xen_blkbk_flush_diskcache(struct xenbus_transaction xbt,
                              struct backend_info *be, int state);

int xen_blkbk_barrier(struct xenbus_transaction xbt,
                      struct backend_info *be, int state);
struct xenbus_device *xen_blkbk_xenbus(struct backend_info *be);
void xen_blkbk_unmap_purged_grants(struct work_struct *work);
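/*
 * Note (added for exposition): the two helpers below copy a request from
 * the frontend's 32-bit or 64-bit ring layout into the native struct
 * blkif_request.  The barrier() after copying the header keeps the
 * compiler from re-reading nr_segments from the shared ring after it has
 * been range-checked, since the frontend could change it under us.
 */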

static inline void blkif_get_x86_32_req(struct blkif_request *dst,
                                        struct blkif_x86_32_request *src)
{
        int i, n = BLKIF_MAX_SEGMENTS_PER_REQUEST, j;
        dst->operation = src->operation;
        switch (src->operation) {
        case BLKIF_OP_READ:
        case BLKIF_OP_WRITE:
        case BLKIF_OP_WRITE_BARRIER:
        case BLKIF_OP_FLUSH_DISKCACHE:
                dst->u.rw.nr_segments = src->u.rw.nr_segments;
                dst->u.rw.handle = src->u.rw.handle;
                dst->u.rw.id = src->u.rw.id;
                dst->u.rw.sector_number = src->u.rw.sector_number;
                barrier();
                if (n > dst->u.rw.nr_segments)
                        n = dst->u.rw.nr_segments;
                for (i = 0; i < n; i++)
                        dst->u.rw.seg[i] = src->u.rw.seg[i];
                break;
        case BLKIF_OP_DISCARD:
                dst->u.discard.flag = src->u.discard.flag;
                dst->u.discard.id = src->u.discard.id;
                dst->u.discard.sector_number = src->u.discard.sector_number;
                dst->u.discard.nr_sectors = src->u.discard.nr_sectors;
                break;
        case BLKIF_OP_INDIRECT:
                dst->u.indirect.indirect_op = src->u.indirect.indirect_op;
                dst->u.indirect.nr_segments = src->u.indirect.nr_segments;
                dst->u.indirect.handle = src->u.indirect.handle;
                dst->u.indirect.id = src->u.indirect.id;
                dst->u.indirect.sector_number = src->u.indirect.sector_number;
                barrier();
                j = min(MAX_INDIRECT_PAGES, INDIRECT_PAGES(dst->u.indirect.nr_segments));
                for (i = 0; i < j; i++)
                        dst->u.indirect.indirect_grefs[i] =
                                src->u.indirect.indirect_grefs[i];
                break;
        default:
                /*
                 * Don't know how to translate this op. Only get the
                 * ID so failure can be reported to the frontend.
                 */
                dst->u.other.id = src->u.other.id;
                break;
        }
}

static inline void blkif_get_x86_64_req(struct blkif_request *dst,
                                        struct blkif_x86_64_request *src)
{
        int i, n = BLKIF_MAX_SEGMENTS_PER_REQUEST, j;
        dst->operation = src->operation;
        switch (src->operation) {
        case BLKIF_OP_READ:
        case BLKIF_OP_WRITE:
        case BLKIF_OP_WRITE_BARRIER:
        case BLKIF_OP_FLUSH_DISKCACHE:
                dst->u.rw.nr_segments = src->u.rw.nr_segments;
                dst->u.rw.handle = src->u.rw.handle;
                dst->u.rw.id = src->u.rw.id;
                dst->u.rw.sector_number = src->u.rw.sector_number;
                barrier();
                if (n > dst->u.rw.nr_segments)
                        n = dst->u.rw.nr_segments;
                for (i = 0; i < n; i++)
                        dst->u.rw.seg[i] = src->u.rw.seg[i];
                break;
        case BLKIF_OP_DISCARD:
                dst->u.discard.flag = src->u.discard.flag;
                dst->u.discard.id = src->u.discard.id;
                dst->u.discard.sector_number = src->u.discard.sector_number;
                dst->u.discard.nr_sectors = src->u.discard.nr_sectors;
                break;
        case BLKIF_OP_INDIRECT:
                dst->u.indirect.indirect_op = src->u.indirect.indirect_op;
                dst->u.indirect.nr_segments = src->u.indirect.nr_segments;
                dst->u.indirect.handle = src->u.indirect.handle;
                dst->u.indirect.id = src->u.indirect.id;
                dst->u.indirect.sector_number = src->u.indirect.sector_number;
                barrier();
                j = min(MAX_INDIRECT_PAGES, INDIRECT_PAGES(dst->u.indirect.nr_segments));
                for (i = 0; i < j; i++)
                        dst->u.indirect.indirect_grefs[i] =
                                src->u.indirect.indirect_grefs[i];
                break;
        default:
                /*
                 * Don't know how to translate this op. Only get the
                 * ID so failure can be reported to the frontend.
                 */
                dst->u.other.id = src->u.other.id;
                break;
        }
}

#endif /* __XEN_BLKIF__BACKEND__COMMON_H__ */