Add the rt linux 4.1.3-rt3 as base
[kvmfornfv.git] / kernel / drivers / staging / lustre / lustre / libcfs / workitem.c
1 /*
2  * GPL HEADER START
3  *
4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
5  *
6  * This program is free software; you can redistribute it and/or modify
7  * it under the terms of the GNU General Public License version 2 only,
8  * as published by the Free Software Foundation.
9  *
10  * This program is distributed in the hope that it will be useful, but
11  * WITHOUT ANY WARRANTY; without even the implied warranty of
12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13  * General Public License version 2 for more details (a copy is included
14  * in the LICENSE file that accompanied this code).
15  *
16  * You should have received a copy of the GNU General Public License
17  * version 2 along with this program; If not, see
18  * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
19  *
20  * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
21  * CA 95054 USA or visit www.sun.com if you need additional information or
22  * have any questions.
23  *
24  * GPL HEADER END
25  */
26 /*
27  * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
28  * Use is subject to license terms.
29  *
30  * Copyright (c) 2011, 2012, Intel Corporation.
31  */
32 /*
33  * This file is part of Lustre, http://www.lustre.org/
34  * Lustre is a trademark of Sun Microsystems, Inc.
35  *
36  * libcfs/libcfs/workitem.c
37  *
38  * Author: Isaac Huang <isaac@clusterfs.com>
39  *       Liang Zhen  <zhen.liang@sun.com>
40  */
41
42 #define DEBUG_SUBSYSTEM S_LNET
43
44 #include "../../include/linux/libcfs/libcfs.h"
45
46 #define CFS_WS_NAME_LEN  16
47
/**
 * State of one workitem scheduler: a named set of kernel threads that pull
 * workitems off ws_runq and invoke their wi_action callbacks.
 */
typedef struct cfs_wi_sched {
	struct list_head		ws_list;	/* chain on global list */
	/** serialises access to the run queues, ws_nscheduled and the
	 * per-workitem scheduled/running state */
	spinlock_t		ws_lock;
	/** where schedulers sleep */
	wait_queue_head_t		ws_waitq;
	/** workitems that are scheduled and ready to run */
	struct list_head		ws_runq;
	/** rescheduled running-workitems: a workitem can be rescheduled
	 * while it is running in wi_action(), but we don't want to execute
	 * it again until it returns from wi_action(), so we put it on
	 * ws_rerunq while rescheduling, and move it to ws_runq after it
	 * returns from wi_action() */
	struct list_head		ws_rerunq;
	/** CPT-table for this scheduler */
	struct cfs_cpt_table	*ws_cptab;
	/** CPT id for affinity */
	int			ws_cpt;
	/** number of scheduled workitems (on ws_runq or ws_rerunq) */
	int			ws_nscheduled;
	/** started scheduler thread, protected by cfs_wi_data::wi_glock */
	unsigned int		ws_nthreads:30;
	/** shutting down, protected by cfs_wi_data::wi_glock */
	unsigned int		ws_stopping:1;
	/** serialize starting thread, protected by cfs_wi_data::wi_glock */
	unsigned int		ws_starting:1;
	/** scheduler name, always NUL-terminated */
	char			ws_name[CFS_WS_NAME_LEN];
} cfs_wi_sched_t;
77
/**
 * Module-wide workitem state: the list of all schedulers plus the flags
 * that track whether the module is initialised or shutting down.
 */
static struct cfs_workitem_data {
	/** serialises the scheduler list and the per-scheduler thread
	 * accounting fields (ws_nthreads, ws_starting, ws_stopping) */
	spinlock_t		wi_glock;
	/** list of all schedulers */
	struct list_head		wi_scheds;
	/** WI module is initialized */
	int			wi_init;
	/** shutting down the whole WI module */
	int			wi_stopping;
} cfs_wi_data;
88
89 static inline void
90 cfs_wi_sched_lock(cfs_wi_sched_t *sched)
91 {
92         spin_lock(&sched->ws_lock);
93 }
94
95 static inline void
96 cfs_wi_sched_unlock(cfs_wi_sched_t *sched)
97 {
98         spin_unlock(&sched->ws_lock);
99 }
100
101 static inline int
102 cfs_wi_sched_cansleep(cfs_wi_sched_t *sched)
103 {
104         cfs_wi_sched_lock(sched);
105         if (sched->ws_stopping) {
106                 cfs_wi_sched_unlock(sched);
107                 return 0;
108         }
109
110         if (!list_empty(&sched->ws_runq)) {
111                 cfs_wi_sched_unlock(sched);
112                 return 0;
113         }
114         cfs_wi_sched_unlock(sched);
115         return 1;
116 }
117
118
119 /* XXX:
120  * 0. it only works when called from wi->wi_action.
121  * 1. when it returns no one shall try to schedule the workitem.
122  */
123 void
124 cfs_wi_exit(struct cfs_wi_sched *sched, cfs_workitem_t *wi)
125 {
126         LASSERT(!in_interrupt()); /* because we use plain spinlock */
127         LASSERT(!sched->ws_stopping);
128
129         cfs_wi_sched_lock(sched);
130
131         LASSERT(wi->wi_running);
132         if (wi->wi_scheduled) { /* cancel pending schedules */
133                 LASSERT(!list_empty(&wi->wi_list));
134                 list_del_init(&wi->wi_list);
135
136                 LASSERT(sched->ws_nscheduled > 0);
137                 sched->ws_nscheduled--;
138         }
139
140         LASSERT(list_empty(&wi->wi_list));
141
142         wi->wi_scheduled = 1; /* LBUG future schedule attempts */
143         cfs_wi_sched_unlock(sched);
144
145         return;
146 }
147 EXPORT_SYMBOL(cfs_wi_exit);
148
149 /**
150  * cancel schedule request of workitem \a wi
151  */
152 int
153 cfs_wi_deschedule(struct cfs_wi_sched *sched, cfs_workitem_t *wi)
154 {
155         int     rc;
156
157         LASSERT(!in_interrupt()); /* because we use plain spinlock */
158         LASSERT(!sched->ws_stopping);
159
160         /*
161          * return 0 if it's running already, otherwise return 1, which
162          * means the workitem will not be scheduled and will not have
163          * any race with wi_action.
164          */
165         cfs_wi_sched_lock(sched);
166
167         rc = !(wi->wi_running);
168
169         if (wi->wi_scheduled) { /* cancel pending schedules */
170                 LASSERT(!list_empty(&wi->wi_list));
171                 list_del_init(&wi->wi_list);
172
173                 LASSERT(sched->ws_nscheduled > 0);
174                 sched->ws_nscheduled--;
175
176                 wi->wi_scheduled = 0;
177         }
178
179         LASSERT (list_empty(&wi->wi_list));
180
181         cfs_wi_sched_unlock(sched);
182         return rc;
183 }
184 EXPORT_SYMBOL(cfs_wi_deschedule);
185
186 /*
187  * Workitem scheduled with (serial == 1) is strictly serialised not only with
188  * itself, but also with others scheduled this way.
189  *
190  * Now there's only one static serialised queue, but in the future more might
191  * be added, and even dynamic creation of serialised queues might be supported.
192  */
193 void
194 cfs_wi_schedule(struct cfs_wi_sched *sched, cfs_workitem_t *wi)
195 {
196         LASSERT(!in_interrupt()); /* because we use plain spinlock */
197         LASSERT(!sched->ws_stopping);
198
199         cfs_wi_sched_lock(sched);
200
201         if (!wi->wi_scheduled) {
202                 LASSERT (list_empty(&wi->wi_list));
203
204                 wi->wi_scheduled = 1;
205                 sched->ws_nscheduled++;
206                 if (!wi->wi_running) {
207                         list_add_tail(&wi->wi_list, &sched->ws_runq);
208                         wake_up(&sched->ws_waitq);
209                 } else {
210                         list_add(&wi->wi_list, &sched->ws_rerunq);
211                 }
212         }
213
214         LASSERT (!list_empty(&wi->wi_list));
215         cfs_wi_sched_unlock(sched);
216         return;
217 }
218 EXPORT_SYMBOL(cfs_wi_schedule);
219
220
/**
 * Body of a scheduler kernel thread.
 *
 * \param arg	the struct cfs_wi_sched this thread serves
 *
 * Registers itself in ws_nthreads, then repeatedly runs workitems taken
 * from the head of ws_runq until ws_stopping is observed, and finally
 * unregisters itself.  Always returns 0.
 */
static int
cfs_wi_scheduler (void *arg)
{
	struct cfs_wi_sched	*sched = (cfs_wi_sched_t *)arg;

	cfs_block_allsigs();

	/* CPT affinity scheduler? */
	if (sched->ws_cptab != NULL)
		cfs_cpt_bind(sched->ws_cptab, sched->ws_cpt);

	spin_lock(&cfs_wi_data.wi_glock);

	/* startup handshake: clear ws_starting (the creator waits for it to
	 * drop before spawning the next thread) and count this thread */
	LASSERT(sched->ws_starting == 1);
	sched->ws_starting--;
	sched->ws_nthreads++;

	spin_unlock(&cfs_wi_data.wi_glock);

	cfs_wi_sched_lock(sched);

	/* ws_lock is held at the top of every iteration */
	while (!sched->ws_stopping) {
		int	     nloops = 0;
		int	     rc;
		cfs_workitem_t *wi;

		/* run at most CFS_WI_RESCHED workitems per pass */
		while (!list_empty(&sched->ws_runq) &&
		       nloops < CFS_WI_RESCHED) {
			wi = list_entry(sched->ws_runq.next,
					    cfs_workitem_t, wi_list);
			LASSERT(wi->wi_scheduled && !wi->wi_running);

			list_del_init(&wi->wi_list);

			LASSERT(sched->ws_nscheduled > 0);
			sched->ws_nscheduled--;

			wi->wi_running   = 1;
			wi->wi_scheduled = 0;


			/* the action runs without ws_lock held */
			cfs_wi_sched_unlock(sched);
			nloops++;

			rc = (*wi->wi_action) (wi);

			cfs_wi_sched_lock(sched);
			/* non-zero return: wi must not be touched any more */
			if (rc != 0) /* WI should be dead, even be freed! */
				continue;

			wi->wi_running = 0;
			/* not rescheduled while it was running: nothing to do */
			if (list_empty(&wi->wi_list))
				continue;

			LASSERT(wi->wi_scheduled);
			/* wi is rescheduled, should be on rerunq now, we
			 * move it to runq so it can run action now */
			list_move_tail(&wi->wi_list, &sched->ws_runq);
		}

		if (!list_empty(&sched->ws_runq)) {
			cfs_wi_sched_unlock(sched);
			/* don't sleep because some workitems still
			 * expect me to come back soon */
			cond_resched();
			cfs_wi_sched_lock(sched);
			continue;
		}

		/* run queue is empty: sleep until there is work or the
		 * scheduler is stopping; rc is not examined afterwards, the
		 * loop conditions re-check the state under ws_lock */
		cfs_wi_sched_unlock(sched);
		rc = wait_event_interruptible_exclusive(sched->ws_waitq,
						!cfs_wi_sched_cansleep(sched));
		cfs_wi_sched_lock(sched);
	}

	cfs_wi_sched_unlock(sched);

	/* unregister; destroy/shutdown poll ws_nthreads until it hits zero */
	spin_lock(&cfs_wi_data.wi_glock);
	sched->ws_nthreads--;
	spin_unlock(&cfs_wi_data.wi_glock);

	return 0;
}
304
305
306 void
307 cfs_wi_sched_destroy(struct cfs_wi_sched *sched)
308 {
309         int     i;
310
311         LASSERT(cfs_wi_data.wi_init);
312         LASSERT(!cfs_wi_data.wi_stopping);
313
314         spin_lock(&cfs_wi_data.wi_glock);
315         if (sched->ws_stopping) {
316                 CDEBUG(D_INFO, "%s is in progress of stopping\n",
317                        sched->ws_name);
318                 spin_unlock(&cfs_wi_data.wi_glock);
319                 return;
320         }
321
322         LASSERT(!list_empty(&sched->ws_list));
323         sched->ws_stopping = 1;
324
325         spin_unlock(&cfs_wi_data.wi_glock);
326
327         i = 2;
328         wake_up_all(&sched->ws_waitq);
329
330         spin_lock(&cfs_wi_data.wi_glock);
331         while (sched->ws_nthreads > 0) {
332                 CDEBUG(IS_PO2(++i) ? D_WARNING : D_NET,
333                        "waiting for %d threads of WI sched[%s] to terminate\n",
334                        sched->ws_nthreads, sched->ws_name);
335
336                 spin_unlock(&cfs_wi_data.wi_glock);
337                 set_current_state(TASK_UNINTERRUPTIBLE);
338                 schedule_timeout(cfs_time_seconds(1) / 20);
339                 spin_lock(&cfs_wi_data.wi_glock);
340         }
341
342         list_del(&sched->ws_list);
343
344         spin_unlock(&cfs_wi_data.wi_glock);
345         LASSERT(sched->ws_nscheduled == 0);
346
347         LIBCFS_FREE(sched, sizeof(*sched));
348 }
349 EXPORT_SYMBOL(cfs_wi_sched_destroy);
350
351 int
352 cfs_wi_sched_create(char *name, struct cfs_cpt_table *cptab,
353                     int cpt, int nthrs, struct cfs_wi_sched **sched_pp)
354 {
355         struct cfs_wi_sched     *sched;
356         int                     rc;
357
358         LASSERT(cfs_wi_data.wi_init);
359         LASSERT(!cfs_wi_data.wi_stopping);
360         LASSERT(cptab == NULL || cpt == CFS_CPT_ANY ||
361                 (cpt >= 0 && cpt < cfs_cpt_number(cptab)));
362
363         LIBCFS_ALLOC(sched, sizeof(*sched));
364         if (sched == NULL)
365                 return -ENOMEM;
366
367         strncpy(sched->ws_name, name, CFS_WS_NAME_LEN);
368         sched->ws_name[CFS_WS_NAME_LEN - 1] = '\0';
369         sched->ws_cptab = cptab;
370         sched->ws_cpt = cpt;
371
372         spin_lock_init(&sched->ws_lock);
373         init_waitqueue_head(&sched->ws_waitq);
374         INIT_LIST_HEAD(&sched->ws_runq);
375         INIT_LIST_HEAD(&sched->ws_rerunq);
376         INIT_LIST_HEAD(&sched->ws_list);
377
378         rc = 0;
379         while (nthrs > 0)  {
380                 char    name[16];
381                 struct task_struct *task;
382
383                 spin_lock(&cfs_wi_data.wi_glock);
384                 while (sched->ws_starting > 0) {
385                         spin_unlock(&cfs_wi_data.wi_glock);
386                         schedule();
387                         spin_lock(&cfs_wi_data.wi_glock);
388                 }
389
390                 sched->ws_starting++;
391                 spin_unlock(&cfs_wi_data.wi_glock);
392
393                 if (sched->ws_cptab != NULL && sched->ws_cpt >= 0) {
394                         snprintf(name, sizeof(name), "%s_%02d_%02u",
395                                  sched->ws_name, sched->ws_cpt,
396                                  sched->ws_nthreads);
397                 } else {
398                         snprintf(name, sizeof(name), "%s_%02u",
399                                  sched->ws_name, sched->ws_nthreads);
400                 }
401
402                 task = kthread_run(cfs_wi_scheduler, sched, "%s", name);
403                 if (!IS_ERR(task)) {
404                         nthrs--;
405                         continue;
406                 }
407                 rc = PTR_ERR(task);
408
409                 CERROR("Failed to create thread for WI scheduler %s: %d\n",
410                        name, rc);
411
412                 spin_lock(&cfs_wi_data.wi_glock);
413
414                 /* make up for cfs_wi_sched_destroy */
415                 list_add(&sched->ws_list, &cfs_wi_data.wi_scheds);
416                 sched->ws_starting--;
417
418                 spin_unlock(&cfs_wi_data.wi_glock);
419
420                 cfs_wi_sched_destroy(sched);
421                 return rc;
422         }
423         spin_lock(&cfs_wi_data.wi_glock);
424         list_add(&sched->ws_list, &cfs_wi_data.wi_scheds);
425         spin_unlock(&cfs_wi_data.wi_glock);
426
427         *sched_pp = sched;
428         return 0;
429 }
430 EXPORT_SYMBOL(cfs_wi_sched_create);
431
432 int
433 cfs_wi_startup(void)
434 {
435         memset(&cfs_wi_data, 0, sizeof(cfs_wi_data));
436
437         spin_lock_init(&cfs_wi_data.wi_glock);
438         INIT_LIST_HEAD(&cfs_wi_data.wi_scheds);
439         cfs_wi_data.wi_init = 1;
440
441         return 0;
442 }
443
444 void
445 cfs_wi_shutdown(void)
446 {
447         struct cfs_wi_sched     *sched;
448
449         spin_lock(&cfs_wi_data.wi_glock);
450         cfs_wi_data.wi_stopping = 1;
451         spin_unlock(&cfs_wi_data.wi_glock);
452
453         /* nobody should contend on this list */
454         list_for_each_entry(sched, &cfs_wi_data.wi_scheds, ws_list) {
455                 sched->ws_stopping = 1;
456                 wake_up_all(&sched->ws_waitq);
457         }
458
459         list_for_each_entry(sched, &cfs_wi_data.wi_scheds, ws_list) {
460                 spin_lock(&cfs_wi_data.wi_glock);
461
462                 while (sched->ws_nthreads != 0) {
463                         spin_unlock(&cfs_wi_data.wi_glock);
464                         set_current_state(TASK_UNINTERRUPTIBLE);
465                         schedule_timeout(cfs_time_seconds(1) / 20);
466                         spin_lock(&cfs_wi_data.wi_glock);
467                 }
468                 spin_unlock(&cfs_wi_data.wi_glock);
469         }
470         while (!list_empty(&cfs_wi_data.wi_scheds)) {
471                 sched = list_entry(cfs_wi_data.wi_scheds.next,
472                                        struct cfs_wi_sched, ws_list);
473                 list_del(&sched->ws_list);
474                 LIBCFS_FREE(sched, sizeof(*sched));
475         }
476
477         cfs_wi_data.wi_stopping = 0;
478         cfs_wi_data.wi_init = 0;
479 }