kernel/drivers/staging/lustre/lustre/ptlrpc/client.c
/*
 * GPL HEADER START
 *
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 only,
 * as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License version 2 for more details (a copy is included
 * in the LICENSE file that accompanied this code).
 *
 * You should have received a copy of the GNU General Public License
 * version 2 along with this program; If not, see
 * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
 *
 * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
 * CA 95054 USA or visit www.sun.com if you need additional information or
 * have any questions.
 *
 * GPL HEADER END
 */
/*
 * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
 * Use is subject to license terms.
 *
 * Copyright (c) 2011, 2012, Intel Corporation.
 */
/*
 * This file is part of Lustre, http://www.lustre.org/
 * Lustre is a trademark of Sun Microsystems, Inc.
 */

/** Implementation of client-side PortalRPC interfaces */

#define DEBUG_SUBSYSTEM S_RPC

#include "../include/obd_support.h"
#include "../include/obd_class.h"
#include "../include/lustre_lib.h"
#include "../include/lustre_ha.h"
#include "../include/lustre_import.h"
#include "../include/lustre_req_layout.h"

#include "ptlrpc_internal.h"

static int ptlrpc_send_new_req(struct ptlrpc_request *req);
static int ptlrpcd_check_work(struct ptlrpc_request *req);

/**
 * Initialize passed in client structure \a cl.
 */
void ptlrpc_init_client(int req_portal, int rep_portal, char *name,
                        struct ptlrpc_client *cl)
{
        cl->cli_request_portal = req_portal;
        cl->cli_reply_portal   = rep_portal;
        cl->cli_name       = name;
}
EXPORT_SYMBOL(ptlrpc_init_client);
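
/*
 * Illustrative sketch (not part of the original file): a typical caller
 * binds a client to its request/reply portals once at setup time; the
 * LDLM callback client, for example, is wired up roughly like this:
 *
 *	ptlrpc_init_client(LDLM_CB_REQUEST_PORTAL, LDLM_CB_REPLY_PORTAL,
 *			   "ldlm_cbd", &obd->obd_ldlm_client);
 */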

/**
 * Return PortalRPC connection for remote uuid \a uuid
 */
struct ptlrpc_connection *ptlrpc_uuid_to_connection(struct obd_uuid *uuid)
{
        struct ptlrpc_connection *c;
        lnet_nid_t              self;
        lnet_process_id_t        peer;
        int                    err;

        /* ptlrpc_uuid_to_peer() initializes its 2nd parameter
         * before accessing its values. */
        /* coverity[uninit_use_in_call] */
        err = ptlrpc_uuid_to_peer(uuid, &peer, &self);
        if (err != 0) {
                CNETERR("cannot find peer %s!\n", uuid->uuid);
                return NULL;
        }

        c = ptlrpc_connection_get(peer, self, uuid);
        if (c) {
                memcpy(c->c_remote_uuid.uuid,
                       uuid->uuid, sizeof(c->c_remote_uuid.uuid));
        }

        CDEBUG(D_INFO, "%s -> %p\n", uuid->uuid, c);

        return c;
}
EXPORT_SYMBOL(ptlrpc_uuid_to_connection);

/**
 * Allocate and initialize new bulk descriptor on the sender.
 * Returns pointer to the descriptor or NULL on error.
 */
struct ptlrpc_bulk_desc *ptlrpc_new_bulk(unsigned npages, unsigned max_brw,
                                         unsigned type, unsigned portal)
{
        struct ptlrpc_bulk_desc *desc;
        int i;

        OBD_ALLOC(desc, offsetof(struct ptlrpc_bulk_desc, bd_iov[npages]));
        if (!desc)
                return NULL;

        spin_lock_init(&desc->bd_lock);
        init_waitqueue_head(&desc->bd_waitq);
        desc->bd_max_iov = npages;
        desc->bd_iov_count = 0;
        desc->bd_portal = portal;
        desc->bd_type = type;
        desc->bd_md_count = 0;
        LASSERT(max_brw > 0);
        desc->bd_md_max_brw = min(max_brw, PTLRPC_BULK_OPS_COUNT);
        /* PTLRPC_BULK_OPS_COUNT is the compile-time transfer limit for this
         * node. Negotiated ocd_brw_size will always be <= this number. */
        for (i = 0; i < PTLRPC_BULK_OPS_COUNT; i++)
                LNetInvalidateHandle(&desc->bd_mds[i]);

        return desc;
}

/**
 * Prepare bulk descriptor for specified outgoing request \a req that
 * can fit \a npages pages. \a type is the bulk type and \a portal is where
 * the bulk is to be sent. Used on the client side.
 * Returns pointer to newly allocated initialized bulk descriptor or NULL on
 * error.
 */
struct ptlrpc_bulk_desc *ptlrpc_prep_bulk_imp(struct ptlrpc_request *req,
                                              unsigned npages, unsigned max_brw,
                                              unsigned type, unsigned portal)
{
        struct obd_import *imp = req->rq_import;
        struct ptlrpc_bulk_desc *desc;

        LASSERT(type == BULK_PUT_SINK || type == BULK_GET_SOURCE);
        desc = ptlrpc_new_bulk(npages, max_brw, type, portal);
        if (desc == NULL)
                return NULL;

        desc->bd_import_generation = req->rq_import_generation;
        desc->bd_import = class_import_get(imp);
        desc->bd_req = req;

        desc->bd_cbid.cbid_fn  = client_bulk_callback;
        desc->bd_cbid.cbid_arg = desc;

        /* The request now owns desc and will free it when it is itself freed */
        req->rq_bulk = desc;

        return desc;
}
EXPORT_SYMBOL(ptlrpc_prep_bulk_imp);

/**
 * Add a page \a page to the bulk descriptor \a desc.
 * Data to transfer in the page starts at offset \a pageoffset and
 * the amount of data to transfer from the page is \a len.
 */
void __ptlrpc_prep_bulk_page(struct ptlrpc_bulk_desc *desc,
                             struct page *page, int pageoffset, int len, int pin)
{
        LASSERT(desc->bd_iov_count < desc->bd_max_iov);
        LASSERT(page != NULL);
        LASSERT(pageoffset >= 0);
        LASSERT(len > 0);
        LASSERT(pageoffset + len <= PAGE_CACHE_SIZE);

        desc->bd_nob += len;

        if (pin)
                page_cache_get(page);

        ptlrpc_add_bulk_page(desc, page, pageoffset, len);
}
EXPORT_SYMBOL(__ptlrpc_prep_bulk_page);
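
/*
 * Illustrative sketch (not part of the original file): a client-side bulk
 * read would be set up roughly as below, assuming \a req, \a pages and
 * \a npages come from the caller; ptlrpc_prep_bulk_page_pin() is the
 * pinning wrapper around __ptlrpc_prep_bulk_page():
 *
 *	desc = ptlrpc_prep_bulk_imp(req, npages, 1, BULK_PUT_SINK,
 *				    OST_BULK_PORTAL);
 *	if (desc == NULL)
 *		return -ENOMEM;
 *	for (i = 0; i < npages; i++)
 *		ptlrpc_prep_bulk_page_pin(desc, pages[i], 0,
 *					  PAGE_CACHE_SIZE);
 */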

/**
 * Uninitialize and free bulk descriptor \a desc.
 * Works on bulk descriptors both from server and client side.
 */
void __ptlrpc_free_bulk(struct ptlrpc_bulk_desc *desc, int unpin)
{
        int i;

        LASSERT(desc != NULL);
        LASSERT(desc->bd_iov_count != LI_POISON); /* not freed already */
        LASSERT(desc->bd_md_count == 0);         /* network hands off */
        LASSERT((desc->bd_export != NULL) ^ (desc->bd_import != NULL));

        sptlrpc_enc_pool_put_pages(desc);

        if (desc->bd_export)
                class_export_put(desc->bd_export);
        else
                class_import_put(desc->bd_import);

        if (unpin) {
                for (i = 0; i < desc->bd_iov_count; i++)
                        page_cache_release(desc->bd_iov[i].kiov_page);
        }

        OBD_FREE(desc, offsetof(struct ptlrpc_bulk_desc,
                                bd_iov[desc->bd_max_iov]));
}
EXPORT_SYMBOL(__ptlrpc_free_bulk);

/**
 * Set the server time limit for this req, i.e. how long we are willing
 * to wait for a reply before timing out this request.
 */
void ptlrpc_at_set_req_timeout(struct ptlrpc_request *req)
{
        __u32 serv_est;
        int idx;
        struct imp_at *at;

        LASSERT(req->rq_import);

        if (AT_OFF) {
                /* non-AT settings */
                /**
                 * \a imp_server_timeout means this is a reverse import and
                 * we send (currently only) ASTs to the client, so we cannot
                 * afford to wait too long for the reply, otherwise the other
                 * client (on whose behalf we are sending this request) would
                 * time out waiting for us
                 */
                req->rq_timeout = req->rq_import->imp_server_timeout ?
                                  obd_timeout / 2 : obd_timeout;
        } else {
                at = &req->rq_import->imp_at;
                idx = import_at_get_index(req->rq_import,
                                          req->rq_request_portal);
                serv_est = at_get(&at->iat_service_estimate[idx]);
                req->rq_timeout = at_est2timeout(serv_est);
        }
        /* We could get even fancier here, using history to predict increased
           loading... */

        /* Let the server know what this RPC timeout is by putting it in the
           reqmsg */
        lustre_msg_set_timeout(req->rq_reqmsg, req->rq_timeout);
}
EXPORT_SYMBOL(ptlrpc_at_set_req_timeout);
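
/*
 * Worked example (sketch, not in the original file): with adaptive
 * timeouts enabled and a current service estimate of 20s on this portal,
 * at_est2timeout() yields about 20 x 125% + 5 = 30s for rq_timeout (see
 * the "AT service time x 125% + 5s" note in after_reply() below).
 */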

/* Adjust max service estimate based on server value */
static void ptlrpc_at_adj_service(struct ptlrpc_request *req,
                                  unsigned int serv_est)
{
        int idx;
        unsigned int oldse;
        struct imp_at *at;

        LASSERT(req->rq_import);
        at = &req->rq_import->imp_at;

        idx = import_at_get_index(req->rq_import, req->rq_request_portal);
        /* max service estimates are tracked on the server side,
           so just keep minimal history here */
        oldse = at_measured(&at->iat_service_estimate[idx], serv_est);
        if (oldse != 0)
                CDEBUG(D_ADAPTTO, "The RPC service estimate for %s ptl %d has changed from %d to %d\n",
                       req->rq_import->imp_obd->obd_name, req->rq_request_portal,
                       oldse, at_get(&at->iat_service_estimate[idx]));
}

/* Expected network latency per remote node (secs) */
int ptlrpc_at_get_net_latency(struct ptlrpc_request *req)
{
        return AT_OFF ? 0 : at_get(&req->rq_import->imp_at.iat_net_latency);
}

/* Adjust expected network latency */
static void ptlrpc_at_adj_net_latency(struct ptlrpc_request *req,
                                      unsigned int service_time)
{
        unsigned int nl, oldnl;
        struct imp_at *at;
        time_t now = get_seconds();

        LASSERT(req->rq_import);

        if (service_time > now - req->rq_sent + 3) {
                /* bz16408: this can also happen if the early reply is lost
                 * and the client RPC expires and is resent; the early reply
                 * or the reply to the original RPC may still fit in the
                 * reply buffer of the resent RPC, so the client measures
                 * time from the resend while the server reports the service
                 * time of the original RPC.
                 */
                CDEBUG((lustre_msg_get_flags(req->rq_reqmsg) & MSG_RESENT) ?
                       D_ADAPTTO : D_WARNING,
                       "Reported service time %u > total measured time "
                       CFS_DURATION_T"\n", service_time,
                       cfs_time_sub(now, req->rq_sent));
                return;
        }

        /* Network latency is total time less server processing time */
        nl = max_t(int, now - req->rq_sent -
                        service_time, 0) + 1; /* st rounding */
        at = &req->rq_import->imp_at;

        oldnl = at_measured(&at->iat_net_latency, nl);
        if (oldnl != 0)
                CDEBUG(D_ADAPTTO, "The network latency for %s (nid %s) has changed from %d to %d\n",
                       req->rq_import->imp_obd->obd_name,
                       obd_uuid2str(
                               &req->rq_import->imp_connection->c_remote_uuid),
                       oldnl, at_get(&at->iat_net_latency));
}

static int unpack_reply(struct ptlrpc_request *req)
{
        int rc;

        if (SPTLRPC_FLVR_POLICY(req->rq_flvr.sf_rpc) != SPTLRPC_POLICY_NULL) {
                rc = ptlrpc_unpack_rep_msg(req, req->rq_replen);
                if (rc) {
                        DEBUG_REQ(D_ERROR, req, "unpack_rep failed: %d", rc);
                        return -EPROTO;
                }
        }

        rc = lustre_unpack_rep_ptlrpc_body(req, MSG_PTLRPC_BODY_OFF);
        if (rc) {
                DEBUG_REQ(D_ERROR, req, "unpack ptlrpc body failed: %d", rc);
                return -EPROTO;
        }
        return 0;
}

/**
 * Handle an early reply message, called with the rq_lock held.
 * If anything goes wrong just ignore it - same as if it never happened
 */
static int ptlrpc_at_recv_early_reply(struct ptlrpc_request *req)
{
        struct ptlrpc_request *early_req;
        time_t           olddl;
        int                 rc;

        req->rq_early = 0;
        spin_unlock(&req->rq_lock);

        rc = sptlrpc_cli_unwrap_early_reply(req, &early_req);
        if (rc) {
                spin_lock(&req->rq_lock);
                return rc;
        }

        rc = unpack_reply(early_req);
        if (rc == 0) {
                /* Expecting to increase the service time estimate here */
                ptlrpc_at_adj_service(req,
                        lustre_msg_get_timeout(early_req->rq_repmsg));
                ptlrpc_at_adj_net_latency(req,
                        lustre_msg_get_service_time(early_req->rq_repmsg));
        }

        sptlrpc_cli_finish_early_reply(early_req);

        if (rc != 0) {
                spin_lock(&req->rq_lock);
                return rc;
        }

        /* Adjust the local timeout for this req */
        ptlrpc_at_set_req_timeout(req);

        spin_lock(&req->rq_lock);
        olddl = req->rq_deadline;
        /* server assumes it now has rq_timeout from when it sent the
         * early reply, so client should give it at least that long. */
        req->rq_deadline = get_seconds() + req->rq_timeout +
                           ptlrpc_at_get_net_latency(req);

        DEBUG_REQ(D_ADAPTTO, req,
                  "Early reply #%d, new deadline in " CFS_DURATION_T "s (" CFS_DURATION_T "s)",
                  req->rq_early_count,
                  cfs_time_sub(req->rq_deadline, get_seconds()),
                  cfs_time_sub(req->rq_deadline, olddl));

        return rc;
}

struct kmem_cache *request_cache;

int ptlrpc_request_cache_init(void)
{
        request_cache = kmem_cache_create("ptlrpc_cache",
                                          sizeof(struct ptlrpc_request),
                                          0, SLAB_HWCACHE_ALIGN, NULL);
        return request_cache == NULL ? -ENOMEM : 0;
}

void ptlrpc_request_cache_fini(void)
{
        kmem_cache_destroy(request_cache);
}

struct ptlrpc_request *ptlrpc_request_cache_alloc(gfp_t flags)
{
        struct ptlrpc_request *req;

        OBD_SLAB_ALLOC_PTR_GFP(req, request_cache, flags);
        return req;
}

void ptlrpc_request_cache_free(struct ptlrpc_request *req)
{
        OBD_SLAB_FREE_PTR(req, request_cache);
}

/**
 * Wind down request pool \a pool.
 * Frees all requests from the pool too
 */
void ptlrpc_free_rq_pool(struct ptlrpc_request_pool *pool)
{
        struct list_head *l, *tmp;
        struct ptlrpc_request *req;

        LASSERT(pool != NULL);

        spin_lock(&pool->prp_lock);
        list_for_each_safe(l, tmp, &pool->prp_req_list) {
                req = list_entry(l, struct ptlrpc_request, rq_list);
                list_del(&req->rq_list);
                LASSERT(req->rq_reqbuf);
                LASSERT(req->rq_reqbuf_len == pool->prp_rq_size);
                OBD_FREE_LARGE(req->rq_reqbuf, pool->prp_rq_size);
                ptlrpc_request_cache_free(req);
        }
        spin_unlock(&pool->prp_lock);
        OBD_FREE(pool, sizeof(*pool));
}
EXPORT_SYMBOL(ptlrpc_free_rq_pool);

/**
 * Allocates, initializes and adds \a num_rq requests to the pool \a pool
 */
void ptlrpc_add_rqs_to_pool(struct ptlrpc_request_pool *pool, int num_rq)
{
        int i;
        int size = 1;

        while (size < pool->prp_rq_size)
                size <<= 1;

        LASSERTF(list_empty(&pool->prp_req_list) ||
                 size == pool->prp_rq_size,
                 "Trying to change pool size with nonempty pool from %d to %d bytes\n",
                 pool->prp_rq_size, size);

        spin_lock(&pool->prp_lock);
        pool->prp_rq_size = size;
        for (i = 0; i < num_rq; i++) {
                struct ptlrpc_request *req;
                struct lustre_msg *msg;

                spin_unlock(&pool->prp_lock);
                req = ptlrpc_request_cache_alloc(GFP_NOFS);
                if (!req)
                        return;
                OBD_ALLOC_LARGE(msg, size);
                if (!msg) {
                        ptlrpc_request_cache_free(req);
                        return;
                }
                req->rq_reqbuf = msg;
                req->rq_reqbuf_len = size;
                req->rq_pool = pool;
                spin_lock(&pool->prp_lock);
                list_add_tail(&req->rq_list, &pool->prp_req_list);
        }
        spin_unlock(&pool->prp_lock);
}
EXPORT_SYMBOL(ptlrpc_add_rqs_to_pool);

/**
 * Create and initialize new request pool with given attributes:
 * \a num_rq - initial number of requests to create for the pool
 * \a msgsize - maximum message size possible for requests in this pool
 * \a populate_pool - function to be called when more requests need to be added
 *                  to the pool
 * Returns pointer to newly created pool or NULL on error.
 */
struct ptlrpc_request_pool *
ptlrpc_init_rq_pool(int num_rq, int msgsize,
                    void (*populate_pool)(struct ptlrpc_request_pool *, int))
{
        struct ptlrpc_request_pool *pool;

        OBD_ALLOC(pool, sizeof(struct ptlrpc_request_pool));
        if (!pool)
                return NULL;

        /* Request the next power of two for the allocation, because the
           kernel would do exactly this internally anyway */

        spin_lock_init(&pool->prp_lock);
        INIT_LIST_HEAD(&pool->prp_req_list);
        pool->prp_rq_size = msgsize + SPTLRPC_MAX_PAYLOAD;
        pool->prp_populate = populate_pool;

        populate_pool(pool, num_rq);

        if (list_empty(&pool->prp_req_list)) {
                /* have not allocated a single request for the pool */
                OBD_FREE(pool, sizeof(struct ptlrpc_request_pool));
                pool = NULL;
        }
        return pool;
}
EXPORT_SYMBOL(ptlrpc_init_rq_pool);
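
/*
 * Illustrative sketch (not part of the original file): the OSC client
 * creates its emergency request pool roughly like this, reusing
 * ptlrpc_add_rqs_to_pool() above as the populate callback:
 *
 *	imp->imp_rq_pool = ptlrpc_init_rq_pool(0, OST_MAXREQSIZE,
 *					       ptlrpc_add_rqs_to_pool);
 */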

/**
 * Fetches one request from pool \a pool
 */
static struct ptlrpc_request *
ptlrpc_prep_req_from_pool(struct ptlrpc_request_pool *pool)
{
        struct ptlrpc_request *request;
        struct lustre_msg *reqbuf;

        if (!pool)
                return NULL;

        spin_lock(&pool->prp_lock);

        /* See if we have anything in the pool, and bail out if it is empty.
         * In the writeout path, where this matters, this is safe to do
         * because nothing is lost in this case, and when some in-flight
         * requests complete, this code will be called again. */
        if (unlikely(list_empty(&pool->prp_req_list))) {
                spin_unlock(&pool->prp_lock);
                return NULL;
        }

        request = list_entry(pool->prp_req_list.next, struct ptlrpc_request,
                                 rq_list);
        list_del_init(&request->rq_list);
        spin_unlock(&pool->prp_lock);

        LASSERT(request->rq_reqbuf);
        LASSERT(request->rq_pool);

        reqbuf = request->rq_reqbuf;
        memset(request, 0, sizeof(*request));
        request->rq_reqbuf = reqbuf;
        request->rq_reqbuf_len = pool->prp_rq_size;
        request->rq_pool = pool;

        return request;
}

/**
 * Returns request \a request to the pool it was taken from.
 */
static void __ptlrpc_free_req_to_pool(struct ptlrpc_request *request)
{
        struct ptlrpc_request_pool *pool = request->rq_pool;

        spin_lock(&pool->prp_lock);
        LASSERT(list_empty(&request->rq_list));
        LASSERT(!request->rq_receiving_reply);
        list_add_tail(&request->rq_list, &pool->prp_req_list);
        spin_unlock(&pool->prp_lock);
}

static int __ptlrpc_request_bufs_pack(struct ptlrpc_request *request,
                                      __u32 version, int opcode,
                                      int count, __u32 *lengths, char **bufs,
                                      struct ptlrpc_cli_ctx *ctx)
{
        struct obd_import  *imp = request->rq_import;
        int              rc;

        if (unlikely(ctx))
                request->rq_cli_ctx = sptlrpc_cli_ctx_get(ctx);
        else {
                rc = sptlrpc_req_get_ctx(request);
                if (rc)
                        goto out_free;
        }

        sptlrpc_req_set_flavor(request, opcode);

        rc = lustre_pack_request(request, imp->imp_msg_magic, count,
                                 lengths, bufs);
        if (rc) {
                LASSERT(!request->rq_pool);
                goto out_ctx;
        }

        lustre_msg_add_version(request->rq_reqmsg, version);
        request->rq_send_state = LUSTRE_IMP_FULL;
        request->rq_type = PTL_RPC_MSG_REQUEST;
        request->rq_export = NULL;

        request->rq_req_cbid.cbid_fn  = request_out_callback;
        request->rq_req_cbid.cbid_arg = request;

        request->rq_reply_cbid.cbid_fn  = reply_in_callback;
        request->rq_reply_cbid.cbid_arg = request;

        request->rq_reply_deadline = 0;
        request->rq_phase = RQ_PHASE_NEW;
        request->rq_next_phase = RQ_PHASE_UNDEFINED;

        request->rq_request_portal = imp->imp_client->cli_request_portal;
        request->rq_reply_portal = imp->imp_client->cli_reply_portal;

        ptlrpc_at_set_req_timeout(request);

        spin_lock_init(&request->rq_lock);
        INIT_LIST_HEAD(&request->rq_list);
        INIT_LIST_HEAD(&request->rq_timed_list);
        INIT_LIST_HEAD(&request->rq_replay_list);
        INIT_LIST_HEAD(&request->rq_ctx_chain);
        INIT_LIST_HEAD(&request->rq_set_chain);
        INIT_LIST_HEAD(&request->rq_history_list);
        INIT_LIST_HEAD(&request->rq_exp_list);
        init_waitqueue_head(&request->rq_reply_waitq);
        init_waitqueue_head(&request->rq_set_waitq);
        request->rq_xid = ptlrpc_next_xid();
        atomic_set(&request->rq_refcount, 1);

        lustre_msg_set_opc(request->rq_reqmsg, opcode);

        return 0;
out_ctx:
        sptlrpc_cli_ctx_put(request->rq_cli_ctx, 1);
out_free:
        class_import_put(imp);
        return rc;
}

int ptlrpc_request_bufs_pack(struct ptlrpc_request *request,
                             __u32 version, int opcode, char **bufs,
                             struct ptlrpc_cli_ctx *ctx)
{
        int count;

        count = req_capsule_filled_sizes(&request->rq_pill, RCL_CLIENT);
        return __ptlrpc_request_bufs_pack(request, version, opcode, count,
                                          request->rq_pill.rc_area[RCL_CLIENT],
                                          bufs, ctx);
}
EXPORT_SYMBOL(ptlrpc_request_bufs_pack);

/**
 * Pack request buffers for network transfer, performing any necessary
 * encryption steps.
 */
int ptlrpc_request_pack(struct ptlrpc_request *request,
                        __u32 version, int opcode)
{
        int rc;

        rc = ptlrpc_request_bufs_pack(request, version, opcode, NULL, NULL);
        if (rc)
                return rc;

        /* For some old 1.8 clients (< 1.8.7), they will LASSERT the size of
         * ptlrpc_body sent from server equal to local ptlrpc_body size, so we
         * have to send old ptlrpc_body to keep interoperability with these
         * clients.
         *
         * Only three kinds of server->client RPCs so far:
         *  - LDLM_BL_CALLBACK
         *  - LDLM_CP_CALLBACK
         *  - LDLM_GL_CALLBACK
         *
         * XXX This should be removed whenever we drop interoperability with
         *     these old clients.
         */
        if (opcode == LDLM_BL_CALLBACK || opcode == LDLM_CP_CALLBACK ||
            opcode == LDLM_GL_CALLBACK)
                req_capsule_shrink(&request->rq_pill, &RMF_PTLRPC_BODY,
                                   sizeof(struct ptlrpc_body_v2), RCL_CLIENT);

        return rc;
}
EXPORT_SYMBOL(ptlrpc_request_pack);

/**
 * Helper function to allocate new request on import \a imp
 * and possibly using existing request from pool \a pool if provided.
 * Returns allocated request structure with import field filled or
 * NULL on error.
 */
static inline
struct ptlrpc_request *__ptlrpc_request_alloc(struct obd_import *imp,
                                              struct ptlrpc_request_pool *pool)
{
        struct ptlrpc_request *request = NULL;

        if (pool)
                request = ptlrpc_prep_req_from_pool(pool);

        if (!request)
                request = ptlrpc_request_cache_alloc(GFP_NOFS);

        if (request) {
                LASSERTF((unsigned long)imp > 0x1000, "%p", imp);
                LASSERT(imp != LP_POISON);
                LASSERTF((unsigned long)imp->imp_client > 0x1000, "%p",
                        imp->imp_client);
                LASSERT(imp->imp_client != LP_POISON);

                request->rq_import = class_import_get(imp);
        } else {
                CERROR("request allocation out of memory\n");
        }

        return request;
}

/**
 * Helper function for creating a request.
 * Calls __ptlrpc_request_alloc to allocate new request structure and inits
 * buffer structures according to capsule template \a format.
 * Returns allocated request structure pointer or NULL on error.
 */
static struct ptlrpc_request *
ptlrpc_request_alloc_internal(struct obd_import *imp,
                              struct ptlrpc_request_pool *pool,
                              const struct req_format *format)
{
        struct ptlrpc_request *request;

        request = __ptlrpc_request_alloc(imp, pool);
        if (request == NULL)
                return NULL;

        req_capsule_init(&request->rq_pill, request, RCL_CLIENT);
        req_capsule_set(&request->rq_pill, format);
        return request;
}

/**
 * Allocate new request structure for import \a imp and initialize its
 * buffer structure according to capsule template \a format.
 */
struct ptlrpc_request *ptlrpc_request_alloc(struct obd_import *imp,
                                            const struct req_format *format)
{
        return ptlrpc_request_alloc_internal(imp, NULL, format);
}
EXPORT_SYMBOL(ptlrpc_request_alloc);

/**
 * Allocate new request structure for import \a imp from pool \a pool and
 * initialize its buffer structure according to capsule template \a format.
 */
struct ptlrpc_request *ptlrpc_request_alloc_pool(struct obd_import *imp,
                                            struct ptlrpc_request_pool *pool,
                                            const struct req_format *format)
{
        return ptlrpc_request_alloc_internal(imp, pool, format);
}
EXPORT_SYMBOL(ptlrpc_request_alloc_pool);

/**
 * For requests not obtained from a pool, free the request structure.
 * For requests obtained from a pool earlier, return the request back to
 * the pool.
 */
void ptlrpc_request_free(struct ptlrpc_request *request)
{
        if (request->rq_pool)
                __ptlrpc_free_req_to_pool(request);
        else
                ptlrpc_request_cache_free(request);
}
EXPORT_SYMBOL(ptlrpc_request_free);

/**
 * Allocate new request for operation \a opcode and immediately pack it for
 * network transfer.
 * Only used for simple requests like OBD_PING where the only important
 * part of the request is the operation itself.
 * Returns allocated request or NULL on error.
 */
struct ptlrpc_request *ptlrpc_request_alloc_pack(struct obd_import *imp,
                                                const struct req_format *format,
                                                __u32 version, int opcode)
{
        struct ptlrpc_request *req = ptlrpc_request_alloc(imp, format);
        int                 rc;

        if (req) {
                rc = ptlrpc_request_pack(req, version, opcode);
                if (rc) {
                        ptlrpc_request_free(req);
                        req = NULL;
                }
        }
        return req;
}
EXPORT_SYMBOL(ptlrpc_request_alloc_pack);
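
/*
 * Illustrative sketch (not part of the original file): OBD_PING is the
 * canonical user of ptlrpc_request_alloc_pack(); a synchronous ping looks
 * roughly like this (ptlrpc_queue_wait() is defined later in this file):
 *
 *	req = ptlrpc_request_alloc_pack(imp, &RQF_OBD_PING,
 *					LUSTRE_OBD_VERSION, OBD_PING);
 *	if (req == NULL)
 *		return -ENOMEM;
 *	rc = ptlrpc_queue_wait(req);
 *	ptlrpc_req_finished(req);
 */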

/**
 * Prepare request (fetched from pool \a pool if not NULL) on import \a imp
 * for operation \a opcode. The request will contain \a count buffers.
 * Sizes of the buffers are described in array \a lengths and the buffers
 * themselves are provided by the pointer \a bufs.
 * Returns prepared request structure pointer or NULL on error.
 */
struct ptlrpc_request *
ptlrpc_prep_req_pool(struct obd_import *imp,
                     __u32 version, int opcode,
                     int count, __u32 *lengths, char **bufs,
                     struct ptlrpc_request_pool *pool)
{
        struct ptlrpc_request *request;
        int                 rc;

        request = __ptlrpc_request_alloc(imp, pool);
        if (!request)
                return NULL;

        rc = __ptlrpc_request_bufs_pack(request, version, opcode, count,
                                        lengths, bufs, NULL);
        if (rc) {
                ptlrpc_request_free(request);
                request = NULL;
        }
        return request;
}
EXPORT_SYMBOL(ptlrpc_prep_req_pool);

/**
 * Same as ptlrpc_prep_req_pool, but without pool
 */
struct ptlrpc_request *
ptlrpc_prep_req(struct obd_import *imp, __u32 version, int opcode, int count,
                __u32 *lengths, char **bufs)
{
        return ptlrpc_prep_req_pool(imp, version, opcode, count, lengths, bufs,
                                    NULL);
}
EXPORT_SYMBOL(ptlrpc_prep_req);

/**
 * Allocate and initialize new request set structure.
 * Returns a pointer to the newly allocated set structure or NULL on error.
 */
struct ptlrpc_request_set *ptlrpc_prep_set(void)
{
        struct ptlrpc_request_set *set;

        OBD_ALLOC(set, sizeof(*set));
        if (!set)
                return NULL;
        atomic_set(&set->set_refcount, 1);
        INIT_LIST_HEAD(&set->set_requests);
        init_waitqueue_head(&set->set_waitq);
        atomic_set(&set->set_new_count, 0);
        atomic_set(&set->set_remaining, 0);
        spin_lock_init(&set->set_new_req_lock);
        INIT_LIST_HEAD(&set->set_new_requests);
        INIT_LIST_HEAD(&set->set_cblist);
        set->set_max_inflight = UINT_MAX;
        set->set_producer     = NULL;
        set->set_producer_arg = NULL;
        set->set_rc        = 0;

        return set;
}
EXPORT_SYMBOL(ptlrpc_prep_set);

/**
 * Allocate and initialize new request set structure with flow control
 * extension. This extension makes it possible to control the number of
 * requests in flight for the whole set. A callback function to generate
 * requests must be provided and the request set will cap the number of
 * requests sent over the wire at \a max.
 * Returns a pointer to the newly allocated set structure or NULL on error.
 */
struct ptlrpc_request_set *ptlrpc_prep_fcset(int max, set_producer_func func,
                                             void *arg)
{
        struct ptlrpc_request_set *set;

        set = ptlrpc_prep_set();
        if (!set)
                return NULL;

        set->set_max_inflight  = max;
        set->set_producer      = func;
        set->set_producer_arg  = arg;

        return set;
}
EXPORT_SYMBOL(ptlrpc_prep_fcset);
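
/*
 * Illustrative sketch (not part of the original file): a flow-controlled
 * set caps the RPCs in flight and pulls new requests from the producer as
 * slots free up; my_producer/my_arg below are hypothetical caller names,
 * and the producer returns -ENOENT when it has nothing more to generate
 * (see ptlrpc_set_producer() below):
 *
 *	set = ptlrpc_prep_fcset(max_rpcs_in_flight, my_producer, my_arg);
 *	if (set == NULL)
 *		return -ENOMEM;
 */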

/**
 * Wind down and free request set structure previously allocated with
 * ptlrpc_prep_set.
 * Ensures that all requests on the set have completed and removes
 * all requests from the request list in a set.
 * If any unsent requests happen to be on the list, pretends that they got
 * an error in flight and calls their completion handlers.
 */
void ptlrpc_set_destroy(struct ptlrpc_request_set *set)
{
        struct list_head       *tmp;
        struct list_head       *next;
        int            expected_phase;
        int            n = 0;

        /* Requests on the set should either all be completed, or all be new */
        expected_phase = (atomic_read(&set->set_remaining) == 0) ?
                         RQ_PHASE_COMPLETE : RQ_PHASE_NEW;
        list_for_each(tmp, &set->set_requests) {
                struct ptlrpc_request *req =
                        list_entry(tmp, struct ptlrpc_request,
                                       rq_set_chain);

                LASSERT(req->rq_phase == expected_phase);
                n++;
        }

        LASSERTF(atomic_read(&set->set_remaining) == 0 ||
                 atomic_read(&set->set_remaining) == n, "%d / %d\n",
                 atomic_read(&set->set_remaining), n);

        list_for_each_safe(tmp, next, &set->set_requests) {
                struct ptlrpc_request *req =
                        list_entry(tmp, struct ptlrpc_request,
                                       rq_set_chain);
                list_del_init(&req->rq_set_chain);

                LASSERT(req->rq_phase == expected_phase);

                if (req->rq_phase == RQ_PHASE_NEW) {
                        ptlrpc_req_interpret(NULL, req, -EBADR);
                        atomic_dec(&set->set_remaining);
                }

                spin_lock(&req->rq_lock);
                req->rq_set = NULL;
                req->rq_invalid_rqset = 0;
                spin_unlock(&req->rq_lock);

                ptlrpc_req_finished(req);
        }

        LASSERT(atomic_read(&set->set_remaining) == 0);

        ptlrpc_reqset_put(set);
}
EXPORT_SYMBOL(ptlrpc_set_destroy);

/**
 * Add a callback function \a fn to the set.
 * This function will be called when all requests on this set are completed.
 * The function will be passed the \a data argument.
 */
int ptlrpc_set_add_cb(struct ptlrpc_request_set *set,
                      set_interpreter_func fn, void *data)
{
        struct ptlrpc_set_cbdata *cbdata;

        OBD_ALLOC_PTR(cbdata);
        if (cbdata == NULL)
                return -ENOMEM;

        cbdata->psc_interpret = fn;
        cbdata->psc_data = data;
        list_add_tail(&cbdata->psc_item, &set->set_cblist);

        return 0;
}
EXPORT_SYMBOL(ptlrpc_set_add_cb);

/**
 * Add a new request to the general purpose request set.
 * Assumes request reference from the caller.
 */
void ptlrpc_set_add_req(struct ptlrpc_request_set *set,
                        struct ptlrpc_request *req)
{
        LASSERT(list_empty(&req->rq_set_chain));

        /* The set takes over the caller's request reference */
        list_add_tail(&req->rq_set_chain, &set->set_requests);
        req->rq_set = set;
        atomic_inc(&set->set_remaining);
        req->rq_queued_time = cfs_time_current();

        if (req->rq_reqmsg != NULL)
                lustre_msg_set_jobid(req->rq_reqmsg, NULL);

        if (set->set_producer != NULL)
                /* If the request set has a producer callback, the RPC must be
                 * sent straight away */
                ptlrpc_send_new_req(req);
}
EXPORT_SYMBOL(ptlrpc_set_add_req);
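
/*
 * Illustrative sketch (not part of the original file): the common batching
 * pattern with a general purpose set, using ptlrpc_set_wait() defined
 * later in this file; note the set takes over the request reference:
 *
 *	set = ptlrpc_prep_set();
 *	if (set == NULL)
 *		return -ENOMEM;
 *	ptlrpc_set_add_req(set, req);
 *	rc = ptlrpc_set_wait(set);
 *	ptlrpc_set_destroy(set);
 */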

/**
 * Add a request to a request set with a dedicated server thread
 * and wake the thread so it can do any necessary processing.
 * Currently only used for ptlrpcd.
 */
void ptlrpc_set_add_new_req(struct ptlrpcd_ctl *pc,
                           struct ptlrpc_request *req)
{
        struct ptlrpc_request_set *set = pc->pc_set;
        int count, i;

        LASSERT(req->rq_set == NULL);
        LASSERT(test_bit(LIOD_STOP, &pc->pc_flags) == 0);

        spin_lock(&set->set_new_req_lock);
        /*
         * The set takes over the caller's request reference.
         */
        req->rq_set = set;
        req->rq_queued_time = cfs_time_current();
        list_add_tail(&req->rq_set_chain, &set->set_new_requests);
        count = atomic_inc_return(&set->set_new_count);
        spin_unlock(&set->set_new_req_lock);

        /* Only need to call wakeup once for the first entry. */
        if (count == 1) {
                wake_up(&set->set_waitq);

                /* XXX: It may be unnecessary to wake up all the partners,
                 *      but to guarantee that async RPCs are processed ASAP
                 *      we have no better choice. This may be fixed in the
                 *      future. */
                for (i = 0; i < pc->pc_npartners; i++)
                        wake_up(&pc->pc_partners[i]->pc_set->set_waitq);
        }
}
EXPORT_SYMBOL(ptlrpc_set_add_new_req);

/**
 * Based on the current state of the import, determine if the request
 * can be sent, is an error, or should be delayed.
 *
 * Returns true if this request should be delayed. If false, and
 * *status is set, then the request cannot be sent and *status is the
 * error code. If false and *status is 0, then the request can be sent.
 *
 * The imp->imp_lock must be held.
 */
static int ptlrpc_import_delay_req(struct obd_import *imp,
                                   struct ptlrpc_request *req, int *status)
{
        int delay = 0;

        LASSERT(status != NULL);
        *status = 0;

        if (req->rq_ctx_init || req->rq_ctx_fini) {
                /* always allow ctx init/fini rpc go through */
        } else if (imp->imp_state == LUSTRE_IMP_NEW) {
                DEBUG_REQ(D_ERROR, req, "Uninitialized import.");
                *status = -EIO;
        } else if (imp->imp_state == LUSTRE_IMP_CLOSED) {
                /* pings may safely race with umount */
                DEBUG_REQ(lustre_msg_get_opc(req->rq_reqmsg) == OBD_PING ?
                          D_HA : D_ERROR, req, "IMP_CLOSED ");
                *status = -EIO;
        } else if (ptlrpc_send_limit_expired(req)) {
                /* probably doesn't need to be a D_ERROR after initial testing */
                DEBUG_REQ(D_ERROR, req, "send limit expired ");
                *status = -EIO;
        } else if (req->rq_send_state == LUSTRE_IMP_CONNECTING &&
                   imp->imp_state == LUSTRE_IMP_CONNECTING) {
                /* allow CONNECT even if import is invalid */
                if (atomic_read(&imp->imp_inval_count) != 0) {
                        DEBUG_REQ(D_ERROR, req, "invalidate in flight");
                        *status = -EIO;
                }
        } else if (imp->imp_invalid || imp->imp_obd->obd_no_recov) {
                if (!imp->imp_deactive)
                        DEBUG_REQ(D_NET, req, "IMP_INVALID");
                *status = -ESHUTDOWN; /* bz 12940 */
        } else if (req->rq_import_generation != imp->imp_generation) {
                DEBUG_REQ(D_ERROR, req, "req wrong generation:");
                *status = -EIO;
        } else if (req->rq_send_state != imp->imp_state) {
                /* invalidate in progress - any requests should be dropped */
                if (atomic_read(&imp->imp_inval_count) != 0) {
                        DEBUG_REQ(D_ERROR, req, "invalidate in flight");
                        *status = -EIO;
                } else if (imp->imp_dlm_fake || req->rq_no_delay) {
                        *status = -EWOULDBLOCK;
                } else if (req->rq_allow_replay &&
                          (imp->imp_state == LUSTRE_IMP_REPLAY ||
                           imp->imp_state == LUSTRE_IMP_REPLAY_LOCKS ||
                           imp->imp_state == LUSTRE_IMP_REPLAY_WAIT ||
                           imp->imp_state == LUSTRE_IMP_RECOVER)) {
                        DEBUG_REQ(D_HA, req, "allow during recovery.\n");
                } else {
                        delay = 1;
                }
        }

        return delay;
}

/**
 * Decide if the error message regarding provided request \a req
 * should be printed to the console or not.
 * Makes its decision based on request status and other properties.
 * Returns 1 to print error on the system console or 0 if not.
 */
static int ptlrpc_console_allow(struct ptlrpc_request *req)
{
        __u32 opc;
        int err;

        LASSERT(req->rq_reqmsg != NULL);
        opc = lustre_msg_get_opc(req->rq_reqmsg);

        /* Suppress particular reconnect errors which are to be expected.  No
         * errors are suppressed for the initial connection on an import */
        if ((lustre_handle_is_used(&req->rq_import->imp_remote_handle)) &&
            (opc == OST_CONNECT || opc == MDS_CONNECT || opc == MGS_CONNECT)) {

                /* Suppress timed out reconnect requests */
                if (req->rq_timedout)
                        return 0;

                /* Suppress unavailable/again reconnect requests */
                err = lustre_msg_get_status(req->rq_repmsg);
                if (err == -ENODEV || err == -EAGAIN)
                        return 0;
        }

        return 1;
}

/**
 * Check request processing status.
 * Returns the status.
 */
static int ptlrpc_check_status(struct ptlrpc_request *req)
{
        int err;

        err = lustre_msg_get_status(req->rq_repmsg);
        if (lustre_msg_get_type(req->rq_repmsg) == PTL_RPC_MSG_ERR) {
                struct obd_import *imp = req->rq_import;
                __u32 opc = lustre_msg_get_opc(req->rq_reqmsg);

                if (ptlrpc_console_allow(req))
                        LCONSOLE_ERROR_MSG(0x011, "%s: Communicating with %s, operation %s failed with %d.\n",
                                           imp->imp_obd->obd_name,
                                           libcfs_nid2str(
                                                   imp->imp_connection->c_peer.nid),
                                           ll_opcode2str(opc), err);
                return err < 0 ? err : -EINVAL;
        }

        if (err < 0) {
                DEBUG_REQ(D_INFO, req, "status is %d", err);
        } else if (err > 0) {
                /* XXX: translate this error from net to host */
                DEBUG_REQ(D_INFO, req, "status is %d", err);
        }

        return err;
}

/**
 * Save pre-versions of objects into request for replay.
 * Versions are obtained from the server reply.
 * Used for VBR (version-based recovery).
 */
static void ptlrpc_save_versions(struct ptlrpc_request *req)
{
        struct lustre_msg *repmsg = req->rq_repmsg;
        struct lustre_msg *reqmsg = req->rq_reqmsg;
        __u64 *versions = lustre_msg_get_versions(repmsg);

        if (lustre_msg_get_flags(req->rq_reqmsg) & MSG_REPLAY)
                return;

        LASSERT(versions);
        lustre_msg_set_versions(reqmsg, versions);
        CDEBUG(D_INFO, "Client save versions [%#llx/%#llx]\n",
               versions[0], versions[1]);
}

/**
 * Callback function called when client receives RPC reply for \a req.
 * Returns 0 on success or error code.
 * The return value would be assigned to req->rq_status by the caller
 * as request processing status.
 * This function also decides if the request needs to be saved for later replay.
 */
static int after_reply(struct ptlrpc_request *req)
{
        struct obd_import *imp = req->rq_import;
        struct obd_device *obd = req->rq_import->imp_obd;
        int rc;
        struct timeval work_start;
        long timediff;

        LASSERT(obd != NULL);
        /* repbuf must be unlinked */
        LASSERT(!req->rq_receiving_reply && !req->rq_reply_unlink);

        if (req->rq_reply_truncate) {
                if (ptlrpc_no_resend(req)) {
                        DEBUG_REQ(D_ERROR, req, "reply buffer overflow, expected: %d, actual size: %d",
                                  req->rq_nob_received, req->rq_repbuf_len);
                        return -EOVERFLOW;
                }

                sptlrpc_cli_free_repbuf(req);
                /* Pass the required reply buffer size (include
                 * space for early reply).
                 * NB: no need to round up because alloc_repbuf
                 * will round it up */
                req->rq_replen       = req->rq_nob_received;
                req->rq_nob_received = 0;
                spin_lock(&req->rq_lock);
                req->rq_resend       = 1;
                spin_unlock(&req->rq_lock);
                return 0;
        }

        /*
         * NB Until this point, the whole of the incoming message,
         * including buflens, status etc is in the sender's byte order.
         */
        rc = sptlrpc_cli_unwrap_reply(req);
        if (rc) {
                DEBUG_REQ(D_ERROR, req, "unwrap reply failed (%d):", rc);
                return rc;
        }

        /*
         * Security layer unwrap might ask to resend this request.
         */
        if (req->rq_resend)
                return 0;

        rc = unpack_reply(req);
        if (rc)
                return rc;

        /* retry indefinitely on EINPROGRESS */
        if (lustre_msg_get_status(req->rq_repmsg) == -EINPROGRESS &&
            ptlrpc_no_resend(req) == 0 && !req->rq_no_retry_einprogress) {
                time_t  now = get_seconds();

                DEBUG_REQ(D_RPCTRACE, req, "Resending request on EINPROGRESS");
                spin_lock(&req->rq_lock);
                req->rq_resend = 1;
                spin_unlock(&req->rq_lock);
                req->rq_nr_resend++;

                /* allocate new xid to avoid reply reconstruction */
                if (!req->rq_bulk) {
                        /* new xid is already allocated for bulk in
                         * ptlrpc_check_set() */
                        req->rq_xid = ptlrpc_next_xid();
                        DEBUG_REQ(D_RPCTRACE, req, "Allocating new xid for resend on EINPROGRESS");
                }

                /* Readjust the timeout for current conditions */
                ptlrpc_at_set_req_timeout(req);
                /* delay resend to give a chance to the server to get ready.
                 * The delay is increased by 1s on every resend and is capped to
                 * the current request timeout (i.e. obd_timeout if AT is off,
                 * or AT service time x 125% + 5s, see at_est2timeout) */
                if (req->rq_nr_resend > req->rq_timeout)
                        req->rq_sent = now + req->rq_timeout;
                else
                        req->rq_sent = now + req->rq_nr_resend;

                return 0;
        }

        do_gettimeofday(&work_start);
        timediff = cfs_timeval_sub(&work_start, &req->rq_arrival_time, NULL);
        if (obd->obd_svc_stats != NULL) {
                lprocfs_counter_add(obd->obd_svc_stats, PTLRPC_REQWAIT_CNTR,
                                    timediff);
                ptlrpc_lprocfs_rpc_sent(req, timediff);
        }

        if (lustre_msg_get_type(req->rq_repmsg) != PTL_RPC_MSG_REPLY &&
            lustre_msg_get_type(req->rq_repmsg) != PTL_RPC_MSG_ERR) {
                DEBUG_REQ(D_ERROR, req, "invalid packet received (type=%u)",
                          lustre_msg_get_type(req->rq_repmsg));
                return -EPROTO;
        }

        if (lustre_msg_get_opc(req->rq_reqmsg) != OBD_PING)
                CFS_FAIL_TIMEOUT(OBD_FAIL_PTLRPC_PAUSE_REP, cfs_fail_val);
        ptlrpc_at_adj_service(req, lustre_msg_get_timeout(req->rq_repmsg));
        ptlrpc_at_adj_net_latency(req,
                                  lustre_msg_get_service_time(req->rq_repmsg));

        rc = ptlrpc_check_status(req);
        imp->imp_connect_error = rc;

        if (rc) {
                /*
                 * Either we've been evicted, or the server has failed for
                 * some reason. Try to reconnect, and if that fails, punt to
                 * the upcall.
                 */
                if (ll_rpc_recoverable_error(rc)) {
                        if (req->rq_send_state != LUSTRE_IMP_FULL ||
                            imp->imp_obd->obd_no_recov || imp->imp_dlm_fake) {
                                return rc;
                        }
                        ptlrpc_request_handle_notconn(req);
                        return rc;
                }
        } else {
                /*
                 * Let's see if the server sent an slv. Do it only for RPCs
                 * with rc == 0.
                 */
                ldlm_cli_update_pool(req);
        }

        /*
         * Store transno in reqmsg for replay.
         */
        if (!(lustre_msg_get_flags(req->rq_reqmsg) & MSG_REPLAY)) {
                req->rq_transno = lustre_msg_get_transno(req->rq_repmsg);
                lustre_msg_set_transno(req->rq_reqmsg, req->rq_transno);
        }

        if (imp->imp_replayable) {
                spin_lock(&imp->imp_lock);
                /*
                 * No point in adding already-committed requests to the replay
                 * list, we will just remove them immediately. b=9829
                 */
                if (req->rq_transno != 0 &&
                    (req->rq_transno >
                     lustre_msg_get_last_committed(req->rq_repmsg) ||
                     req->rq_replay)) {
                        /** version recovery */
                        ptlrpc_save_versions(req);
                        ptlrpc_retain_replayable_request(req, imp);
                } else if (req->rq_commit_cb != NULL &&
                           list_empty(&req->rq_replay_list)) {
                        /* NB: don't call rq_commit_cb if it's already on
                         * rq_replay_list, ptlrpc_free_committed() will call
                         * it later, see LU-3618 for details */
                        spin_unlock(&imp->imp_lock);
                        req->rq_commit_cb(req);
                        spin_lock(&imp->imp_lock);
                }

                /*
                 * Replay-enabled imports return commit-status information.
                 */
                if (lustre_msg_get_last_committed(req->rq_repmsg)) {
                        imp->imp_peer_committed_transno =
                                lustre_msg_get_last_committed(req->rq_repmsg);
                }

                ptlrpc_free_committed(imp);

                if (!list_empty(&imp->imp_replay_list)) {
                        struct ptlrpc_request *last;

                        last = list_entry(imp->imp_replay_list.prev,
                                              struct ptlrpc_request,
                                              rq_replay_list);
                        /*
                         * Requests with rq_replay stay on the list even if no
                         * commit is expected.
                         */
                        if (last->rq_transno > imp->imp_peer_committed_transno)
                                ptlrpc_pinger_commit_expected(imp);
                }

                spin_unlock(&imp->imp_lock);
        }

        return rc;
}

/**
 * Helper function to send request \a req over the network for the first
 * time. Also adjusts the request phase.
 * Returns 0 on success or error code.
 */
static int ptlrpc_send_new_req(struct ptlrpc_request *req)
{
        struct obd_import     *imp = req->rq_import;
        int rc;

        LASSERT(req->rq_phase == RQ_PHASE_NEW);
        if (req->rq_sent && (req->rq_sent > get_seconds()) &&
            (!req->rq_generation_set ||
             req->rq_import_generation == imp->imp_generation))
                return 0;

        ptlrpc_rqphase_move(req, RQ_PHASE_RPC);

        spin_lock(&imp->imp_lock);

        if (!req->rq_generation_set)
                req->rq_import_generation = imp->imp_generation;

        if (ptlrpc_import_delay_req(imp, req, &rc)) {
                spin_lock(&req->rq_lock);
                req->rq_waiting = 1;
                spin_unlock(&req->rq_lock);

                DEBUG_REQ(D_HA, req, "req from PID %d waiting for recovery: (%s != %s)",
                          lustre_msg_get_status(req->rq_reqmsg),
                          ptlrpc_import_state_name(req->rq_send_state),
                          ptlrpc_import_state_name(imp->imp_state));
                LASSERT(list_empty(&req->rq_list));
                list_add_tail(&req->rq_list, &imp->imp_delayed_list);
                atomic_inc(&req->rq_import->imp_inflight);
                spin_unlock(&imp->imp_lock);
                return 0;
        }

        if (rc != 0) {
                spin_unlock(&imp->imp_lock);
                req->rq_status = rc;
                ptlrpc_rqphase_move(req, RQ_PHASE_INTERPRET);
                return rc;
        }

        LASSERT(list_empty(&req->rq_list));
        list_add_tail(&req->rq_list, &imp->imp_sending_list);
        atomic_inc(&req->rq_import->imp_inflight);
        spin_unlock(&imp->imp_lock);

        lustre_msg_set_status(req->rq_reqmsg, current_pid());

        rc = sptlrpc_req_refresh_ctx(req, -1);
        if (rc) {
                if (req->rq_err) {
                        req->rq_status = rc;
                        return 1;
                }
                spin_lock(&req->rq_lock);
                req->rq_wait_ctx = 1;
                spin_unlock(&req->rq_lock);
                return 0;
        }

        CDEBUG(D_RPCTRACE, "Sending RPC pname:cluuid:pid:xid:nid:opc %s:%s:%d:%llu:%s:%d\n",
               current_comm(),
               imp->imp_obd->obd_uuid.uuid,
               lustre_msg_get_status(req->rq_reqmsg), req->rq_xid,
               libcfs_nid2str(imp->imp_connection->c_peer.nid),
               lustre_msg_get_opc(req->rq_reqmsg));

        rc = ptl_send_rpc(req, 0);
        if (rc) {
                DEBUG_REQ(D_HA, req, "send failed (%d); expect timeout", rc);
                spin_lock(&req->rq_lock);
                req->rq_net_err = 1;
                spin_unlock(&req->rq_lock);
                return rc;
        }
        return 0;
}
1478
1479 static inline int ptlrpc_set_producer(struct ptlrpc_request_set *set)
1480 {
1481         int remaining, rc;
1482
1483         LASSERT(set->set_producer != NULL);
1484
1485         remaining = atomic_read(&set->set_remaining);
1486
1487         /* populate the ->set_requests list with requests until we
1488          * reach the maximum number of RPCs in flight for this set */
1489         while (atomic_read(&set->set_remaining) < set->set_max_inflight) {
1490                 rc = set->set_producer(set, set->set_producer_arg);
1491                 if (rc == -ENOENT) {
1492                         /* no more RPCs to produce */
1493                         set->set_producer     = NULL;
1494                         set->set_producer_arg = NULL;
1495                         return 0;
1496                 }
1497         }
1498
1499         return (atomic_read(&set->set_remaining) - remaining);
1500 }
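
/*
 * Usage note (illustrative sketch, not part of the original file): a
 * producer callback is expected to build one request, add it to the set
 * (which increments set_remaining, e.g. via ptlrpc_set_add_req()), and
 * return -ENOENT once there is nothing left to produce.  The helper
 * my_build_one_req() below is hypothetical.
 *
 *	static int my_producer(struct ptlrpc_request_set *set, void *arg)
 *	{
 *		struct ptlrpc_request *req = my_build_one_req(arg);
 *
 *		if (req == NULL)
 *			return -ENOENT;	// no more RPCs to produce
 *		ptlrpc_set_add_req(set, req);
 *		return 0;
 *	}
 */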
1501
1502 /**
1503  * This sends any unsent RPCs in \a set and returns 1 if all are sent
1504  * and no more replies are expected.
1505  * (It is possible to get fewer replies than requests sent, e.g. due to
1506  * timed-out requests or requests that we had trouble sending out.)
1507  *
1508  * NOTE: This function contains a potential schedule point (cond_resched()).
1509  */
1510 int ptlrpc_check_set(const struct lu_env *env, struct ptlrpc_request_set *set)
1511 {
1512         struct list_head *tmp, *next;
1513         struct list_head comp_reqs;
1514         int force_timer_recalc = 0;
1515
1516         if (atomic_read(&set->set_remaining) == 0)
1517                 return 1;
1518
1519         INIT_LIST_HEAD(&comp_reqs);
1520         list_for_each_safe(tmp, next, &set->set_requests) {
1521                 struct ptlrpc_request *req =
1522                         list_entry(tmp, struct ptlrpc_request,
1523                                        rq_set_chain);
1524                 struct obd_import *imp = req->rq_import;
1525                 int unregistered = 0;
1526                 int rc = 0;
1527
1528                 /* This schedule point is mainly for the ptlrpcd caller of this
1529                  * function.  Most ptlrpc sets are not long-lived and unbounded
1530                  * in length, but at the least the set used by the ptlrpcd is.
1531                  * Since the processing time is unbounded, we need to insert an
1532                  * explicit schedule point to make the thread well-behaved.
1533                  */
1534                 cond_resched();
1535
1536                 if (req->rq_phase == RQ_PHASE_NEW &&
1537                     ptlrpc_send_new_req(req)) {
1538                         force_timer_recalc = 1;
1539                 }
1540
1541                 /* delayed send - skip */
1542                 if (req->rq_phase == RQ_PHASE_NEW && req->rq_sent)
1543                         continue;
1544
1545                 /* delayed resend - skip */
1546                 if (req->rq_phase == RQ_PHASE_RPC && req->rq_resend &&
1547                     req->rq_sent > get_seconds())
1548                         continue;
1549
1550                 if (!(req->rq_phase == RQ_PHASE_RPC ||
1551                       req->rq_phase == RQ_PHASE_BULK ||
1552                       req->rq_phase == RQ_PHASE_INTERPRET ||
1553                       req->rq_phase == RQ_PHASE_UNREGISTERING ||
1554                       req->rq_phase == RQ_PHASE_COMPLETE)) {
1555                         DEBUG_REQ(D_ERROR, req, "bad phase %x", req->rq_phase);
1556                         LBUG();
1557                 }
1558
1559                 if (req->rq_phase == RQ_PHASE_UNREGISTERING) {
1560                         LASSERT(req->rq_next_phase != req->rq_phase);
1561                         LASSERT(req->rq_next_phase != RQ_PHASE_UNDEFINED);
1562
1563                         /*
1564                          * Skip processing until reply is unlinked. We
1565                          * can't return to pool before that and we can't
1566                          * call interpret before that. We need to make
1567                          * sure that all rdma transfers finished and will
1568                          * not corrupt any data.
1569                          */
1570                         if (ptlrpc_client_recv_or_unlink(req) ||
1571                             ptlrpc_client_bulk_active(req))
1572                                 continue;
1573
1574                         /*
1575                          * Turn fail_loc off to prevent it from looping
1576                          * forever.
1577                          */
1578                         if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_LONG_REPL_UNLINK)) {
1579                                 OBD_FAIL_CHECK_ORSET(OBD_FAIL_PTLRPC_LONG_REPL_UNLINK,
1580                                                      OBD_FAIL_ONCE);
1581                         }
1582                         if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_LONG_BULK_UNLINK)) {
1583                                 OBD_FAIL_CHECK_ORSET(OBD_FAIL_PTLRPC_LONG_BULK_UNLINK,
1584                                                      OBD_FAIL_ONCE);
1585                         }
1586
1587                         /*
1588                          * Move to next phase if reply was successfully
1589                          * unlinked.
1590                          */
1591                         ptlrpc_rqphase_move(req, req->rq_next_phase);
1592                 }
1593
1594                 if (req->rq_phase == RQ_PHASE_COMPLETE) {
1595                         list_move_tail(&req->rq_set_chain, &comp_reqs);
1596                         continue;
1597                 }
1598
1599                 if (req->rq_phase == RQ_PHASE_INTERPRET)
1600                         goto interpret;
1601
1602                 /*
1603                  * Note that this also will start async reply unlink.
1604                  */
1605                 if (req->rq_net_err && !req->rq_timedout) {
1606                         ptlrpc_expire_one_request(req, 1);
1607
1608                         /*
1609                          * Check if we still need to wait for unlink.
1610                          */
1611                         if (ptlrpc_client_recv_or_unlink(req) ||
1612                             ptlrpc_client_bulk_active(req))
1613                                 continue;
1614                         /* If there is no need to resend, fail it now. */
1615                         if (req->rq_no_resend) {
1616                                 if (req->rq_status == 0)
1617                                         req->rq_status = -EIO;
1618                                 ptlrpc_rqphase_move(req, RQ_PHASE_INTERPRET);
1619                                 goto interpret;
1620                         } else {
1621                                 continue;
1622                         }
1623                 }
1624
1625                 if (req->rq_err) {
1626                         spin_lock(&req->rq_lock);
1627                         req->rq_replied = 0;
1628                         spin_unlock(&req->rq_lock);
1629                         if (req->rq_status == 0)
1630                                 req->rq_status = -EIO;
1631                         ptlrpc_rqphase_move(req, RQ_PHASE_INTERPRET);
1632                         goto interpret;
1633                 }
1634
1635                 /* ptlrpc_set_wait->l_wait_event sets lwi_allow_intr
1636                  * so it sets rq_intr regardless of individual rpc
1637                  * timeouts. The synchronous IO waiting path sets
1638                  * rq_intr irrespective of whether ptlrpcd
1639                  * has seen a timeout.  Our policy is to only interpret
1640                  * interrupted rpcs after they have timed out, so we
1641                  * need to enforce that here.
1642                  */
1643
1644                 if (req->rq_intr && (req->rq_timedout || req->rq_waiting ||
1645                                      req->rq_wait_ctx)) {
1646                         req->rq_status = -EINTR;
1647                         ptlrpc_rqphase_move(req, RQ_PHASE_INTERPRET);
1648                         goto interpret;
1649                 }
1650
1651                 if (req->rq_phase == RQ_PHASE_RPC) {
1652                         if (req->rq_timedout || req->rq_resend ||
1653                             req->rq_waiting || req->rq_wait_ctx) {
1654                                 int status;
1655
1656                                 if (!ptlrpc_unregister_reply(req, 1))
1657                                         continue;
1658
1659                                 spin_lock(&imp->imp_lock);
1660                                 if (ptlrpc_import_delay_req(imp, req,
1661                                                             &status)) {
1662                                         /* put on the delayed list - only
1663                                          * while we wait for recovery to
1664                                          * finish - before sending */
1665                                         list_del_init(&req->rq_list);
1666                                         list_add_tail(&req->rq_list,
1667                                                       &imp->imp_delayed_list);
1668                                         spin_unlock(&imp->imp_lock);
1669                                         continue;
1670                                 }
1671
1672                                 if (status != 0)  {
1673                                         req->rq_status = status;
1674                                         ptlrpc_rqphase_move(req,
1675                                                 RQ_PHASE_INTERPRET);
1676                                         spin_unlock(&imp->imp_lock);
1677                                         goto interpret;
1678                                 }
1679                                 if (ptlrpc_no_resend(req) &&
1680                                     !req->rq_wait_ctx) {
1681                                         req->rq_status = -ENOTCONN;
1682                                         ptlrpc_rqphase_move(req,
1683                                                             RQ_PHASE_INTERPRET);
1684                                         spin_unlock(&imp->imp_lock);
1685                                         goto interpret;
1686                                 }
1687
1688                                 list_del_init(&req->rq_list);
1689                                 list_add_tail(&req->rq_list,
1690                                                   &imp->imp_sending_list);
1691
1692                                 spin_unlock(&imp->imp_lock);
1693
1694                                 spin_lock(&req->rq_lock);
1695                                 req->rq_waiting = 0;
1696                                 spin_unlock(&req->rq_lock);
1697
1698                                 if (req->rq_timedout || req->rq_resend) {
1699                                         /* This is being re-sent anyway,
1700                                          * so mark the req for resend. */
1701                                         spin_lock(&req->rq_lock);
1702                                         req->rq_resend = 1;
1703                                         spin_unlock(&req->rq_lock);
1704                                         if (req->rq_bulk) {
1705                                                 __u64 old_xid;
1706
1707                                                 if (!ptlrpc_unregister_bulk(req, 1))
1708                                                         continue;
1709
1710                                                 /* ensure previous bulk fails */
1711                                                 old_xid = req->rq_xid;
1712                                                 req->rq_xid = ptlrpc_next_xid();
1713                                                 CDEBUG(D_HA, "resend bulk old x%llu new x%llu\n",
1714                                                        old_xid, req->rq_xid);
1715                                         }
1716                                 }
1717                                 /*
1718                                  * rq_wait_ctx is only touched by ptlrpcd,
1719                                  * so no lock is needed here.
1720                                  */
1721                                 status = sptlrpc_req_refresh_ctx(req, -1);
1722                                 if (status) {
1723                                         if (req->rq_err) {
1724                                                 req->rq_status = status;
1725                                                 spin_lock(&req->rq_lock);
1726                                                 req->rq_wait_ctx = 0;
1727                                                 spin_unlock(&req->rq_lock);
1728                                                 force_timer_recalc = 1;
1729                                         } else {
1730                                                 spin_lock(&req->rq_lock);
1731                                                 req->rq_wait_ctx = 1;
1732                                                 spin_unlock(&req->rq_lock);
1733                                         }
1734
1735                                         continue;
1736                                 } else {
1737                                         spin_lock(&req->rq_lock);
1738                                         req->rq_wait_ctx = 0;
1739                                         spin_unlock(&req->rq_lock);
1740                                 }
1741
1742                                 rc = ptl_send_rpc(req, 0);
1743                                 if (rc) {
1744                                         DEBUG_REQ(D_HA, req,
1745                                                   "send failed: rc = %d", rc);
1746                                         force_timer_recalc = 1;
1747                                         spin_lock(&req->rq_lock);
1748                                         req->rq_net_err = 1;
1749                                         spin_unlock(&req->rq_lock);
1750                                         continue;
1751                                 }
1752                                 /* need to reset the timeout */
1753                                 force_timer_recalc = 1;
1754                         }
1755
1756                         spin_lock(&req->rq_lock);
1757
1758                         if (ptlrpc_client_early(req)) {
1759                                 ptlrpc_at_recv_early_reply(req);
1760                                 spin_unlock(&req->rq_lock);
1761                                 continue;
1762                         }
1763
1764                         /* Still waiting for a reply? */
1765                         if (ptlrpc_client_recv(req)) {
1766                                 spin_unlock(&req->rq_lock);
1767                                 continue;
1768                         }
1769
1770                         /* Did we actually receive a reply? */
1771                         if (!ptlrpc_client_replied(req)) {
1772                                 spin_unlock(&req->rq_lock);
1773                                 continue;
1774                         }
1775
1776                         spin_unlock(&req->rq_lock);
1777
1778                         /* unlink from the net because we are going to
1779                          * swab the reply buffer in place */
1780                         unregistered = ptlrpc_unregister_reply(req, 1);
1781                         if (!unregistered)
1782                                 continue;
1783
1784                         req->rq_status = after_reply(req);
1785                         if (req->rq_resend)
1786                                 continue;
1787
1788                         /* If there is no bulk associated with this request,
1789                          * then we're done and should let the interpreter
1790                          * process the reply. Similarly if the RPC returned
1791                          * an error, and therefore the bulk will never arrive.
1792                          */
1793                         if (req->rq_bulk == NULL || req->rq_status < 0) {
1794                                 ptlrpc_rqphase_move(req, RQ_PHASE_INTERPRET);
1795                                 goto interpret;
1796                         }
1797
1798                         ptlrpc_rqphase_move(req, RQ_PHASE_BULK);
1799                 }
1800
1801                 LASSERT(req->rq_phase == RQ_PHASE_BULK);
1802                 if (ptlrpc_client_bulk_active(req))
1803                         continue;
1804
1805                 if (req->rq_bulk->bd_failure) {
1806                         /* The RPC reply arrived OK, but the bulk transfer
1807                          * failed!  Quite weird, since the server told us
1808                          * the RPC succeeded after getting the REPLY for
1809                          * its GET or the ACK for its PUT. */
1810                         DEBUG_REQ(D_ERROR, req, "bulk transfer failed");
1811                         req->rq_status = -EIO;
1812                 }
1813
1814                 ptlrpc_rqphase_move(req, RQ_PHASE_INTERPRET);
1815
1816 interpret:
1817                 LASSERT(req->rq_phase == RQ_PHASE_INTERPRET);
1818
1819                 /* This moves to the "unregistering" phase if we need to
1820                  * wait for the reply to be unlinked. */
1821                 if (!unregistered && !ptlrpc_unregister_reply(req, 1)) {
1822                         /* start async bulk unlink too */
1823                         ptlrpc_unregister_bulk(req, 1);
1824                         continue;
1825                 }
1826
1827                 if (!ptlrpc_unregister_bulk(req, 1))
1828                         continue;
1829
1830                 /* Receiving should already be finished when interpret
1831                  * is called. */
1832                 LASSERT(!req->rq_receiving_reply);
1833
1834                 ptlrpc_req_interpret(env, req, req->rq_status);
1835
1836                 if (ptlrpcd_check_work(req)) {
1837                         atomic_dec(&set->set_remaining);
1838                         continue;
1839                 }
1840                 ptlrpc_rqphase_move(req, RQ_PHASE_COMPLETE);
1841
1842                 CDEBUG(req->rq_reqmsg != NULL ? D_RPCTRACE : 0,
1843                        "Completed RPC pname:cluuid:pid:xid:nid:opc %s:%s:%d:%llu:%s:%d\n",
1844                        current_comm(), imp->imp_obd->obd_uuid.uuid,
1845                        lustre_msg_get_status(req->rq_reqmsg), req->rq_xid,
1846                        libcfs_nid2str(imp->imp_connection->c_peer.nid),
1847                        lustre_msg_get_opc(req->rq_reqmsg));
1848
1849                 spin_lock(&imp->imp_lock);
1850                 /* The request may no longer be on the sending or delayed
1851                  * list. This can happen when it is marked erroneous because
1852                  * ptlrpc_import_delay_req(req, status) found it impossible
1853                  * to allow sending this RPC and returned *status != 0. */
1854                 if (!list_empty(&req->rq_list)) {
1855                         list_del_init(&req->rq_list);
1856                         atomic_dec(&imp->imp_inflight);
1857                 }
1858                 spin_unlock(&imp->imp_lock);
1859
1860                 atomic_dec(&set->set_remaining);
1861                 wake_up_all(&imp->imp_recovery_waitq);
1862
1863                 if (set->set_producer) {
1864                         /* produce a new request if possible */
1865                         if (ptlrpc_set_producer(set) > 0)
1866                                 force_timer_recalc = 1;
1867
1868                         /* free the request that has just been completed
1869                          * in order not to pollute set->set_requests */
1870                         list_del_init(&req->rq_set_chain);
1871                         spin_lock(&req->rq_lock);
1872                         req->rq_set = NULL;
1873                         req->rq_invalid_rqset = 0;
1874                         spin_unlock(&req->rq_lock);
1875
1876                         /* record rq_status to compute the final status later */
1877                         if (req->rq_status != 0)
1878                                 set->set_rc = req->rq_status;
1879                         ptlrpc_req_finished(req);
1880                 } else {
1881                         list_move_tail(&req->rq_set_chain, &comp_reqs);
1882                 }
1883         }
1884
1885         /* move completed requests to the head of the list so it's easier
1886          * for the caller to find them */
1887         list_splice(&comp_reqs, &set->set_requests);
1888
1889         /* If we hit an error, we want to recover promptly. */
1890         return atomic_read(&set->set_remaining) == 0 || force_timer_recalc;
1891 }
1892 EXPORT_SYMBOL(ptlrpc_check_set);
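
/*
 * Usage note (illustrative sketch, not part of the original file): callers
 * typically poll ptlrpc_check_set() until it reports that no replies remain
 * outstanding, sleeping on set->set_waitq in between:
 *
 *	while (!ptlrpc_check_set(NULL, set))
 *		wait_event_timeout(set->set_waitq,
 *				   ptlrpc_check_set(NULL, set),
 *				   cfs_time_seconds(1));
 *
 * ptlrpc_set_wait() below implements the complete version of this pattern,
 * including timeout recalculation and signal handling.
 */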
1893
1894 /**
1895  * Time out request \a req. If \a async_unlink is set, do not wait
1896  * until LNet actually confirms network buffer unlinking.
1897  * Return 1 if we should give up further retrying attempts or 0 otherwise.
1898  */
1899 int ptlrpc_expire_one_request(struct ptlrpc_request *req, int async_unlink)
1900 {
1901         struct obd_import *imp = req->rq_import;
1902         int rc = 0;
1903
1904         spin_lock(&req->rq_lock);
1905         req->rq_timedout = 1;
1906         spin_unlock(&req->rq_lock);
1907
1908         DEBUG_REQ(D_WARNING, req, "Request sent has %s: [sent "CFS_DURATION_T
1909                   "/real "CFS_DURATION_T"]",
1910                   req->rq_net_err ? "failed due to network error" :
1911                      ((req->rq_real_sent == 0 ||
1912                        time_before((unsigned long)req->rq_real_sent, (unsigned long)req->rq_sent) ||
1913                        cfs_time_aftereq(req->rq_real_sent, req->rq_deadline)) ?
1914                       "timed out for sent delay" : "timed out for slow reply"),
1915                   req->rq_sent, req->rq_real_sent);
1916
1917         if (imp != NULL && obd_debug_peer_on_timeout)
1918                 LNetCtl(IOC_LIBCFS_DEBUG_PEER, &imp->imp_connection->c_peer);
1919
1920         ptlrpc_unregister_reply(req, async_unlink);
1921         ptlrpc_unregister_bulk(req, async_unlink);
1922
1923         if (obd_dump_on_timeout)
1924                 libcfs_debug_dumplog();
1925
1926         if (imp == NULL) {
1927                 DEBUG_REQ(D_HA, req, "NULL import: already cleaned up?");
1928                 return 1;
1929         }
1930
1931         atomic_inc(&imp->imp_timeouts);
1932
1933         /* The DLM server doesn't want recovery run on its imports. */
1934         if (imp->imp_dlm_fake)
1935                 return 1;
1936
1937         /* If this request is for recovery or other primordial tasks,
1938          * then error it out here. */
1939         if (req->rq_ctx_init || req->rq_ctx_fini ||
1940             req->rq_send_state != LUSTRE_IMP_FULL ||
1941             imp->imp_obd->obd_no_recov) {
1942                 DEBUG_REQ(D_RPCTRACE, req, "err -110, sent_state=%s (now=%s)",
1943                           ptlrpc_import_state_name(req->rq_send_state),
1944                           ptlrpc_import_state_name(imp->imp_state));
1945                 spin_lock(&req->rq_lock);
1946                 req->rq_status = -ETIMEDOUT;
1947                 req->rq_err = 1;
1948                 spin_unlock(&req->rq_lock);
1949                 return 1;
1950         }
1951
1952         /* if a request can't be resent we can't wait for an answer after
1953            the timeout */
1954         if (ptlrpc_no_resend(req)) {
1955                 DEBUG_REQ(D_RPCTRACE, req, "TIMEOUT-NORESEND:");
1956                 rc = 1;
1957         }
1958
1959         ptlrpc_fail_import(imp, lustre_msg_get_conn_cnt(req->rq_reqmsg));
1960
1961         return rc;
1962 }
1963
1964 /**
1965  * Time out all uncompleted requests in the request set pointed to by \a data.
1966  * Callback used when waiting on sets with l_wait_event.
1967  * Always returns 1.
1968  */
1969 int ptlrpc_expired_set(void *data)
1970 {
1971         struct ptlrpc_request_set *set = data;
1972         struct list_head                *tmp;
1973         time_t               now = get_seconds();
1974
1975         LASSERT(set != NULL);
1976
1977         /*
1978          * A timeout expired. See which reqs it applies to...
1979          */
1980         list_for_each(tmp, &set->set_requests) {
1981                 struct ptlrpc_request *req =
1982                         list_entry(tmp, struct ptlrpc_request,
1983                                        rq_set_chain);
1984
1985                 /* don't expire request waiting for context */
1986                 if (req->rq_wait_ctx)
1987                         continue;
1988
1989                 /* Request in-flight? */
1990                 if (!((req->rq_phase == RQ_PHASE_RPC &&
1991                        !req->rq_waiting && !req->rq_resend) ||
1992                       (req->rq_phase == RQ_PHASE_BULK)))
1993                         continue;
1994
1995                 if (req->rq_timedout ||     /* already dealt with */
1996                     req->rq_deadline > now) /* not expired */
1997                         continue;
1998
1999                 /* Deal with this guy. Do it asynchronously so as not to
2000                  * block the ptlrpcd thread. */
2001                 ptlrpc_expire_one_request(req, 1);
2002         }
2003
2004         /*
2005          * When waiting for a whole set, we always break out of the
2006          * sleep so we can recalculate the timeout, or enable interrupts
2007          * if everyone's timed out.
2008          */
2009         return 1;
2010 }
2011 EXPORT_SYMBOL(ptlrpc_expired_set);
2012
2013 /**
2014  * Sets rq_intr flag in \a req under spinlock.
2015  */
2016 void ptlrpc_mark_interrupted(struct ptlrpc_request *req)
2017 {
2018         spin_lock(&req->rq_lock);
2019         req->rq_intr = 1;
2020         spin_unlock(&req->rq_lock);
2021 }
2022 EXPORT_SYMBOL(ptlrpc_mark_interrupted);
2023
2024 /**
2025  * Interrupts (sets the interrupted flag on) all uncompleted requests
2026  * in the set \a data. Callback for l_wait_event for interruptible waits.
2027  */
2028 void ptlrpc_interrupted_set(void *data)
2029 {
2030         struct ptlrpc_request_set *set = data;
2031         struct list_head *tmp;
2032
2033         LASSERT(set != NULL);
2034         CDEBUG(D_RPCTRACE, "INTERRUPTED SET %p\n", set);
2035
2036         list_for_each(tmp, &set->set_requests) {
2037                 struct ptlrpc_request *req =
2038                         list_entry(tmp, struct ptlrpc_request,
2039                                        rq_set_chain);
2040
2041                 if (req->rq_phase != RQ_PHASE_RPC &&
2042                     req->rq_phase != RQ_PHASE_UNREGISTERING)
2043                         continue;
2044
2045                 ptlrpc_mark_interrupted(req);
2046         }
2047 }
2048 EXPORT_SYMBOL(ptlrpc_interrupted_set);
2049
2050 /**
2051  * Get the smallest timeout in the set; this does NOT set a timeout.
2052  */
2053 int ptlrpc_set_next_timeout(struct ptlrpc_request_set *set)
2054 {
2055         struct list_head            *tmp;
2056         time_t           now = get_seconds();
2057         int                 timeout = 0;
2058         struct ptlrpc_request *req;
2059         int                 deadline;
2060
2061         list_for_each(tmp, &set->set_requests) {
2062                 req = list_entry(tmp, struct ptlrpc_request, rq_set_chain);
2063
2064                 /*
2065                  * Request in-flight?
2066                  */
2067                 if (!(((req->rq_phase == RQ_PHASE_RPC) && !req->rq_waiting) ||
2068                       (req->rq_phase == RQ_PHASE_BULK) ||
2069                       (req->rq_phase == RQ_PHASE_NEW)))
2070                         continue;
2071
2072                 /*
2073                  * Already timed out.
2074                  */
2075                 if (req->rq_timedout)
2076                         continue;
2077
2078                 /*
2079                  * Waiting for ctx.
2080                  */
2081                 if (req->rq_wait_ctx)
2082                         continue;
2083
2084                 if (req->rq_phase == RQ_PHASE_NEW)
2085                         deadline = req->rq_sent;
2086                 else if (req->rq_phase == RQ_PHASE_RPC && req->rq_resend)
2087                         deadline = req->rq_sent;
2088                 else
2089                         deadline = req->rq_sent + req->rq_timeout;
2090
2091                 if (deadline <= now)    /* actually expired already */
2092                         timeout = 1;    /* ASAP */
2093                 else if (timeout == 0 || timeout > deadline - now)
2094                         timeout = deadline - now;
2095         }
2096         return timeout;
2097 }
2098 EXPORT_SYMBOL(ptlrpc_set_next_timeout);
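
/*
 * Usage note (illustrative sketch, not part of the original file): the
 * returned value is the number of seconds until the nearest deadline in
 * the set (1 means "already expired, check ASAP"; 0 means nothing is
 * in flight), so a waiter can size its sleep accordingly:
 *
 *	timeout = ptlrpc_set_next_timeout(set);
 *	lwi = LWI_TIMEOUT(cfs_time_seconds(timeout ? timeout : 1),
 *			  ptlrpc_expired_set, set);
 *	rc = l_wait_event(set->set_waitq, ptlrpc_check_set(NULL, set), &lwi);
 *
 * This is exactly the pattern ptlrpc_set_wait() uses below.
 */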
2099
2100 /**
2101  * Send all unsent requests from the set and then wait until all
2102  * requests in the set complete (either get a reply, timeout, get an
2103  * error or otherwise be interrupted).
2104  * Returns 0 on success or an error code otherwise.
2105  */
2106 int ptlrpc_set_wait(struct ptlrpc_request_set *set)
2107 {
2108         struct list_head            *tmp;
2109         struct ptlrpc_request *req;
2110         struct l_wait_info     lwi;
2111         int                 rc, timeout;
2112
2113         if (set->set_producer)
2114                 (void)ptlrpc_set_producer(set);
2115         else
2116                 list_for_each(tmp, &set->set_requests) {
2117                         req = list_entry(tmp, struct ptlrpc_request,
2118                                              rq_set_chain);
2119                         if (req->rq_phase == RQ_PHASE_NEW)
2120                                 (void)ptlrpc_send_new_req(req);
2121                 }
2122
2123         if (list_empty(&set->set_requests))
2124                 return 0;
2125
2126         do {
2127                 timeout = ptlrpc_set_next_timeout(set);
2128
2129                 /* wait until all complete, interrupted, or an in-flight
2130                  * req times out */
2131                 CDEBUG(D_RPCTRACE, "set %p going to sleep for %d seconds\n",
2132                        set, timeout);
2133
2134                 if (timeout == 0 && !cfs_signal_pending())
2135                         /*
2136                          * No requests are in-flight (either timed out
2137                          * or delayed), so we can allow interrupts.
2138                          * We still want to block for a limited time,
2139                          * so we allow interrupts during the timeout.
2140                          */
2141                         lwi = LWI_TIMEOUT_INTR_ALL(cfs_time_seconds(1),
2142                                                    ptlrpc_expired_set,
2143                                                    ptlrpc_interrupted_set, set);
2144                 else
2145                         /*
2146                          * At least one request is in flight, so no
2147                          * interrupts are allowed. Wait until all
2148                          * complete, or an in-flight req times out.
2149                          */
2150                         lwi = LWI_TIMEOUT(cfs_time_seconds(timeout ? timeout : 1),
2151                                           ptlrpc_expired_set, set);
2152
2153                 rc = l_wait_event(set->set_waitq, ptlrpc_check_set(NULL, set), &lwi);
2154
2155                 /* LU-769 - if we ignored the signal because it was already
2156                  * pending when we started, we need to handle it now or we risk
2157                  * it being ignored forever */
2158                 if (rc == -ETIMEDOUT && !lwi.lwi_allow_intr &&
2159                     cfs_signal_pending()) {
2160                         sigset_t blocked_sigs =
2161                                            cfs_block_sigsinv(LUSTRE_FATAL_SIGS);
2162
2163                         /* In fact we only interrupt for the "fatal" signals
2164                          * like SIGINT or SIGKILL. We still ignore less
2165                          * important signals since ptlrpc set is not easily
2166                          * reentrant from userspace again */
2167                         if (cfs_signal_pending())
2168                                 ptlrpc_interrupted_set(set);
2169                         cfs_restore_sigs(blocked_sigs);
2170                 }
2171
2172                 LASSERT(rc == 0 || rc == -EINTR || rc == -ETIMEDOUT);
2173
2174                 /* -EINTR => all requests have been flagged rq_intr so next
2175                  * check completes.
2176                  * -ETIMEDOUT => someone timed out.  When all reqs have
2177                  * timed out, signals are enabled allowing completion with
2178                  * EINTR.
2179                  * I don't really care if we go once more round the loop in
2180                  * the error cases -eeb. */
2181                 if (rc == 0 && atomic_read(&set->set_remaining) == 0) {
2182                         list_for_each(tmp, &set->set_requests) {
2183                                 req = list_entry(tmp, struct ptlrpc_request,
2184                                                      rq_set_chain);
2185                                 spin_lock(&req->rq_lock);
2186                                 req->rq_invalid_rqset = 1;
2187                                 spin_unlock(&req->rq_lock);
2188                         }
2189                 }
2190         } while (rc != 0 || atomic_read(&set->set_remaining) != 0);
2191
2192         LASSERT(atomic_read(&set->set_remaining) == 0);
2193
2194         rc = set->set_rc; /* rq_status of already freed requests if any */
2195         list_for_each(tmp, &set->set_requests) {
2196                 req = list_entry(tmp, struct ptlrpc_request, rq_set_chain);
2197
2198                 LASSERT(req->rq_phase == RQ_PHASE_COMPLETE);
2199                 if (req->rq_status != 0)
2200                         rc = req->rq_status;
2201         }
2202
2203         if (set->set_interpret != NULL) {
2204                 int (*interpreter)(struct ptlrpc_request_set *set, void *, int) =
2205                         set->set_interpret;
2206                 rc = interpreter(set, set->set_arg, rc);
2207         } else {
2208                 struct ptlrpc_set_cbdata *cbdata, *n;
2209                 int err;
2210
2211                 list_for_each_entry_safe(cbdata, n,
2212                                          &set->set_cblist, psc_item) {
2213                         list_del_init(&cbdata->psc_item);
2214                         err = cbdata->psc_interpret(set, cbdata->psc_data, rc);
2215                         if (err && !rc)
2216                                 rc = err;
2217                         OBD_FREE_PTR(cbdata);
2218                 }
2219         }
2220
2221         return rc;
2222 }
2223 EXPORT_SYMBOL(ptlrpc_set_wait);
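
/*
 * Usage note (illustrative sketch, not part of the original file): batching
 * several requests into one set and waiting for all of them; my_prep_req()
 * is a hypothetical request constructor.
 *
 *	struct ptlrpc_request_set *set = ptlrpc_prep_set();
 *	int i, rc;
 *
 *	if (set == NULL)
 *		return -ENOMEM;
 *	for (i = 0; i < nr; i++)
 *		ptlrpc_set_add_req(set, my_prep_req(i));
 *	rc = ptlrpc_set_wait(set);	// sends everything, waits for completion
 *	ptlrpc_set_destroy(set);
 *
 * ptlrpc_queue_wait() below is the single-request form of this pattern.
 */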
2224
2225 /**
2226  * Helper function for request freeing.
2227  * Called when the reference count reaches zero and the request needs to be freed.
2228  * Removes the request from any sending/replay lists it might be on and
2229  * frees network buffers if any are present.
2230  * If \a locked is set, the caller already holds the import imp_lock
2231  * and so we don't need to reacquire it (for certain list manipulations).
2232  */
2233 static void __ptlrpc_free_req(struct ptlrpc_request *request, int locked)
2234 {
2235         if (request == NULL)
2236                 return;
2237         LASSERTF(!request->rq_receiving_reply, "req %p\n", request);
2238         LASSERTF(request->rq_rqbd == NULL, "req %p\n", request);/* client-side */
2239         LASSERTF(list_empty(&request->rq_list), "req %p\n", request);
2240         LASSERTF(list_empty(&request->rq_set_chain), "req %p\n", request);
2241         LASSERTF(list_empty(&request->rq_exp_list), "req %p\n", request);
2242         LASSERTF(!request->rq_replay, "req %p\n", request);
2243
2244         req_capsule_fini(&request->rq_pill);
2245
2246         /* We must take it off the imp_replay_list first.  Otherwise, we'll set
2247          * request->rq_reqmsg to NULL while osc_close is dereferencing it. */
2248         if (request->rq_import != NULL) {
2249                 if (!locked)
2250                         spin_lock(&request->rq_import->imp_lock);
2251                 list_del_init(&request->rq_replay_list);
2252                 if (!locked)
2253                         spin_unlock(&request->rq_import->imp_lock);
2254         }
2255         LASSERTF(list_empty(&request->rq_replay_list), "req %p\n", request);
2256
2257         if (atomic_read(&request->rq_refcount) != 0) {
2258                 DEBUG_REQ(D_ERROR, request,
2259                           "freeing request with nonzero refcount");
2260                 LBUG();
2261         }
2262
2263         if (request->rq_repbuf != NULL)
2264                 sptlrpc_cli_free_repbuf(request);
2265         if (request->rq_export != NULL) {
2266                 class_export_put(request->rq_export);
2267                 request->rq_export = NULL;
2268         }
2269         if (request->rq_import != NULL) {
2270                 class_import_put(request->rq_import);
2271                 request->rq_import = NULL;
2272         }
2273         if (request->rq_bulk != NULL)
2274                 ptlrpc_free_bulk_pin(request->rq_bulk);
2275
2276         if (request->rq_reqbuf != NULL || request->rq_clrbuf != NULL)
2277                 sptlrpc_cli_free_reqbuf(request);
2278
2279         if (request->rq_cli_ctx)
2280                 sptlrpc_req_put_ctx(request, !locked);
2281
2282         if (request->rq_pool)
2283                 __ptlrpc_free_req_to_pool(request);
2284         else
2285                 ptlrpc_request_cache_free(request);
2286 }
2287
2288 static int __ptlrpc_req_finished(struct ptlrpc_request *request, int locked);
2289 /**
2290  * Drop one request reference. Must be called with import imp_lock held.
2291  * When reference count drops to zero, request is freed.
2292  */
2293 void ptlrpc_req_finished_with_imp_lock(struct ptlrpc_request *request)
2294 {
2295         assert_spin_locked(&request->rq_import->imp_lock);
2296         (void)__ptlrpc_req_finished(request, 1);
2297 }
2298 EXPORT_SYMBOL(ptlrpc_req_finished_with_imp_lock);
2299
2300 /**
2301  * Helper function.
2302  * Drops one reference count for request \a request.
2303  * \a locked set indicates that caller holds import imp_lock.
2304  * Frees the request when reference count reaches zero.
2305  */
2306 static int __ptlrpc_req_finished(struct ptlrpc_request *request, int locked)
2307 {
2308         if (request == NULL)
2309                 return 1;
2310
2311         if (request == LP_POISON ||
2312             request->rq_reqmsg == LP_POISON) {
2313                 CERROR("dereferencing freed request (bug 575)\n");
2314                 LBUG();
2315                 return 1;
2316         }
2317
2318         DEBUG_REQ(D_INFO, request, "refcount now %u",
2319                   atomic_read(&request->rq_refcount) - 1);
2320
2321         if (atomic_dec_and_test(&request->rq_refcount)) {
2322                 __ptlrpc_free_req(request, locked);
2323                 return 1;
2324         }
2325
2326         return 0;
2327 }
2328
2329 /**
2330  * Drops one reference count for a request.
2331  */
2332 void ptlrpc_req_finished(struct ptlrpc_request *request)
2333 {
2334         __ptlrpc_req_finished(request, 0);
2335 }
2336 EXPORT_SYMBOL(ptlrpc_req_finished);
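
/*
 * Usage note (illustrative sketch, not part of the original file): every
 * ptlrpc_request_addref() must be balanced by a ptlrpc_req_finished();
 * the request is freed when the last reference is dropped.
 *
 *	req = ptlrpc_request_addref(req);	// pin req across the call
 *	my_use_request(req);			// hypothetical consumer
 *	ptlrpc_req_finished(req);		// drop our extra reference
 */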
2337
2338 /**
2339  * Returns the xid of \a request.
2340  */
2341 __u64 ptlrpc_req_xid(struct ptlrpc_request *request)
2342 {
2343         return request->rq_xid;
2344 }
2345 EXPORT_SYMBOL(ptlrpc_req_xid);
2346
2347 /**
2348  * Disengage the client's reply buffer from the network.
2349  * NB does _NOT_ unregister any client-side bulk.
2350  * IDEMPOTENT, but _not_ safe against concurrent callers.
2351  * The request owner (i.e. the thread doing the I/O) must call...
2352  * Returns 1 once the reply is unlinked or 0 if async unlinking is still in progress.
2353  */
2354 int ptlrpc_unregister_reply(struct ptlrpc_request *request, int async)
2355 {
2356         int             rc;
2357         wait_queue_head_t       *wq;
2358         struct l_wait_info lwi;
2359
2360         /*
2361          * Might sleep.
2362          */
2363         LASSERT(!in_interrupt());
2364
2365         /*
2366          * Let's setup deadline for reply unlink.
2367          */
2368         if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_LONG_REPL_UNLINK) &&
2369             async && request->rq_reply_deadline == 0)
2370                 request->rq_reply_deadline = get_seconds()+LONG_UNLINK;
2371
2372         /*
2373          * Nothing left to do.
2374          */
2375         if (!ptlrpc_client_recv_or_unlink(request))
2376                 return 1;
2377
2378         LNetMDUnlink(request->rq_reply_md_h);
2379
2380         /*
2381          * Let's check it once again.
2382          */
2383         if (!ptlrpc_client_recv_or_unlink(request))
2384                 return 1;
2385
2386         /*
2387          * Move to "Unregistering" phase as reply was not unlinked yet.
2388          */
2389         ptlrpc_rqphase_move(request, RQ_PHASE_UNREGISTERING);
2390
2391         /*
2392          * Do not wait for unlink to finish.
2393          */
2394         if (async)
2395                 return 0;
2396
2397         /*
2398          * We have to l_wait_event() whatever the result, to give liblustre
2399          * a chance to run reply_in_callback(), and to make sure we've
2400          * unlinked before returning a req to the pool.
2401          */
2402         if (request->rq_set != NULL)
2403                 wq = &request->rq_set->set_waitq;
2404         else
2405                 wq = &request->rq_reply_waitq;
2406
2407         for (;;) {
2408                 /* Network access will complete in finite time but the HUGE
2409                  * timeout lets us CWARN for visibility of sluggish NALs */
2410                 lwi = LWI_TIMEOUT_INTERVAL(cfs_time_seconds(LONG_UNLINK),
2411                                            cfs_time_seconds(1), NULL, NULL);
2412                 rc = l_wait_event(*wq, !ptlrpc_client_recv_or_unlink(request),
2413                                   &lwi);
2414                 if (rc == 0) {
2415                         ptlrpc_rqphase_move(request, request->rq_next_phase);
2416                         return 1;
2417                 }
2418
2419                 LASSERT(rc == -ETIMEDOUT);
2420                 DEBUG_REQ(D_WARNING, request,
2421                           "Unexpectedly long timeout rvcng=%d unlnk=%d/%d",
2422                           request->rq_receiving_reply,
2423                           request->rq_req_unlink, request->rq_reply_unlink);
2424         }
2425         return 0;
2426 }
2427 EXPORT_SYMBOL(ptlrpc_unregister_reply);
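
/*
 * Usage note (illustrative sketch, not part of the original file): the
 * \a async flag selects between fire-and-forget and blocking behaviour,
 * as in ptlrpc_check_set() above:
 *
 *	if (!ptlrpc_unregister_reply(req, 1))	// async: may still be busy,
 *		continue;			// re-check on a later pass
 *
 *	ptlrpc_unregister_reply(req, 0);	// sync: returns only once the
 *						// reply buffer is unlinked
 */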
2428
2429 static void ptlrpc_free_request(struct ptlrpc_request *req)
2430 {
2431         spin_lock(&req->rq_lock);
2432         req->rq_replay = 0;
2433         spin_unlock(&req->rq_lock);
2434
2435         if (req->rq_commit_cb != NULL)
2436                 req->rq_commit_cb(req);
2437         list_del_init(&req->rq_replay_list);
2438
2439         __ptlrpc_req_finished(req, 1);
2440 }
2441
2442 /**
2443  * The request is committed and dropped from the replay list of its import.
2444  */
2445 void ptlrpc_request_committed(struct ptlrpc_request *req, int force)
2446 {
2447         struct obd_import       *imp = req->rq_import;
2448
2449         spin_lock(&imp->imp_lock);
2450         if (list_empty(&req->rq_replay_list)) {
2451                 spin_unlock(&imp->imp_lock);
2452                 return;
2453         }
2454
2455         if (force || req->rq_transno <= imp->imp_peer_committed_transno)
2456                 ptlrpc_free_request(req);
2457
2458         spin_unlock(&imp->imp_lock);
2459 }
2460 EXPORT_SYMBOL(ptlrpc_request_committed);
2461
2462 /**
2463  * Iterates through the replay_list on the import and prunes
2464  * all requests that have a transno smaller than last_committed for the
2465  * import and don't have rq_replay set.
2466  * Since requests are sorted in transno order, stops at the first
2467  * transno bigger than last_committed.
2468  * The caller must hold imp->imp_lock.
2469  */
2470 void ptlrpc_free_committed(struct obd_import *imp)
2471 {
2472         struct ptlrpc_request *req, *saved;
2473         struct ptlrpc_request *last_req = NULL; /* temporary fire escape */
2474         bool                   skip_committed_list = true;
2475
2476         LASSERT(imp != NULL);
2477         assert_spin_locked(&imp->imp_lock);
2478
2479         if (imp->imp_peer_committed_transno == imp->imp_last_transno_checked &&
2480             imp->imp_generation == imp->imp_last_generation_checked) {
2481                 CDEBUG(D_INFO, "%s: skip recheck: last_committed %llu\n",
2482                        imp->imp_obd->obd_name, imp->imp_peer_committed_transno);
2483                 return;
2484         }
2485         CDEBUG(D_RPCTRACE, "%s: committing for last_committed %llu gen %d\n",
2486                imp->imp_obd->obd_name, imp->imp_peer_committed_transno,
2487                imp->imp_generation);
2488
2489         if (imp->imp_generation != imp->imp_last_generation_checked)
2490                 skip_committed_list = false;
2491
2492         imp->imp_last_transno_checked = imp->imp_peer_committed_transno;
2493         imp->imp_last_generation_checked = imp->imp_generation;
2494
2495         list_for_each_entry_safe(req, saved, &imp->imp_replay_list,
2496                                  rq_replay_list) {
2497                 /* XXX ok to remove when 1357 resolved - rread 05/29/03  */
2498                 LASSERT(req != last_req);
2499                 last_req = req;
2500
2501                 if (req->rq_transno == 0) {
2502                         DEBUG_REQ(D_EMERG, req, "zero transno during replay");
2503                         LBUG();
2504                 }
2505                 if (req->rq_import_generation < imp->imp_generation) {
2506                         DEBUG_REQ(D_RPCTRACE, req, "free request with old gen");
2507                         goto free_req;
2508                 }
2509
2510                 /* not yet committed */
2511                 if (req->rq_transno > imp->imp_peer_committed_transno) {
2512                         DEBUG_REQ(D_RPCTRACE, req, "stopping search");
2513                         break;
2514                 }
2515
2516                 if (req->rq_replay) {
2517                         DEBUG_REQ(D_RPCTRACE, req, "keeping (FL_REPLAY)");
2518                         list_move_tail(&req->rq_replay_list,
2519                                        &imp->imp_committed_list);
2520                         continue;
2521                 }
2522
2523                 DEBUG_REQ(D_INFO, req, "commit (last_committed %llu)",
2524                           imp->imp_peer_committed_transno);
2525 free_req:
2526                 ptlrpc_free_request(req);
2527         }
2528         if (skip_committed_list)
2529                 return;
2530
2531         list_for_each_entry_safe(req, saved, &imp->imp_committed_list,
2532                                  rq_replay_list) {
2533                 LASSERT(req->rq_transno != 0);
2534                 if (req->rq_import_generation < imp->imp_generation) {
2535                         DEBUG_REQ(D_RPCTRACE, req, "free stale open request");
2536                         ptlrpc_free_request(req);
2537                 }
2538         }
2539 }
2540
2541 void ptlrpc_cleanup_client(struct obd_import *imp)
2542 {
2543 }
2544 EXPORT_SYMBOL(ptlrpc_cleanup_client);
2545
2546 /**
2547  * Schedule a previously sent request for resend.
2548  * For bulk requests we assign a new xid (to avoid problems with
2549  * lost replies and therefore several transfers landing in the same buffer
2550  * from different sending attempts).
2551  */
2552 void ptlrpc_resend_req(struct ptlrpc_request *req)
2553 {
2554         DEBUG_REQ(D_HA, req, "going to resend");
2555         spin_lock(&req->rq_lock);
2556
2557         /* The request got a reply but is still linked to the import list.
2558            Let ptlrpc_check_set() process it. */
2559         if (ptlrpc_client_replied(req)) {
2560                 spin_unlock(&req->rq_lock);
2561                 DEBUG_REQ(D_HA, req, "it has reply, so skip it");
2562                 return;
2563         }
2564
2565         lustre_msg_set_handle(req->rq_reqmsg, &(struct lustre_handle){ 0 });
2566         req->rq_status = -EAGAIN;
2567
2568         req->rq_resend = 1;
2569         req->rq_net_err = 0;
2570         req->rq_timedout = 0;
2571         if (req->rq_bulk) {
2572                 __u64 old_xid = req->rq_xid;
2573
2574                 /* ensure previous bulk fails */
2575                 req->rq_xid = ptlrpc_next_xid();
2576                 CDEBUG(D_HA, "resend bulk old x%llu new x%llu\n",
2577                        old_xid, req->rq_xid);
2578         }
2579         ptlrpc_client_wake_req(req);
2580         spin_unlock(&req->rq_lock);
2581 }
2582 EXPORT_SYMBOL(ptlrpc_resend_req);
2583
2584 /* XXX: this function and rq_status are currently unused */
2585 void ptlrpc_restart_req(struct ptlrpc_request *req)
2586 {
2587         DEBUG_REQ(D_HA, req, "restarting (possibly-)completed request");
2588         req->rq_status = -ERESTARTSYS;
2589
2590         spin_lock(&req->rq_lock);
2591         req->rq_restart = 1;
2592         req->rq_timedout = 0;
2593         ptlrpc_client_wake_req(req);
2594         spin_unlock(&req->rq_lock);
2595 }
2596 EXPORT_SYMBOL(ptlrpc_restart_req);
2597
2598 /**
2599  * Grab an additional reference on a request \a req
2600  */
2601 struct ptlrpc_request *ptlrpc_request_addref(struct ptlrpc_request *req)
2602 {
2603         atomic_inc(&req->rq_refcount);
2604         return req;
2605 }
2606 EXPORT_SYMBOL(ptlrpc_request_addref);
2607
2608 /**
2609  * Add a request to the import replay_list.
2610  * Must be called under imp_lock.
2611  */
2612 void ptlrpc_retain_replayable_request(struct ptlrpc_request *req,
2613                                       struct obd_import *imp)
2614 {
2615         struct list_head *tmp;
2616
2617         assert_spin_locked(&imp->imp_lock);
2618
2619         if (req->rq_transno == 0) {
2620                 DEBUG_REQ(D_EMERG, req, "saving request with zero transno");
2621                 LBUG();
2622         }
2623
2624         /* clear this for new requests that were resent as well
2625            as resent replayed requests. */
2626         lustre_msg_clear_flags(req->rq_reqmsg, MSG_RESENT);
2627
2628         /* don't re-add requests that have been replayed */
2629         if (!list_empty(&req->rq_replay_list))
2630                 return;
2631
2632         lustre_msg_add_flags(req->rq_reqmsg, MSG_REPLAY);
2633
2634         LASSERT(imp->imp_replayable);
2635         /* Balanced in ptlrpc_free_committed, usually. */
2636         ptlrpc_request_addref(req);
2637         list_for_each_prev(tmp, &imp->imp_replay_list) {
2638                 struct ptlrpc_request *iter =
2639                         list_entry(tmp, struct ptlrpc_request,
2640                                        rq_replay_list);
2641
2642                 /* We may have duplicate transnos if we create and then
2643                  * open a file, or for closes retained to match creating
2644                  * opens, so use req->rq_xid as a secondary key.
2645                  * (See bugs 684, 685, and 428.)
2646                  * XXX no longer needed, but all opens need transnos!
2647                  */
2648                 if (iter->rq_transno > req->rq_transno)
2649                         continue;
2650
2651                 if (iter->rq_transno == req->rq_transno) {
2652                         LASSERT(iter->rq_xid != req->rq_xid);
2653                         if (iter->rq_xid > req->rq_xid)
2654                                 continue;
2655                 }
2656
2657                 list_add(&req->rq_replay_list, &iter->rq_replay_list);
2658                 return;
2659         }
2660
2661         list_add(&req->rq_replay_list, &imp->imp_replay_list);
2662 }
2663 EXPORT_SYMBOL(ptlrpc_retain_replayable_request);
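
/*
 * Usage note (illustrative sketch, not part of the original file): callers
 * must hold imp_lock, as after_reply() does when retaining a replayable
 * request:
 *
 *	spin_lock(&imp->imp_lock);
 *	ptlrpc_save_versions(req);
 *	ptlrpc_retain_replayable_request(req, imp);
 *	spin_unlock(&imp->imp_lock);
 */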
2664
2665 /**
2666  * Send request and wait until it completes.
2667  * Returns request processing status.
2668  */
2669 int ptlrpc_queue_wait(struct ptlrpc_request *req)
2670 {
2671         struct ptlrpc_request_set *set;
2672         int rc;
2673
2674         LASSERT(req->rq_set == NULL);
2675         LASSERT(!req->rq_receiving_reply);
2676
2677         set = ptlrpc_prep_set();
2678         if (set == NULL) {
2679                 CERROR("Unable to allocate ptlrpc set\n");
2680                 return -ENOMEM;
2681         }
2682
2683         /* for distributed debugging */
2684         lustre_msg_set_status(req->rq_reqmsg, current_pid());
2685
2686         /* add a ref for the set (see comment in ptlrpc_set_add_req) */
2687         ptlrpc_request_addref(req);
2688         ptlrpc_set_add_req(set, req);
2689         rc = ptlrpc_set_wait(set);
2690         ptlrpc_set_destroy(set);
2691
2692         return rc;
2693 }
2694 EXPORT_SYMBOL(ptlrpc_queue_wait);
2695
2696 struct ptlrpc_replay_async_args {
2697         int praa_old_state;
2698         int praa_old_status;
2699 };
2700
2701 /**
2702  * Callback used for reply processing of replayed requests.
2703  * In case of a successful reply, calls the registered request replay callback.
2704  * In case of error, restarts the replay process.
2705  */
2706 static int ptlrpc_replay_interpret(const struct lu_env *env,
2707                                    struct ptlrpc_request *req,
2708                                    void *data, int rc)
2709 {
2710         struct ptlrpc_replay_async_args *aa = data;
2711         struct obd_import *imp = req->rq_import;
2712
2713         atomic_dec(&imp->imp_replay_inflight);
2714
2715         if (!ptlrpc_client_replied(req)) {
2716                 CERROR("request replay timed out, restarting recovery\n");
2717                 rc = -ETIMEDOUT;
2718                 goto out;
2719         }
2720
2721         if (lustre_msg_get_type(req->rq_repmsg) == PTL_RPC_MSG_ERR &&
2722             (lustre_msg_get_status(req->rq_repmsg) == -ENOTCONN ||
2723              lustre_msg_get_status(req->rq_repmsg) == -ENODEV)) {
2724                 rc = lustre_msg_get_status(req->rq_repmsg);
2725                 goto out;
2726         }
2727
2728         /* VBR: check for version failure */
2729         if (lustre_msg_get_status(req->rq_repmsg) == -EOVERFLOW) {
2730                 /* replay failed due to a version mismatch */
2731                 DEBUG_REQ(D_WARNING, req, "Version mismatch during replay\n");
2732                 spin_lock(&imp->imp_lock);
2733                 imp->imp_vbr_failed = 1;
2734                 imp->imp_no_lock_replay = 1;
2735                 spin_unlock(&imp->imp_lock);
2736                 lustre_msg_set_status(req->rq_repmsg, aa->praa_old_status);
2737         } else {
2738                 /* The transno had better not change over replay. */
2739                 LASSERTF(lustre_msg_get_transno(req->rq_reqmsg) ==
2740                          lustre_msg_get_transno(req->rq_repmsg) ||
2741                          lustre_msg_get_transno(req->rq_repmsg) == 0,
2742                          "%#llx/%#llx\n",
2743                          lustre_msg_get_transno(req->rq_reqmsg),
2744                          lustre_msg_get_transno(req->rq_repmsg));
2745         }
2746
2747         spin_lock(&imp->imp_lock);
2748         /* if replaying by version, a gap occurred on the server; don't trust locks */
2749         if (lustre_msg_get_flags(req->rq_repmsg) & MSG_VERSION_REPLAY)
2750                 imp->imp_no_lock_replay = 1;
2751         imp->imp_last_replay_transno = lustre_msg_get_transno(req->rq_reqmsg);
2752         spin_unlock(&imp->imp_lock);
2753         LASSERT(imp->imp_last_replay_transno);
2754
2755         /* transaction number shouldn't be bigger than the latest replayed */
2756         if (req->rq_transno > lustre_msg_get_transno(req->rq_reqmsg)) {
2757                 DEBUG_REQ(D_ERROR, req,
2758                           "Reported transno %llu is bigger than the replayed one: %llu",
2759                           req->rq_transno,
2760                           lustre_msg_get_transno(req->rq_reqmsg));
2761                 rc = -EINVAL;
2762                 goto out;
2763         }
2764
2765         DEBUG_REQ(D_HA, req, "got rep");
2766
2767         /* let the callback do fixups, possibly including changes to the request */
2768         if (req->rq_replay_cb)
2769                 req->rq_replay_cb(req);
2770
2771         if (ptlrpc_client_replied(req) &&
2772             lustre_msg_get_status(req->rq_repmsg) != aa->praa_old_status) {
2773                 DEBUG_REQ(D_ERROR, req, "status %d, old was %d",
2774                           lustre_msg_get_status(req->rq_repmsg),
2775                           aa->praa_old_status);
2776         } else {
2777                 /* Put it back for re-replay. */
2778                 lustre_msg_set_status(req->rq_repmsg, aa->praa_old_status);
2779         }
2780
2781         /*
2782          * Errors during replay can set transno to 0, but
2783          * imp_last_replay_transno shouldn't be set to 0 anyway
2784          */
2785         if (req->rq_transno == 0)
2786                 CERROR("Transno is 0 during replay!\n");
2787
2788         /* continue with recovery */
2789         rc = ptlrpc_import_recovery_state_machine(imp);
2790  out:
2791         req->rq_send_state = aa->praa_old_state;
2792
2793         if (rc != 0)
2794                 /* this replay failed, so restart recovery */
2795                 ptlrpc_connect_import(imp);
2796
2797         return rc;
2798 }
2799
2800 /**
2801  * Prepares and queues a request for replay.
2802  * Adds it to the ptlrpcd queue for actual sending.
2803  * Returns 0 on success.
2804  */
2805 int ptlrpc_replay_req(struct ptlrpc_request *req)
2806 {
2807         struct ptlrpc_replay_async_args *aa;
2808
2809         LASSERT(req->rq_import->imp_state == LUSTRE_IMP_REPLAY);
2810
2811         LASSERT(sizeof(*aa) <= sizeof(req->rq_async_args));
2812         aa = ptlrpc_req_async_args(req);
2813         memset(aa, 0, sizeof(*aa));
2814
2815         /* Prepare request to be resent with ptlrpcd */
2816         aa->praa_old_state = req->rq_send_state;
2817         req->rq_send_state = LUSTRE_IMP_REPLAY;
2818         req->rq_phase = RQ_PHASE_NEW;
2819         req->rq_next_phase = RQ_PHASE_UNDEFINED;
2820         if (req->rq_repmsg)
2821                 aa->praa_old_status = lustre_msg_get_status(req->rq_repmsg);
2822         req->rq_status = 0;
2823         req->rq_interpret_reply = ptlrpc_replay_interpret;
2824         /* Readjust the timeout for current conditions */
2825         ptlrpc_at_set_req_timeout(req);
2826
2827         /* Tell server the net_latency, so the server can calculate how long
2828          * it should wait for the next replay */
2829         lustre_msg_set_service_time(req->rq_reqmsg,
2830                                     ptlrpc_at_get_net_latency(req));
2831         DEBUG_REQ(D_HA, req, "REPLAY");
2832
2833         atomic_inc(&req->rq_import->imp_replay_inflight);
2834         ptlrpc_request_addref(req); /* ptlrpcd needs a ref */
2835
2836         ptlrpcd_add_req(req, PDL_POLICY_LOCAL, -1);
2837         return 0;
2838 }
2839 EXPORT_SYMBOL(ptlrpc_replay_req);
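/*
 * A sketch of the round trip prepared above (no new logic): the old send
 * state and reply status are parked in rq_async_args before the resend,
 * and ptlrpc_replay_interpret() restores them when the reply (or a
 * timeout) comes back:
 *
 *      aa->praa_old_state  <->  req->rq_send_state
 *      aa->praa_old_status <->  lustre_msg_get_status(req->rq_repmsg)
 */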
2840
2841 /**
2842  * Aborts all in-flight requests on the sending and delayed lists of import \a imp
2843  */
2844 void ptlrpc_abort_inflight(struct obd_import *imp)
2845 {
2846         struct list_head *tmp, *n;
2847
2848         /* Make sure that no new requests get processed for this import.
2849          * ptlrpc_{queue,set}_wait must (and does) hold imp_lock while testing
2850          * this flag and then putting requests on sending_list or delayed_list.
2851          */
2852         spin_lock(&imp->imp_lock);
2853
2854         /* XXX locking?  Maybe we should remove each request with the list
2855          * locked?  Also, how do we know if the requests on the list are
2856          * being freed at this time?
2857          */
2858         list_for_each_safe(tmp, n, &imp->imp_sending_list) {
2859                 struct ptlrpc_request *req =
2860                         list_entry(tmp, struct ptlrpc_request, rq_list);
2861
2862                 DEBUG_REQ(D_RPCTRACE, req, "inflight");
2863
2864                 spin_lock(&req->rq_lock);
2865                 if (req->rq_import_generation < imp->imp_generation) {
2866                         req->rq_err = 1;
2867                         req->rq_status = -EIO;
2868                         ptlrpc_client_wake_req(req);
2869                 }
2870                 spin_unlock(&req->rq_lock);
2871         }
2872
2873         list_for_each_safe(tmp, n, &imp->imp_delayed_list) {
2874                 struct ptlrpc_request *req =
2875                         list_entry(tmp, struct ptlrpc_request, rq_list);
2876
2877                 DEBUG_REQ(D_RPCTRACE, req, "aborting waiting req");
2878
2879                 spin_lock(&req->rq_lock);
2880                 if (req->rq_import_generation < imp->imp_generation) {
2881                         req->rq_err = 1;
2882                         req->rq_status = -EIO;
2883                         ptlrpc_client_wake_req(req);
2884                 }
2885                 spin_unlock(&req->rq_lock);
2886         }
2887
2888         /* Last chance to free reqs left on the replay list, but we
2889          * will still leak reqs that haven't committed.  */
2890         if (imp->imp_replayable)
2891                 ptlrpc_free_committed(imp);
2892
2893         spin_unlock(&imp->imp_lock);
2894 }
2895 EXPORT_SYMBOL(ptlrpc_abort_inflight);
2896
2897 /**
2898  * Abort all uncompleted requests in request set \a set
2899  */
2900 void ptlrpc_abort_set(struct ptlrpc_request_set *set)
2901 {
2902         struct list_head *tmp, *pos;
2903
2904         LASSERT(set != NULL);
2905
2906         list_for_each_safe(pos, tmp, &set->set_requests) {
2907                 struct ptlrpc_request *req =
2908                         list_entry(pos, struct ptlrpc_request,
2909                                        rq_set_chain);
2910
2911                 spin_lock(&req->rq_lock);
2912                 if (req->rq_phase != RQ_PHASE_RPC) {
2913                         spin_unlock(&req->rq_lock);
2914                         continue;
2915                 }
2916
2917                 req->rq_err = 1;
2918                 req->rq_status = -EINTR;
2919                 ptlrpc_client_wake_req(req);
2920                 spin_unlock(&req->rq_lock);
2921         }
2922 }
2923
2924 static __u64 ptlrpc_last_xid;
2925 static spinlock_t ptlrpc_last_xid_lock;
2926
2927 /**
2928  * Initialize the XID for the node.  This is common among all requests on
2929  * this node, and only requires the property that it is monotonically
2930  * increasing.  It does not need to be sequential.  Since this is also used
2931  * as the RDMA match bits, it is important that a single client NOT have
2932  * the same match bits for two different in-flight requests, hence we do
2933  * NOT want to have an XID per target or similar.
2934  *
2935  * To avoid an unlikely collision between match bits after a client reboot
2936  * (which would deliver old data into the wrong RDMA buffer) initialize
2937  * the XID based on the current time, assuming a maximum RPC rate of 1M RPC/s.
2938  * If the time is clearly incorrect, we instead use a 62-bit random number.
2939  * In the worst case the random number will overflow 1M RPCs per second in
2940  * 9133 years, or permutations thereof.
2941  */
2942 #define YEAR_2004 (1ULL << 30)
2943 void ptlrpc_init_xid(void)
2944 {
2945         time_t now = get_seconds();
2946
2947         spin_lock_init(&ptlrpc_last_xid_lock);
2948         if (now < YEAR_2004) {
2949                 cfs_get_random_bytes(&ptlrpc_last_xid, sizeof(ptlrpc_last_xid));
2950                 ptlrpc_last_xid >>= 2;
2951                 ptlrpc_last_xid |= (1ULL << 61);
2952         } else {
2953                 ptlrpc_last_xid = (__u64)now << 20;
2954         }
2955
2956         /* Always need to be aligned to a power-of-two for multi-bulk BRW */
2957         CLASSERT((PTLRPC_BULK_OPS_COUNT & (PTLRPC_BULK_OPS_COUNT - 1)) == 0);
2958         ptlrpc_last_xid &= PTLRPC_BULK_OPS_MASK;
2959 }
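/*
 * Arithmetic note (illustrative only): the 20-bit shift reserves 2^20
 * (~1M) XIDs per second of real time, matching the 1M RPC/s assumption
 * above.  For example, for now = 0x55000000 (early 2015) the initial XID
 * is 0x55000000 << 20 = 0x5500000000000, which the mask below then rounds
 * down to a PTLRPC_BULK_OPS_COUNT boundary.
 */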
2960
2961 /**
2962  * Increases the XID and returns the resulting new value to the caller.
2963  *
2964  * Multi-bulk BRW RPCs consume multiple XIDs for each bulk transfer, starting
2965  * at the returned xid, up to xid + PTLRPC_BULK_OPS_COUNT - 1. The BRW RPC
2966  * itself uses the last bulk xid needed, so the server can determine
2967  * the number of bulk transfers from the RPC XID and a bitmask.  The starting
2968  * xid must align to a power-of-two value.
2969  *
2970  * This is assumed to be true due to the initial ptlrpc_last_xid
2971  * value also being initialized to a power-of-two value. LU-1431
2972  */
2973 __u64 ptlrpc_next_xid(void)
2974 {
2975         __u64 next;
2976
2977         spin_lock(&ptlrpc_last_xid_lock);
2978         next = ptlrpc_last_xid + PTLRPC_BULK_OPS_COUNT;
2979         ptlrpc_last_xid = next;
2980         spin_unlock(&ptlrpc_last_xid_lock);
2981
2982         return next;
2983 }
2984 EXPORT_SYMBOL(ptlrpc_next_xid);
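/*
 * Worked example (assuming, for illustration, PTLRPC_BULK_OPS_COUNT == 4):
 * if ptlrpc_next_xid() returns X, a multiple of 4, a BRW RPC with three
 * bulk transfers uses match bits X, X+1 and X+2 and carries X+2 as its
 * RPC XID, so the server recovers the transfer count as
 * (rq_xid & (PTLRPC_BULK_OPS_COUNT - 1)) + 1 == 3.
 */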
2985
2986 /**
2987  * Get a glimpse of what the next XID value might be.
2988  * Returns the possible next XID.
2989  */
2990 __u64 ptlrpc_sample_next_xid(void)
2991 {
2992 #if BITS_PER_LONG == 32
2993         /* need to avoid possible word tearing on 32-bit systems */
2994         __u64 next;
2995
2996         spin_lock(&ptlrpc_last_xid_lock);
2997         next = ptlrpc_last_xid + PTLRPC_BULK_OPS_COUNT;
2998         spin_unlock(&ptlrpc_last_xid_lock);
2999
3000         return next;
3001 #else
3002         /* No need to lock, since the returned value is racy anyway */
3003         return ptlrpc_last_xid + PTLRPC_BULK_OPS_COUNT;
3004 #endif
3005 }
3006 EXPORT_SYMBOL(ptlrpc_sample_next_xid);
3007
3008 /**
3009  * Functions for operating ptlrpc workers.
3010  *
3011  * A ptlrpc work is a function which runs inside the ptlrpc context.
3012  * The callback shouldn't sleep, otherwise it will block that ptlrpcd thread.
3013  *
3014  * 1. After a work item is created, it can be used many times, that is:
3015  *       handler = ptlrpcd_alloc_work();
3016  *       ptlrpcd_queue_work(handler);
3017  *    queue it again when necessary:
3018  *       ptlrpcd_queue_work(handler);
3019  *    and destroy it when done:
3020  *       ptlrpcd_destroy_work(handler);
3021  * 2. ptlrpcd_queue_work() can be called by multiple processes concurrently,
3022  *    but the work will only be queued once at any time. Also, as the name
3023  *    implies, there may be a delay before a ptlrpcd thread actually runs it.
3024  */
3025 struct ptlrpc_work_async_args {
3026         int   (*cb)(const struct lu_env *, void *);
3027         void   *cbdata;
3028 };
3029
3030 static void ptlrpcd_add_work_req(struct ptlrpc_request *req)
3031 {
3032         /* re-initialize the req */
3033         req->rq_timeout         = obd_timeout;
3034         req->rq_sent            = get_seconds();
3035         req->rq_deadline        = req->rq_sent + req->rq_timeout;
3036         req->rq_reply_deadline  = req->rq_deadline;
3037         req->rq_phase           = RQ_PHASE_INTERPRET;
3038         req->rq_next_phase      = RQ_PHASE_COMPLETE;
3039         req->rq_xid             = ptlrpc_next_xid();
3040         req->rq_import_generation = req->rq_import->imp_generation;
3041
3042         ptlrpcd_add_req(req, PDL_POLICY_ROUND, -1);
3043 }
3044
3045 static int work_interpreter(const struct lu_env *env,
3046                             struct ptlrpc_request *req, void *data, int rc)
3047 {
3048         struct ptlrpc_work_async_args *arg = data;
3049
3050         LASSERT(ptlrpcd_check_work(req));
3051         LASSERT(arg->cb != NULL);
3052
3053         rc = arg->cb(env, arg->cbdata);
3054
3055         list_del_init(&req->rq_set_chain);
3056         req->rq_set = NULL;
3057
3058         if (atomic_dec_return(&req->rq_refcount) > 1) {
3059                 atomic_set(&req->rq_refcount, 2);
3060                 ptlrpcd_add_work_req(req);
3061         }
3062         return rc;
3063 }
3064
3065 static int worker_format;
3066
3067 static int ptlrpcd_check_work(struct ptlrpc_request *req)
3068 {
3069         return req->rq_pill.rc_fmt == (void *)&worker_format;
3070 }
3071
3072 /**
3073  * Create a work for ptlrpc.
3074  */
3075 void *ptlrpcd_alloc_work(struct obd_import *imp,
3076                          int (*cb)(const struct lu_env *, void *), void *cbdata)
3077 {
3078         struct ptlrpc_request    *req = NULL;
3079         struct ptlrpc_work_async_args *args;
3080
3081         might_sleep();
3082
3083         if (cb == NULL)
3084                 return ERR_PTR(-EINVAL);
3085
3086         /* Copied some code from the deprecated fakereq. */
3087         req = ptlrpc_request_cache_alloc(GFP_NOFS);
3088         if (req == NULL) {
3089                 CERROR("ptlrpc: out of memory!\n");
3090                 return ERR_PTR(-ENOMEM);
3091         }
3092
3093         req->rq_send_state = LUSTRE_IMP_FULL;
3094         req->rq_type = PTL_RPC_MSG_REQUEST;
3095         req->rq_import = class_import_get(imp);
3096         req->rq_export = NULL;
3097         req->rq_interpret_reply = work_interpreter;
3098         /* don't want reply */
3099         req->rq_receiving_reply = 0;
3100         req->rq_req_unlink = req->rq_reply_unlink = 0;
3101         req->rq_no_delay = req->rq_no_resend = 1;
3102         req->rq_pill.rc_fmt = (void *)&worker_format;
3103
3104         spin_lock_init(&req->rq_lock);
3105         INIT_LIST_HEAD(&req->rq_list);
3106         INIT_LIST_HEAD(&req->rq_replay_list);
3107         INIT_LIST_HEAD(&req->rq_set_chain);
3108         INIT_LIST_HEAD(&req->rq_history_list);
3109         INIT_LIST_HEAD(&req->rq_exp_list);
3110         init_waitqueue_head(&req->rq_reply_waitq);
3111         init_waitqueue_head(&req->rq_set_waitq);
3112         atomic_set(&req->rq_refcount, 1);
3113
3114         CLASSERT(sizeof(*args) <= sizeof(req->rq_async_args));
3115         args = ptlrpc_req_async_args(req);
3116         args->cb     = cb;
3117         args->cbdata = cbdata;
3118
3119         return req;
3120 }
3121 EXPORT_SYMBOL(ptlrpcd_alloc_work);
3122
3123 void ptlrpcd_destroy_work(void *handler)
3124 {
3125         struct ptlrpc_request *req = handler;
3126
3127         if (req)
3128                 ptlrpc_req_finished(req);
3129 }
3130 EXPORT_SYMBOL(ptlrpcd_destroy_work);
3131
3132 int ptlrpcd_queue_work(void *handler)
3133 {
3134         struct ptlrpc_request *req = handler;
3135
3136         /*
3137          * Check if the req is already being queued.
3138          *
3139          * Here comes a trick: ptlrpc lacks a reliable way of checking if a
3140          * req is being processed, so we have to use the req's refcount for
3141          * this purpose instead. This is okay because the caller should treat
3142          * this req as opaque data. - Jinshan
3143          */
3144         LASSERT(atomic_read(&req->rq_refcount) > 0);
3145         if (atomic_inc_return(&req->rq_refcount) == 2)
3146                 ptlrpcd_add_work_req(req);
3147         return 0;
3148 }
3149 EXPORT_SYMBOL(ptlrpcd_queue_work);
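/*
 * Putting the worker API together (a minimal sketch; note_event() and its
 * atomic counter are hypothetical, for illustration only, and deliberately
 * do not sleep):
 *
 *      static int note_event(const struct lu_env *env, void *data)
 *      {
 *              atomic_inc((atomic_t *)data);
 *              return 0;
 *      }
 *
 *      void *work = ptlrpcd_alloc_work(imp, note_event, &counter);
 *      if (!IS_ERR(work)) {
 *              ptlrpcd_queue_work(work);       (queued at most once at a time)
 *              ...
 *              ptlrpcd_destroy_work(work);     (drops the final reference)
 *      }
 */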