#include <linux/ipsec.h>
#include <net/cls_cgroup.h>
#include <net/netprio_cgroup.h>
+#include <linux/sock_diag.h>
#include <linux/filter.h>
}
}
-#define SK_FLAGS_TIMESTAMP ((1UL << SOCK_TIMESTAMP) | (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE))
+static bool sock_needs_netstamp(const struct sock *sk)
+{
+ switch (sk->sk_family) {
+ case AF_UNSPEC:
+ case AF_UNIX:
+ return false;
+ default:
+ return true;
+ }
+}
static void sock_disable_timestamp(struct sock *sk, unsigned long flags)
{
if (sk->sk_flags & flags) {
sk->sk_flags &= ~flags;
- if (!(sk->sk_flags & SK_FLAGS_TIMESTAMP))
+ if (sock_needs_netstamp(sk) &&
+ !(sk->sk_flags & SK_FLAGS_TIMESTAMP))
net_disable_timestamp();
}
}
if (val & SOF_TIMESTAMPING_OPT_ID &&
!(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) {
- if (sk->sk_protocol == IPPROTO_TCP) {
+ if (sk->sk_protocol == IPPROTO_TCP &&
+ sk->sk_type == SOCK_STREAM) {
if (sk->sk_state != TCP_ESTABLISHED) {
ret = -EINVAL;
break;
sk->sk_max_pacing_rate);
break;
+ case SO_INCOMING_CPU:
+ sk->sk_incoming_cpu = val;
+ break;
+
default:
ret = -ENOPROTOOPT;
break;
* @family: protocol family
* @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
* @prot: struct proto associated with this new sock instance
+ * @kern: is this to be a kernel socket?
*/
struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
- struct proto *prot)
+ struct proto *prot, int kern)
{
struct sock *sk;
*/
sk->sk_prot = sk->sk_prot_creator = prot;
sock_lock_init(sk);
- sock_net_set(sk, get_net(net));
+ sk->sk_net_refcnt = kern ? 0 : 1;
+ if (likely(sk->sk_net_refcnt))
+ get_net(net);
+ sock_net_set(sk, net);
atomic_set(&sk->sk_wmem_alloc, 1);
sock_update_classid(sk);
}
EXPORT_SYMBOL(sk_alloc);
-static void __sk_free(struct sock *sk)
+void sk_destruct(struct sock *sk)
{
struct sk_filter *filter;
if (sk->sk_peer_cred)
put_cred(sk->sk_peer_cred);
put_pid(sk->sk_peer_pid);
- put_net(sock_net(sk));
+ if (likely(sk->sk_net_refcnt))
+ put_net(sock_net(sk));
sk_prot_free(sk->sk_prot_creator, sk);
}
+static void __sk_free(struct sock *sk)
+{
+ if (unlikely(sock_diag_has_destroy_listeners(sk) && sk->sk_net_refcnt))
+ sock_diag_broadcast_destroy(sk);
+ else
+ sk_destruct(sk);
+}
+
void sk_free(struct sock *sk)
{
/*
}
EXPORT_SYMBOL(sk_free);
-/*
- * Last sock_put should drop reference to sk->sk_net. It has already
- * been dropped in sk_change_net. Taking reference to stopping namespace
- * is not an option.
- * Take reference to a socket to remove it from hash _alive_ and after that
- * destroy it in the context of init_net.
- */
-void sk_release_kernel(struct sock *sk)
-{
- if (sk == NULL || sk->sk_socket == NULL)
- return;
-
- sock_hold(sk);
- sock_release(sk->sk_socket);
- sock_net_set(sk, get_net(&init_net));
- sock_put(sk);
-}
-EXPORT_SYMBOL(sk_release_kernel);
-
static void sk_update_clone(const struct sock *sk, struct sock *newsk)
{
if (mem_cgroup_sockets_enabled && sk->sk_cgrp)
sock_copy(newsk, sk);
/* SANITY */
- get_net(sock_net(newsk));
+ if (likely(newsk->sk_net_refcnt))
+ get_net(sock_net(newsk));
sk_node_init(&newsk->sk_node);
sock_lock_init(newsk);
bh_lock_sock(newsk);
skb_queue_head_init(&newsk->sk_receive_queue);
skb_queue_head_init(&newsk->sk_write_queue);
- spin_lock_init(&newsk->sk_dst_lock);
rwlock_init(&newsk->sk_callback_lock);
lockdep_set_class_and_name(&newsk->sk_callback_lock,
af_callback_keys + newsk->sk_family,
*/
is_charged = sk_filter_charge(newsk, filter);
- if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk))) {
+ if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk, sk))) {
/* It is still raw copy of parent, so invalidate
* destructor and make plain sk_free() */
newsk->sk_destruct = NULL;
if (newsk->sk_prot->sockets_allocated)
sk_sockets_allocated_inc(newsk);
- if (newsk->sk_flags & SK_FLAGS_TIMESTAMP)
+ if (sock_needs_netstamp(sk) &&
+ newsk->sk_flags & SK_FLAGS_TIMESTAMP)
net_enable_timestamp();
}
out:
void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
{
- __sk_dst_set(sk, dst);
+ u32 max_segs = 1;
+
+ sk_dst_set(sk, dst);
sk->sk_route_caps = dst->dev->features;
if (sk->sk_route_caps & NETIF_F_GSO)
sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
} else {
sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
sk->sk_gso_max_size = dst->dev->gso_max_size;
- sk->sk_gso_max_segs = dst->dev->gso_max_segs;
+ max_segs = max_t(u32, dst->dev->gso_max_segs, 1);
}
}
+ sk->sk_gso_max_segs = max_segs;
}
EXPORT_SYMBOL_GPL(sk_setup_caps);
}
EXPORT_SYMBOL(sock_wfree);
+void skb_set_owner_w(struct sk_buff *skb, struct sock *sk)
+{
+ skb_orphan(skb);
+ skb->sk = sk;
+#ifdef CONFIG_INET
+ if (unlikely(!sk_fullsock(sk))) {
+ skb->destructor = sock_edemux;
+ sock_hold(sk);
+ return;
+ }
+#endif
+ skb->destructor = sock_wfree;
+ skb_set_hash_from_sk(skb, sk);
+ /*
+ * We used to take a refcount on sk, but following operation
+ * is enough to guarantee sk_free() wont free this sock until
+ * all in-flight packets are completed
+ */
+ atomic_add(skb->truesize, &sk->sk_wmem_alloc);
+}
+EXPORT_SYMBOL(skb_set_owner_w);
+
void skb_orphan_partial(struct sk_buff *skb)
{
/* TCP stack sets skb->ooo_okay based on sk_wmem_alloc,
{
DEFINE_WAIT(wait);
- clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
+ sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
for (;;) {
if (!timeo)
break;
if (sk_wmem_alloc_get(sk) < sk->sk_sndbuf)
break;
- set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
+ sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
err = -EAGAIN;
if (!timeo)
}
EXPORT_SYMBOL(sock_alloc_send_skb);
+int sock_cmsg_send(struct sock *sk, struct msghdr *msg,
+ struct sockcm_cookie *sockc)
+{
+ struct cmsghdr *cmsg;
+
+ for_each_cmsghdr(cmsg, msg) {
+ if (!CMSG_OK(msg, cmsg))
+ return -EINVAL;
+ if (cmsg->cmsg_level != SOL_SOCKET)
+ continue;
+ switch (cmsg->cmsg_type) {
+ case SO_MARK:
+ if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
+ return -EPERM;
+ if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
+ return -EINVAL;
+ sockc->mark = *(u32 *)CMSG_DATA(cmsg);
+ break;
+ default:
+ return -EINVAL;
+ }
+ }
+ return 0;
+}
+EXPORT_SYMBOL(sock_cmsg_send);
+
/* On 32bit arches, an skb frag is limited to 2^15 */
#define SKB_FRAG_PAGE_ORDER get_order(32768)
pfrag->offset = 0;
if (SKB_FRAG_PAGE_ORDER) {
- pfrag->page = alloc_pages((gfp & ~__GFP_WAIT) | __GFP_COMP |
- __GFP_NOWARN | __GFP_NORETRY,
+ /* Avoid direct reclaim but allow kswapd to wake */
+ pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) |
+ __GFP_COMP | __GFP_NOWARN |
+ __GFP_NORETRY,
SKB_FRAG_PAGE_ORDER);
if (likely(pfrag->page)) {
pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER;
* sk_wait_data - wait for data to arrive at sk_receive_queue
* @sk: sock to wait on
* @timeo: for how long
+ * @skb: last skb seen on sk_receive_queue
*
* Now socket state including sk->sk_err is changed only under lock,
* hence we may omit checks after joining wait queue.
* We check receive queue before schedule() only as optimization;
* it is very likely that release_sock() added new data.
*/
-int sk_wait_data(struct sock *sk, long *timeo)
+int sk_wait_data(struct sock *sk, long *timeo, const struct sk_buff *skb)
{
int rc;
DEFINE_WAIT(wait);
prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
- set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
- rc = sk_wait_event(sk, timeo, !skb_queue_empty(&sk->sk_receive_queue));
- clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
+ sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
+ rc = sk_wait_event(sk, timeo, skb_peek_tail(&sk->sk_receive_queue) != skb);
+ sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
finish_wait(sk_sleep(sk), &wait);
return rc;
}
EXPORT_SYMBOL(__sk_mem_schedule);
/**
- * __sk_reclaim - reclaim memory_allocated
+ * __sk_mem_reclaim - reclaim memory_allocated
* @sk: socket
+ * @amount: number of bytes (rounded down to a SK_MEM_QUANTUM multiple)
*/
-void __sk_mem_reclaim(struct sock *sk)
+void __sk_mem_reclaim(struct sock *sk, int amount)
{
- sk_memory_allocated_sub(sk,
- sk->sk_forward_alloc >> SK_MEM_QUANTUM_SHIFT);
- sk->sk_forward_alloc &= SK_MEM_QUANTUM - 1;
+ amount >>= SK_MEM_QUANTUM_SHIFT;
+ sk_memory_allocated_sub(sk, amount);
+ sk->sk_forward_alloc -= amount << SK_MEM_QUANTUM_SHIFT;
if (sk_under_memory_pressure(sk) &&
(sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
static void sock_def_destruct(struct sock *sk)
{
- kfree(sk->sk_protinfo);
}
void sk_send_sigurg(struct sock *sk)
} else
sk->sk_wq = NULL;
- spin_lock_init(&sk->sk_dst_lock);
rwlock_init(&sk->sk_callback_lock);
lockdep_set_class_and_name(&sk->sk_callback_lock,
af_callback_keys + sk->sk_family,
sk->sk_max_pacing_rate = ~0U;
sk->sk_pacing_rate = ~0U;
+ sk->sk_incoming_cpu = -1;
/*
* Before updating sk_refcnt, we must commit prior changes to memory
* (Documentation/RCU/rculist_nulls.txt for details)
* time stamping, but time stamping might have been on
* already because of the other one
*/
- if (!(previous_flags & SK_FLAGS_TIMESTAMP))
+ if (sock_needs_netstamp(sk) &&
+ !(previous_flags & SK_FLAGS_TIMESTAMP))
net_enable_timestamp();
}
}
return;
kfree(rsk_prot->slab_name);
rsk_prot->slab_name = NULL;
- if (rsk_prot->slab) {
- kmem_cache_destroy(rsk_prot->slab);
- rsk_prot->slab = NULL;
- }
+ kmem_cache_destroy(rsk_prot->slab);
+ rsk_prot->slab = NULL;
}
static int req_prot_init(const struct proto *prot)
rsk_prot->slab = kmem_cache_create(rsk_prot->slab_name,
rsk_prot->obj_size, 0,
- 0, NULL);
+ prot->slab_flags, NULL);
if (!rsk_prot->slab) {
pr_crit("%s: Can't create request sock SLAB cache!\n",
list_del(&prot->node);
mutex_unlock(&proto_list_mutex);
- if (prot->slab != NULL) {
- kmem_cache_destroy(prot->slab);
- prot->slab = NULL;
- }
+ kmem_cache_destroy(prot->slab);
+ prot->slab = NULL;
req_prot_cleanup(prot->rsk_prot);