^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2) * Copyright (c) 2006, 2019 Oracle and/or its affiliates. All rights reserved.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4) * This software is available to you under a choice of one of two
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5) * licenses. You may choose to be licensed under the terms of the GNU
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6) * General Public License (GPL) Version 2, available from the file
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7) * COPYING in the main directory of this source tree, or the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8) * OpenIB.org BSD license below:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 9) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 10) * Redistribution and use in source and binary forms, with or
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 11) * without modification, are permitted provided that the following
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 12) * conditions are met:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 13) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 14) * - Redistributions of source code must retain the above
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 15) * copyright notice, this list of conditions and the following
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 16) * disclaimer.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 17) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 18) * - Redistributions in binary form must reproduce the above
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 19) * copyright notice, this list of conditions and the following
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 20) * disclaimer in the documentation and/or other materials
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 21) * provided with the distribution.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 22) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 23) * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 24) * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 25) * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 26) * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 27) * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 28) * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 29) * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 30) * SOFTWARE.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 31) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 32) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 33) #include <linux/kernel.h>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 34) #include <linux/slab.h>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 35) #include <net/sock.h>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 36) #include <linux/in.h>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 37) #include <linux/export.h>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 38) #include <linux/time.h>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 39) #include <linux/rds.h>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 40)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 41) #include "rds.h"
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 42)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 43) void rds_inc_init(struct rds_incoming *inc, struct rds_connection *conn,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 44) struct in6_addr *saddr)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 45) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 46) refcount_set(&inc->i_refcount, 1);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 47) INIT_LIST_HEAD(&inc->i_item);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 48) inc->i_conn = conn;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 49) inc->i_saddr = *saddr;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 50) inc->i_usercopy.rdma_cookie = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 51) inc->i_usercopy.rx_tstamp = ktime_set(0, 0);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 52)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 53) memset(inc->i_rx_lat_trace, 0, sizeof(inc->i_rx_lat_trace));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 54) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 55) EXPORT_SYMBOL_GPL(rds_inc_init);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 56)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 57) void rds_inc_path_init(struct rds_incoming *inc, struct rds_conn_path *cp,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 58) struct in6_addr *saddr)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 59) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 60) refcount_set(&inc->i_refcount, 1);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 61) INIT_LIST_HEAD(&inc->i_item);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 62) inc->i_conn = cp->cp_conn;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 63) inc->i_conn_path = cp;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 64) inc->i_saddr = *saddr;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 65) inc->i_usercopy.rdma_cookie = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 66) inc->i_usercopy.rx_tstamp = ktime_set(0, 0);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 67) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 68) EXPORT_SYMBOL_GPL(rds_inc_path_init);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 69)
/* Take an additional reference on @inc; dropped with rds_inc_put(). */
static void rds_inc_addref(struct rds_incoming *inc)
{
	rdsdebug("addref inc %p ref %d\n", inc, refcount_read(&inc->i_refcount));
	refcount_inc(&inc->i_refcount);
}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 75)
/* Drop a reference on @inc.  On the final put the message must no
 * longer be queued on any socket (i_item empty) and is handed back to
 * the owning transport's inc_free() for freeing. */
void rds_inc_put(struct rds_incoming *inc)
{
	rdsdebug("put inc %p ref %d\n", inc, refcount_read(&inc->i_refcount));
	if (refcount_dec_and_test(&inc->i_refcount)) {
		/* A queued inc still holds a list reference; reaching
		 * zero while queued would mean a refcount bug. */
		BUG_ON(!list_empty(&inc->i_item));

		inc->i_conn->c_trans->inc_free(inc);
	}
}
EXPORT_SYMBOL_GPL(rds_inc_put);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 86)
/*
 * Account @delta bytes (positive on enqueue, negative on dequeue/drop)
 * against @rs's receive-buffer usage and flip the congestion-map bit
 * for @port when the socket crosses the congestion thresholds.
 *
 * Congestion is declared when queued bytes exceed the socket rcvbuf;
 * it is only cleared again once usage drops below half the rcvbuf
 * (hysteresis, see comment below).  Callers in this file invoke this
 * under rs->rs_recv_lock.  @sk is currently unused here.
 */
static void rds_recv_rcvbuf_delta(struct rds_sock *rs, struct sock *sk,
				  struct rds_cong_map *map,
				  int delta, __be16 port)
{
	int now_congested;

	if (delta == 0)
		return;

	rs->rs_rcv_bytes += delta;
	if (delta > 0)
		rds_stats_add(s_recv_bytes_added_to_socket, delta);
	else
		rds_stats_add(s_recv_bytes_removed_from_socket, -delta);

	/* loop transport doesn't send/recv congestion updates */
	if (rs->rs_transport->t_type == RDS_TRANS_LOOP)
		return;

	now_congested = rs->rs_rcv_bytes > rds_sk_rcvbuf(rs);

	rdsdebug("rs %p (%pI6c:%u) recv bytes %d buf %d "
	  "now_cong %d delta %d\n",
	  rs, &rs->rs_bound_addr,
	  ntohs(rs->rs_bound_port), rs->rs_rcv_bytes,
	  rds_sk_rcvbuf(rs), now_congested, delta);

	/* wasn't -> am congested */
	if (!rs->rs_congested && now_congested) {
		rs->rs_congested = 1;
		rds_cong_set_bit(map, port);
		rds_cong_queue_updates(map);
	}
	/* was -> aren't congested */
	/* Require more free space before reporting uncongested to prevent
	   bouncing cong/uncong state too often */
	else if (rs->rs_congested && (rs->rs_rcv_bytes < (rds_sk_rcvbuf(rs)/2))) {
		rs->rs_congested = 0;
		rds_cong_clear_bit(map, port);
		rds_cong_queue_updates(map);
	}

	/* do nothing if no change in cong state */
}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 131)
/*
 * Record the peer's generation number on @conn (TCP transport only,
 * per the WARN_ON below).
 *
 * A nonzero @peer_gen_num that differs from the one previously seen
 * indicates a new incarnation of the peer: reset the tx/rx sequence
 * counters on every path and mark all messages still sitting on the
 * retransmit queues with RDS_MSG_FLUSH so they are discarded instead
 * of being resent to the new peer instance.
 */
static void rds_conn_peer_gen_update(struct rds_connection *conn,
				     u32 peer_gen_num)
{
	int i;
	struct rds_message *rm, *tmp;
	unsigned long flags;

	WARN_ON(conn->c_trans->t_type != RDS_TRANS_TCP);
	if (peer_gen_num != 0) {
		if (conn->c_peer_gen_num != 0 &&
		    peer_gen_num != conn->c_peer_gen_num) {
			for (i = 0; i < RDS_MPATH_WORKERS; i++) {
				struct rds_conn_path *cp;

				cp = &conn->c_path[i];
				/* cp_lock protects the sequence counters
				 * and the retransmit list. */
				spin_lock_irqsave(&cp->cp_lock, flags);
				cp->cp_next_tx_seq = 1;
				cp->cp_next_rx_seq = 0;
				list_for_each_entry_safe(rm, tmp,
							 &cp->cp_retrans,
							 m_conn_item) {
					set_bit(RDS_MSG_FLUSH, &rm->m_flags);
				}
				spin_unlock_irqrestore(&cp->cp_lock, flags);
			}
		}
		conn->c_peer_gen_num = peer_gen_num;
	}
}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 161)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 162) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 163) * Process all extension headers that come with this message.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 164) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 165) static void rds_recv_incoming_exthdrs(struct rds_incoming *inc, struct rds_sock *rs)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 166) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 167) struct rds_header *hdr = &inc->i_hdr;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 168) unsigned int pos = 0, type, len;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 169) union {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 170) struct rds_ext_header_version version;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 171) struct rds_ext_header_rdma rdma;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 172) struct rds_ext_header_rdma_dest rdma_dest;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 173) } buffer;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 174)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 175) while (1) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 176) len = sizeof(buffer);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 177) type = rds_message_next_extension(hdr, &pos, &buffer, &len);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 178) if (type == RDS_EXTHDR_NONE)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 179) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 180) /* Process extension header here */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 181) switch (type) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 182) case RDS_EXTHDR_RDMA:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 183) rds_rdma_unuse(rs, be32_to_cpu(buffer.rdma.h_rdma_rkey), 0);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 184) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 185)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 186) case RDS_EXTHDR_RDMA_DEST:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 187) /* We ignore the size for now. We could stash it
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 188) * somewhere and use it for error checking. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 189) inc->i_usercopy.rdma_cookie = rds_rdma_make_cookie(
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 190) be32_to_cpu(buffer.rdma_dest.h_rdma_rkey),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 191) be32_to_cpu(buffer.rdma_dest.h_rdma_offset));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 192)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 193) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 194) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 195) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 196) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 197)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 198) static void rds_recv_hs_exthdrs(struct rds_header *hdr,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 199) struct rds_connection *conn)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 200) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 201) unsigned int pos = 0, type, len;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 202) union {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 203) struct rds_ext_header_version version;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 204) u16 rds_npaths;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 205) u32 rds_gen_num;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 206) } buffer;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 207) u32 new_peer_gen_num = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 208)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 209) while (1) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 210) len = sizeof(buffer);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 211) type = rds_message_next_extension(hdr, &pos, &buffer, &len);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 212) if (type == RDS_EXTHDR_NONE)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 213) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 214) /* Process extension header here */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 215) switch (type) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 216) case RDS_EXTHDR_NPATHS:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 217) conn->c_npaths = min_t(int, RDS_MPATH_WORKERS,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 218) be16_to_cpu(buffer.rds_npaths));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 219) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 220) case RDS_EXTHDR_GEN_NUM:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 221) new_peer_gen_num = be32_to_cpu(buffer.rds_gen_num);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 222) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 223) default:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 224) pr_warn_ratelimited("ignoring unknown exthdr type "
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 225) "0x%x\n", type);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 226) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 227) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 228) /* if RDS_EXTHDR_NPATHS was not found, default to a single-path */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 229) conn->c_npaths = max_t(int, conn->c_npaths, 1);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 230) conn->c_ping_triggered = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 231) rds_conn_peer_gen_update(conn, new_peer_gen_num);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 232) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 233)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 234) /* rds_start_mprds() will synchronously start multiple paths when appropriate.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 235) * The scheme is based on the following rules:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 236) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 237) * 1. rds_sendmsg on first connect attempt sends the probe ping, with the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 238) * sender's npaths (s_npaths)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 239) * 2. rcvr of probe-ping knows the mprds_paths = min(s_npaths, r_npaths). It
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 240) * sends back a probe-pong with r_npaths. After that, if rcvr is the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 241) * smaller ip addr, it starts rds_conn_path_connect_if_down on all
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 242) * mprds_paths.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 243) * 3. sender gets woken up, and can move to rds_conn_path_connect_if_down.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 244) * If it is the smaller ipaddr, rds_conn_path_connect_if_down can be
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 245) * called after reception of the probe-pong on all mprds_paths.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 246) * Otherwise (sender of probe-ping is not the smaller ip addr): just call
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 247) * rds_conn_path_connect_if_down on the hashed path. (see rule 4)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 248) * 4. rds_connect_worker must only trigger a connection if laddr < faddr.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 249) * 5. sender may end up queuing the packet on the cp. will get sent out later.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 250) * when connection is completed.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 251) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 252) static void rds_start_mprds(struct rds_connection *conn)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 253) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 254) int i;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 255) struct rds_conn_path *cp;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 256)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 257) if (conn->c_npaths > 1 &&
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 258) rds_addr_cmp(&conn->c_laddr, &conn->c_faddr) < 0) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 259) for (i = 0; i < conn->c_npaths; i++) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 260) cp = &conn->c_path[i];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 261) rds_conn_path_connect_if_down(cp);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 262) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 263) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 264) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 265)
/*
 * The transport must make sure that this is serialized against other
 * rx and conn reset on this specific conn.
 *
 * We currently assert that only one fragmented message will be sent
 * down a connection at a time.  This lets us reassemble in the conn
 * instead of per-flow which means that we don't have to go digging through
 * flows to tear down partial reassembly progress on conn failure and
 * we save flow lookup and locking for each frag arrival.  It does mean
 * that small messages will wait behind large ones.  Fragmenting at all
 * is only to reduce the memory consumption of pre-posted buffers.
 *
 * The caller passes in saddr and daddr instead of us getting it from the
 * conn.  This lets loopback, who only has one conn for both directions,
 * tell us which roles the addrs in the conn are playing for this message.
 */
void rds_recv_incoming(struct rds_connection *conn, struct in6_addr *saddr,
		       struct in6_addr *daddr,
		       struct rds_incoming *inc, gfp_t gfp)
{
	struct rds_sock *rs = NULL;
	struct sock *sk;
	unsigned long flags;
	struct rds_conn_path *cp;

	inc->i_conn = conn;
	inc->i_rx_jiffies = jiffies;
	/* mp-capable transports set i_conn_path themselves; everyone
	 * else uses the single path 0. */
	if (conn->c_trans->t_mp_capable)
		cp = inc->i_conn_path;
	else
		cp = &conn->c_path[0];

	rdsdebug("conn %p next %llu inc %p seq %llu len %u sport %u dport %u "
		 "flags 0x%x rx_jiffies %lu\n", conn,
		 (unsigned long long)cp->cp_next_rx_seq,
		 inc,
		 (unsigned long long)be64_to_cpu(inc->i_hdr.h_sequence),
		 be32_to_cpu(inc->i_hdr.h_len),
		 be16_to_cpu(inc->i_hdr.h_sport),
		 be16_to_cpu(inc->i_hdr.h_dport),
		 inc->i_hdr.h_flags,
		 inc->i_rx_jiffies);

	/*
	 * Sequence numbers should only increase.  Messages get their
	 * sequence number as they're queued in a sending conn.  They
	 * can be dropped, though, if the sending socket is closed before
	 * they hit the wire.  So sequence numbers can skip forward
	 * under normal operation.  They can also drop back in the conn
	 * failover case as previously sent messages are resent down the
	 * new instance of a conn.  We drop those, otherwise we have
	 * to assume that the next valid seq does not come after a
	 * hole in the fragment stream.
	 *
	 * The headers don't give us a way to realize if fragments of
	 * a message have been dropped.  We assume that frags that arrive
	 * to a flow are part of the current message on the flow that is
	 * being reassembled.  This means that senders can't drop messages
	 * from the sending conn until all their frags are sent.
	 *
	 * XXX we could spend more on the wire to get more robust failure
	 * detection, arguably worth it to avoid data corruption.
	 */
	if (be64_to_cpu(inc->i_hdr.h_sequence) < cp->cp_next_rx_seq &&
	    (inc->i_hdr.h_flags & RDS_FLAG_RETRANSMITTED)) {
		rds_stats_inc(s_recv_drop_old_seq);
		goto out;
	}
	cp->cp_next_rx_seq = be64_to_cpu(inc->i_hdr.h_sequence) + 1;

	/* dport 0 is a ping; answer it with a pong unless pings are
	 * disabled or the sender's sport is 0 (nothing to pong to). */
	if (rds_sysctl_ping_enable && inc->i_hdr.h_dport == 0) {
		if (inc->i_hdr.h_sport == 0) {
			rdsdebug("ignore ping with 0 sport from %pI6c\n",
				 saddr);
			goto out;
		}
		rds_stats_inc(s_recv_ping);
		rds_send_pong(cp, inc->i_hdr.h_sport);
		/* if this is a handshake ping, start multipath if necessary */
		if (RDS_HS_PROBE(be16_to_cpu(inc->i_hdr.h_sport),
				 be16_to_cpu(inc->i_hdr.h_dport))) {
			rds_recv_hs_exthdrs(&inc->i_hdr, cp->cp_conn);
			rds_start_mprds(cp->cp_conn);
		}
		goto out;
	}

	if (be16_to_cpu(inc->i_hdr.h_dport) == RDS_FLAG_PROBE_PORT &&
	    inc->i_hdr.h_sport == 0) {
		rds_recv_hs_exthdrs(&inc->i_hdr, cp->cp_conn);
		/* if this is a handshake pong, start multipath if necessary */
		rds_start_mprds(cp->cp_conn);
		/* wake the sender blocked waiting for the handshake pong */
		wake_up(&cp->cp_conn->c_hs_waitq);
		goto out;
	}

	/* takes a reference on rs; dropped at out: below */
	rs = rds_find_bound(daddr, inc->i_hdr.h_dport, conn->c_bound_if);
	if (!rs) {
		rds_stats_inc(s_recv_drop_no_sock);
		goto out;
	}

	/* Process extension headers */
	rds_recv_incoming_exthdrs(inc, rs);

	/* We can be racing with rds_release() which marks the socket dead. */
	sk = rds_rs_to_sk(rs);

	/* serialize with rds_release -> sock_orphan */
	write_lock_irqsave(&rs->rs_recv_lock, flags);
	if (!sock_flag(sk, SOCK_DEAD)) {
		rdsdebug("adding inc %p to rs %p's recv queue\n", inc, rs);
		rds_stats_inc(s_recv_queued);
		rds_recv_rcvbuf_delta(rs, sk, inc->i_conn->c_lcong,
				      be32_to_cpu(inc->i_hdr.h_len),
				      inc->i_hdr.h_dport);
		if (sock_flag(sk, SOCK_RCVTSTAMP))
			inc->i_usercopy.rx_tstamp = ktime_get_real();
		/* the recv queue holds its own reference on inc */
		rds_inc_addref(inc);
		inc->i_rx_lat_trace[RDS_MSG_RX_END] = local_clock();
		list_add_tail(&inc->i_item, &rs->rs_recv_queue);
		__rds_wake_sk_sleep(sk);
	} else {
		rds_stats_inc(s_recv_drop_dead_sock);
	}
	write_unlock_irqrestore(&rs->rs_recv_lock, flags);

out:
	if (rs)
		rds_sock_put(rs);
}
EXPORT_SYMBOL_GPL(rds_recv_incoming);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 398)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 399) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 400) * be very careful here. This is being called as the condition in
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 401) * wait_event_*() needs to cope with being called many times.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 402) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 403) static int rds_next_incoming(struct rds_sock *rs, struct rds_incoming **inc)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 404) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 405) unsigned long flags;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 406)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 407) if (!*inc) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 408) read_lock_irqsave(&rs->rs_recv_lock, flags);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 409) if (!list_empty(&rs->rs_recv_queue)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 410) *inc = list_entry(rs->rs_recv_queue.next,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 411) struct rds_incoming,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 412) i_item);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 413) rds_inc_addref(*inc);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 414) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 415) read_unlock_irqrestore(&rs->rs_recv_lock, flags);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 416) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 417)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 418) return *inc != NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 419) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 420)
/* Check (under rs_recv_lock) whether @inc is still on @rs's receive
 * queue; returns 1 if it was.  With @drop set, also unlink it,
 * reverse its rcvbuf accounting, and drop the queue's reference. */
static int rds_still_queued(struct rds_sock *rs, struct rds_incoming *inc,
			    int drop)
{
	struct sock *sk = rds_rs_to_sk(rs);
	int ret = 0;
	unsigned long flags;

	write_lock_irqsave(&rs->rs_recv_lock, flags);
	if (!list_empty(&inc->i_item)) {
		ret = 1;
		if (drop) {
			/* XXX make sure this i_conn is reliable */
			rds_recv_rcvbuf_delta(rs, sk, inc->i_conn->c_lcong,
					      -be32_to_cpu(inc->i_hdr.h_len),
					      inc->i_hdr.h_dport);
			list_del_init(&inc->i_item);
			/* drop the reference the recv queue held */
			rds_inc_put(inc);
		}
	}
	write_unlock_irqrestore(&rs->rs_recv_lock, flags);

	rdsdebug("inc %p rs %p still %d dropped %d\n", inc, rs, ret, drop);
	return ret;
}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 445)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 446) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 447) * Pull errors off the error queue.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 448) * If msghdr is NULL, we will just purge the error queue.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 449) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 450) int rds_notify_queue_get(struct rds_sock *rs, struct msghdr *msghdr)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 451) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 452) struct rds_notifier *notifier;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 453) struct rds_rdma_notify cmsg;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 454) unsigned int count = 0, max_messages = ~0U;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 455) unsigned long flags;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 456) LIST_HEAD(copy);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 457) int err = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 458)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 459) memset(&cmsg, 0, sizeof(cmsg)); /* fill holes with zero */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 460)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 461) /* put_cmsg copies to user space and thus may sleep. We can't do this
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 462) * with rs_lock held, so first grab as many notifications as we can stuff
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 463) * in the user provided cmsg buffer. We don't try to copy more, to avoid
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 464) * losing notifications - except when the buffer is so small that it wouldn't
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 465) * even hold a single notification. Then we give him as much of this single
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 466) * msg as we can squeeze in, and set MSG_CTRUNC.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 467) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 468) if (msghdr) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 469) max_messages = msghdr->msg_controllen / CMSG_SPACE(sizeof(cmsg));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 470) if (!max_messages)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 471) max_messages = 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 472) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 473)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 474) spin_lock_irqsave(&rs->rs_lock, flags);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 475) while (!list_empty(&rs->rs_notify_queue) && count < max_messages) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 476) notifier = list_entry(rs->rs_notify_queue.next,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 477) struct rds_notifier, n_list);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 478) list_move(¬ifier->n_list, ©);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 479) count++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 480) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 481) spin_unlock_irqrestore(&rs->rs_lock, flags);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 482)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 483) if (!count)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 484) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 485)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 486) while (!list_empty(©)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 487) notifier = list_entry(copy.next, struct rds_notifier, n_list);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 488)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 489) if (msghdr) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 490) cmsg.user_token = notifier->n_user_token;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 491) cmsg.status = notifier->n_status;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 492)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 493) err = put_cmsg(msghdr, SOL_RDS, RDS_CMSG_RDMA_STATUS,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 494) sizeof(cmsg), &cmsg);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 495) if (err)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 496) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 497) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 498)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 499) list_del_init(¬ifier->n_list);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 500) kfree(notifier);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 501) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 502)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 503) /* If we bailed out because of an error in put_cmsg,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 504) * we may be left with one or more notifications that we
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 505) * didn't process. Return them to the head of the list. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 506) if (!list_empty(©)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 507) spin_lock_irqsave(&rs->rs_lock, flags);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 508) list_splice(©, &rs->rs_notify_queue);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 509) spin_unlock_irqrestore(&rs->rs_lock, flags);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 510) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 511)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 512) return err;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 513) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 514)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 515) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 516) * Queue a congestion notification
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 517) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 518) static int rds_notify_cong(struct rds_sock *rs, struct msghdr *msghdr)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 519) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 520) uint64_t notify = rs->rs_cong_notify;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 521) unsigned long flags;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 522) int err;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 523)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 524) err = put_cmsg(msghdr, SOL_RDS, RDS_CMSG_CONG_UPDATE,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 525) sizeof(notify), ¬ify);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 526) if (err)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 527) return err;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 528)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 529) spin_lock_irqsave(&rs->rs_lock, flags);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 530) rs->rs_cong_notify &= ~notify;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 531) spin_unlock_irqrestore(&rs->rs_lock, flags);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 532)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 533) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 534) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 535)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 536) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 537) * Receive any control messages.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 538) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 539) static int rds_cmsg_recv(struct rds_incoming *inc, struct msghdr *msg,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 540) struct rds_sock *rs)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 541) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 542) int ret = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 543)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 544) if (inc->i_usercopy.rdma_cookie) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 545) ret = put_cmsg(msg, SOL_RDS, RDS_CMSG_RDMA_DEST,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 546) sizeof(inc->i_usercopy.rdma_cookie),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 547) &inc->i_usercopy.rdma_cookie);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 548) if (ret)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 549) goto out;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 550) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 551)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 552) if ((inc->i_usercopy.rx_tstamp != 0) &&
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 553) sock_flag(rds_rs_to_sk(rs), SOCK_RCVTSTAMP)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 554) struct __kernel_old_timeval tv =
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 555) ns_to_kernel_old_timeval(inc->i_usercopy.rx_tstamp);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 556)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 557) if (!sock_flag(rds_rs_to_sk(rs), SOCK_TSTAMP_NEW)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 558) ret = put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMP_OLD,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 559) sizeof(tv), &tv);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 560) } else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 561) struct __kernel_sock_timeval sk_tv;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 562)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 563) sk_tv.tv_sec = tv.tv_sec;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 564) sk_tv.tv_usec = tv.tv_usec;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 565)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 566) ret = put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMP_NEW,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 567) sizeof(sk_tv), &sk_tv);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 568) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 569)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 570) if (ret)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 571) goto out;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 572) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 573)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 574) if (rs->rs_rx_traces) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 575) struct rds_cmsg_rx_trace t;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 576) int i, j;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 577)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 578) memset(&t, 0, sizeof(t));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 579) inc->i_rx_lat_trace[RDS_MSG_RX_CMSG] = local_clock();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 580) t.rx_traces = rs->rs_rx_traces;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 581) for (i = 0; i < rs->rs_rx_traces; i++) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 582) j = rs->rs_rx_trace[i];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 583) t.rx_trace_pos[i] = j;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 584) t.rx_trace[i] = inc->i_rx_lat_trace[j + 1] -
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 585) inc->i_rx_lat_trace[j];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 586) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 587)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 588) ret = put_cmsg(msg, SOL_RDS, RDS_CMSG_RXPATH_LATENCY,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 589) sizeof(t), &t);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 590) if (ret)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 591) goto out;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 592) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 593)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 594) out:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 595) return ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 596) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 597)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 598) static bool rds_recvmsg_zcookie(struct rds_sock *rs, struct msghdr *msg)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 599) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 600) struct rds_msg_zcopy_queue *q = &rs->rs_zcookie_queue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 601) struct rds_msg_zcopy_info *info = NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 602) struct rds_zcopy_cookies *done;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 603) unsigned long flags;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 604)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 605) if (!msg->msg_control)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 606) return false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 607)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 608) if (!sock_flag(rds_rs_to_sk(rs), SOCK_ZEROCOPY) ||
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 609) msg->msg_controllen < CMSG_SPACE(sizeof(*done)))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 610) return false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 611)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 612) spin_lock_irqsave(&q->lock, flags);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 613) if (!list_empty(&q->zcookie_head)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 614) info = list_entry(q->zcookie_head.next,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 615) struct rds_msg_zcopy_info, rs_zcookie_next);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 616) list_del(&info->rs_zcookie_next);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 617) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 618) spin_unlock_irqrestore(&q->lock, flags);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 619) if (!info)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 620) return false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 621) done = &info->zcookies;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 622) if (put_cmsg(msg, SOL_RDS, RDS_CMSG_ZCOPY_COMPLETION, sizeof(*done),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 623) done)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 624) spin_lock_irqsave(&q->lock, flags);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 625) list_add(&info->rs_zcookie_next, &q->zcookie_head);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 626) spin_unlock_irqrestore(&q->lock, flags);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 627) return false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 628) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 629) kfree(info);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 630) return true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 631) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 632)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 633) int rds_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 634) int msg_flags)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 635) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 636) struct sock *sk = sock->sk;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 637) struct rds_sock *rs = rds_sk_to_rs(sk);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 638) long timeo;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 639) int ret = 0, nonblock = msg_flags & MSG_DONTWAIT;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 640) DECLARE_SOCKADDR(struct sockaddr_in6 *, sin6, msg->msg_name);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 641) DECLARE_SOCKADDR(struct sockaddr_in *, sin, msg->msg_name);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 642) struct rds_incoming *inc = NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 643)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 644) /* udp_recvmsg()->sock_recvtimeo() gets away without locking too.. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 645) timeo = sock_rcvtimeo(sk, nonblock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 646)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 647) rdsdebug("size %zu flags 0x%x timeo %ld\n", size, msg_flags, timeo);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 648)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 649) if (msg_flags & MSG_OOB)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 650) goto out;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 651) if (msg_flags & MSG_ERRQUEUE)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 652) return sock_recv_errqueue(sk, msg, size, SOL_IP, IP_RECVERR);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 653)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 654) while (1) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 655) /* If there are pending notifications, do those - and nothing else */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 656) if (!list_empty(&rs->rs_notify_queue)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 657) ret = rds_notify_queue_get(rs, msg);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 658) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 659) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 660)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 661) if (rs->rs_cong_notify) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 662) ret = rds_notify_cong(rs, msg);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 663) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 664) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 665)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 666) if (!rds_next_incoming(rs, &inc)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 667) if (nonblock) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 668) bool reaped = rds_recvmsg_zcookie(rs, msg);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 669)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 670) ret = reaped ? 0 : -EAGAIN;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 671) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 672) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 673)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 674) timeo = wait_event_interruptible_timeout(*sk_sleep(sk),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 675) (!list_empty(&rs->rs_notify_queue) ||
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 676) rs->rs_cong_notify ||
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 677) rds_next_incoming(rs, &inc)), timeo);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 678) rdsdebug("recvmsg woke inc %p timeo %ld\n", inc,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 679) timeo);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 680) if (timeo > 0 || timeo == MAX_SCHEDULE_TIMEOUT)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 681) continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 682)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 683) ret = timeo;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 684) if (ret == 0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 685) ret = -ETIMEDOUT;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 686) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 687) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 688)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 689) rdsdebug("copying inc %p from %pI6c:%u to user\n", inc,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 690) &inc->i_conn->c_faddr,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 691) ntohs(inc->i_hdr.h_sport));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 692) ret = inc->i_conn->c_trans->inc_copy_to_user(inc, &msg->msg_iter);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 693) if (ret < 0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 694) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 695)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 696) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 697) * if the message we just copied isn't at the head of the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 698) * recv queue then someone else raced us to return it, try
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 699) * to get the next message.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 700) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 701) if (!rds_still_queued(rs, inc, !(msg_flags & MSG_PEEK))) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 702) rds_inc_put(inc);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 703) inc = NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 704) rds_stats_inc(s_recv_deliver_raced);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 705) iov_iter_revert(&msg->msg_iter, ret);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 706) continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 707) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 708)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 709) if (ret < be32_to_cpu(inc->i_hdr.h_len)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 710) if (msg_flags & MSG_TRUNC)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 711) ret = be32_to_cpu(inc->i_hdr.h_len);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 712) msg->msg_flags |= MSG_TRUNC;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 713) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 714)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 715) if (rds_cmsg_recv(inc, msg, rs)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 716) ret = -EFAULT;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 717) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 718) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 719) rds_recvmsg_zcookie(rs, msg);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 720)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 721) rds_stats_inc(s_recv_delivered);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 722)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 723) if (msg->msg_name) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 724) if (ipv6_addr_v4mapped(&inc->i_saddr)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 725) sin = (struct sockaddr_in *)msg->msg_name;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 726)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 727) sin->sin_family = AF_INET;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 728) sin->sin_port = inc->i_hdr.h_sport;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 729) sin->sin_addr.s_addr =
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 730) inc->i_saddr.s6_addr32[3];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 731) memset(sin->sin_zero, 0, sizeof(sin->sin_zero));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 732) msg->msg_namelen = sizeof(*sin);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 733) } else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 734) sin6 = (struct sockaddr_in6 *)msg->msg_name;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 735)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 736) sin6->sin6_family = AF_INET6;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 737) sin6->sin6_port = inc->i_hdr.h_sport;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 738) sin6->sin6_addr = inc->i_saddr;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 739) sin6->sin6_flowinfo = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 740) sin6->sin6_scope_id = rs->rs_bound_scope_id;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 741) msg->msg_namelen = sizeof(*sin6);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 742) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 743) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 744) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 745) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 746)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 747) if (inc)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 748) rds_inc_put(inc);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 749)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 750) out:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 751) return ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 752) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 753)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 754) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 755) * The socket is being shut down and we're asked to drop messages that were
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 756) * queued for recvmsg. The caller has unbound the socket so the receive path
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 757) * won't queue any more incoming fragments or messages on the socket.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 758) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 759) void rds_clear_recv_queue(struct rds_sock *rs)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 760) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 761) struct sock *sk = rds_rs_to_sk(rs);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 762) struct rds_incoming *inc, *tmp;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 763) unsigned long flags;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 764)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 765) write_lock_irqsave(&rs->rs_recv_lock, flags);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 766) list_for_each_entry_safe(inc, tmp, &rs->rs_recv_queue, i_item) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 767) rds_recv_rcvbuf_delta(rs, sk, inc->i_conn->c_lcong,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 768) -be32_to_cpu(inc->i_hdr.h_len),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 769) inc->i_hdr.h_dport);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 770) list_del_init(&inc->i_item);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 771) rds_inc_put(inc);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 772) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 773) write_unlock_irqrestore(&rs->rs_recv_lock, flags);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 774) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 775)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 776) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 777) * inc->i_saddr isn't used here because it is only set in the receive
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 778) * path.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 779) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 780) void rds_inc_info_copy(struct rds_incoming *inc,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 781) struct rds_info_iterator *iter,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 782) __be32 saddr, __be32 daddr, int flip)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 783) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 784) struct rds_info_message minfo;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 785)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 786) minfo.seq = be64_to_cpu(inc->i_hdr.h_sequence);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 787) minfo.len = be32_to_cpu(inc->i_hdr.h_len);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 788) minfo.tos = inc->i_conn->c_tos;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 789)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 790) if (flip) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 791) minfo.laddr = daddr;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 792) minfo.faddr = saddr;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 793) minfo.lport = inc->i_hdr.h_dport;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 794) minfo.fport = inc->i_hdr.h_sport;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 795) } else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 796) minfo.laddr = saddr;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 797) minfo.faddr = daddr;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 798) minfo.lport = inc->i_hdr.h_sport;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 799) minfo.fport = inc->i_hdr.h_dport;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 800) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 801)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 802) minfo.flags = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 803)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 804) rds_info_copy(iter, &minfo, sizeof(minfo));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 805) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 806)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 807) #if IS_ENABLED(CONFIG_IPV6)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 808) void rds6_inc_info_copy(struct rds_incoming *inc,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 809) struct rds_info_iterator *iter,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 810) struct in6_addr *saddr, struct in6_addr *daddr,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 811) int flip)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 812) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 813) struct rds6_info_message minfo6;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 814)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 815) minfo6.seq = be64_to_cpu(inc->i_hdr.h_sequence);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 816) minfo6.len = be32_to_cpu(inc->i_hdr.h_len);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 817) minfo6.tos = inc->i_conn->c_tos;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 818)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 819) if (flip) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 820) minfo6.laddr = *daddr;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 821) minfo6.faddr = *saddr;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 822) minfo6.lport = inc->i_hdr.h_dport;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 823) minfo6.fport = inc->i_hdr.h_sport;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 824) } else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 825) minfo6.laddr = *saddr;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 826) minfo6.faddr = *daddr;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 827) minfo6.lport = inc->i_hdr.h_sport;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 828) minfo6.fport = inc->i_hdr.h_dport;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 829) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 830)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 831) minfo6.flags = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 832)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 833) rds_info_copy(iter, &minfo6, sizeof(minfo6));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 834) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 835) #endif